#443: Add armv7 neon mplementation for Simd<u8, 16>::swizzle_dyn

hkratz · web-flow · commit 7e162d19dd2e · 2024-10-22T15:42:29.000-07:00
Use arm neon intrinsics to swizzle two u8x8 blocks with a u8x8x2 lookup table.
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
@@ -57,6 +57,13 @@ where
                     target_endian = "little"
                 ))]
                 16 => transize(vqtbl1q_u8, self, idxs),
+                #[cfg(all(
+                    target_arch = "arm",
+                    target_feature = "v7",
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
+                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
@@ -98,6 +105,28 @@ where
     }
 }
 
+/// armv7 neon supports swizzling `u8x16` by swizzling two u8x8 blocks
+/// with a u8x8x2 lookup table.
+///
+/// # Safety
+/// This requires armv7 neon to work
+#[cfg(all(
+    target_arch = "arm",
+    target_feature = "v7",
+    target_feature = "neon",
+    target_endian = "little"
+))]
+unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
+    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
+    // SAFETY: Caller promised arm neon support
+    unsafe {
+        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
+        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
+        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
+        vcombine_u8(lo, hi).into()
+    }
+}
+
 /// "vpshufb like it was meant to be" on AVX2
 ///
 /// # Safety