Skip to content

Commit 7e162d1

Browse files
authored
#443: Add armv7 neon implementation for Simd<u8, 16>::swizzle_dyn
Use arm neon intrinsics to swizzle two u8x8 blocks with a u8x8x2 lookup table.
1 parent 158e240 commit 7e162d1

File tree

1 file changed

+29
-0
lines changed

1 file changed

+29
-0
lines changed

crates/core_simd/src/swizzle_dyn.rs

+29
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ where
5757
target_endian = "little"
5858
))]
5959
16 => transize(vqtbl1q_u8, self, idxs),
60+
#[cfg(all(
61+
target_arch = "arm",
62+
target_feature = "v7",
63+
target_feature = "neon",
64+
target_endian = "little"
65+
))]
66+
16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
6067
#[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
6168
32 => transize(avx2_pshufb, self, idxs),
6269
#[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
@@ -98,6 +105,28 @@ where
98105
}
99106
}
100107

108+
/// Swizzles a `u8x16` on armv7 neon as two `vtbl2_u8` lookups, one per 8-lane half
109+
/// of `idxs`, against a u8x8x2 lookup table built from both halves of `bytes`.
110+
///
111+
/// # Safety
112+
/// The caller must ensure the armv7 `neon` target feature is available.
113+
#[cfg(all(
114+
target_arch = "arm",
115+
target_feature = "v7",
116+
target_feature = "neon",
117+
target_endian = "little"
118+
))]
119+
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
120+
use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
121+
// SAFETY: Caller promised arm neon support, which the `core::arch::arm` intrinsics below rely on
122+
unsafe {
123+
let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into())); // table: low half, then high half
124+
let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into())); // look up the low 8 indices
125+
let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into())); // look up the high 8 indices
126+
vcombine_u8(lo, hi).into() // rejoin the two 8-lane results into one 16-lane vector
127+
}
128+
}
129+
101130
/// "vpshufb like it was meant to be" on AVX2
102131
///
103132
/// # Safety

0 commit comments

Comments
 (0)