|
57 | 57 | target_endian = "little"
|
58 | 58 | ))]
|
59 | 59 | 16 => transize(vqtbl1q_u8, self, idxs),
|
| 60 | + #[cfg(all( |
| 61 | + target_arch = "arm", |
| 62 | + target_feature = "v7", |
| 63 | + target_feature = "neon", |
| 64 | + target_endian = "little" |
| 65 | + ))] |
| 66 | + 16 => transize(armv7_neon_swizzle_u8x16, self, idxs), |
60 | 67 | #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
|
61 | 68 | 32 => transize(avx2_pshufb, self, idxs),
|
62 | 69 | #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
|
@@ -98,6 +105,28 @@ where
|
98 | 105 | }
|
99 | 106 | }
|
100 | 107 |
|
/// Dynamic byte swizzle of a `u8x16` for armv7 NEON.
///
/// The 16 source bytes are split into their low and high 8-byte halves and
/// packed into a single two-register (`u8x8x2`) lookup table. Each 8-byte half
/// of `idxs` is then looked up in that table separately, and the two 8-byte
/// results are joined back into one 16-byte vector.
///
/// # Safety
/// The caller must ensure that armv7 NEON is available on the executing CPU.
#[cfg(all(
    target_arch = "arm",
    target_feature = "v7",
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
    // SAFETY: the caller guarantees armv7 NEON support, which is all these
    // intrinsics require.
    unsafe {
        // Build the 16-byte table out of the two 8-byte halves of `bytes`.
        let table_lo = vget_low_u8(bytes.into());
        let table_hi = vget_high_u8(bytes.into());
        let table = uint8x8x2_t(table_lo, table_hi);

        // The table lookup yields only 8 lanes per call, so the index vector
        // is processed one half at a time.
        let idx_lo = vget_low_u8(idxs.into());
        let idx_hi = vget_high_u8(idxs.into());
        let out_lo = vtbl2_u8(table, idx_lo);
        let out_hi = vtbl2_u8(table, idx_hi);

        vcombine_u8(out_lo, out_hi).into()
    }
}
| 129 | + |
101 | 130 | /// "vpshufb like it was meant to be" on AVX2
|
102 | 131 | ///
|
103 | 132 | /// # Safety
|
|
0 commit comments