Description
Godbolt: https://rust.godbolt.org/z/hhMWb6Eja
On aarch64 I have this code:
```rust
// Note: Doesn't happen for 16, everything is great then.
const CHUNK: usize = 32;

pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    use std::simd::*;
    const ALL_HI: Simd<u8, CHUNK> = Simd::from_array([0x80; CHUNK]);
    const ZERO: Simd<u8, CHUNK> = Simd::from_array([0; CHUNK]);
    (Simd::<u8, CHUNK>::from_array(*s) & ALL_HI)
        .simd_eq(ZERO)
        .all()
}

pub fn all_ascii(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk(chunk))
}
```
Wonderfully, `all_ascii_chunk` compiles to essentially what I want (I mean, it's not perfect, but I certainly wouldn't file a bug about it):
```asm
example::all_ascii_chunk:
ldp q1, q0, [x0]
orr.16b v0, v1, v0
cmlt.16b v0, v0, #0
umaxv.16b b0, v0
fmov w8, s0
mvn w8, w8
and w0, w8, #0x1
ret
```
Unfortunately, when it gets called in a loop from `all_ascii`, we... seem to completely lose our ability to do something reasonable, and get this monstrosity:
```asm
example::all_ascii:
sub sp, sp, #16
lsl x9, x1, #5
LBB0_1:
mov x8, x9
cbz x9, LBB0_3
ldp q0, q1, [x0], #32
cmlt.16b v1, v1, #0
umov.b w9, v1[1]
umov.b w10, v1[0]
and w9, w9, #0x1
and w10, w10, #0x1
bfi w10, w9, #1, #1
umov.b w9, v1[2]
and w9, w9, #0x1
bfi w10, w9, #2, #1
umov.b w9, v1[3]
and w9, w9, #0x1
umov.b w11, v1[4]
bfi w10, w9, #3, #1
and w9, w11, #0x1
bfi w10, w9, #4, #1
umov.b w9, v1[5]
and w9, w9, #0x1
bfi w10, w9, #5, #1
umov.b w9, v1[6]
and w9, w9, #0x1
umov.b w11, v1[7]
orr w9, w10, w9, lsl #6
and w10, w11, #0x1
orr w9, w9, w10, lsl #7
umov.b w10, v1[8]
and w10, w10, #0x1
orr w9, w9, w10, lsl #8
umov.b w10, v1[9]
and w10, w10, #0x1
umov.b w11, v1[10]
orr w9, w9, w10, lsl #9
and w10, w11, #0x1
orr w9, w9, w10, lsl #10
umov.b w10, v1[11]
and w10, w10, #0x1
orr w9, w9, w10, lsl #11
umov.b w10, v1[12]
and w10, w10, #0x1
umov.b w11, v1[13]
orr w9, w9, w10, lsl #12
and w10, w11, #0x1
orr w9, w9, w10, lsl #13
umov.b w10, v1[14]
and w10, w10, #0x1
orr w9, w9, w10, lsl #14
umov.b w10, v1[15]
cmlt.16b v0, v0, #0
umov.b w11, v0[1]
orr w9, w9, w10, lsl #15
and w10, w11, #0x1
umov.b w11, v0[0]
and w11, w11, #0x1
bfi w11, w10, #1, #1
umov.b w10, v0[2]
and w10, w10, #0x1
bfi w11, w10, #2, #1
umov.b w10, v0[3]
and w10, w10, #0x1
bfi w11, w10, #3, #1
umov.b w10, v0[4]
and w10, w10, #0x1
bfi w11, w10, #4, #1
umov.b w10, v0[5]
and w10, w10, #0x1
bfi w11, w10, #5, #1
umov.b w10, v0[6]
and w10, w10, #0x1
orr w10, w11, w10, lsl #6
umov.b w11, v0[7]
and w11, w11, #0x1
orr w10, w10, w11, lsl #7
umov.b w11, v0[8]
and w11, w11, #0x1
orr w10, w10, w11, lsl #8
umov.b w11, v0[9]
and w11, w11, #0x1
orr w10, w10, w11, lsl #9
umov.b w11, v0[10]
and w11, w11, #0x1
orr w10, w10, w11, lsl #10
umov.b w11, v0[11]
and w11, w11, #0x1
orr w10, w10, w11, lsl #11
umov.b w11, v0[12]
and w11, w11, #0x1
orr w10, w10, w11, lsl #12
umov.b w11, v0[13]
and w11, w11, #0x1
orr w10, w10, w11, lsl #13
umov.b w11, v0[14]
and w11, w11, #0x1
orr w10, w10, w11, lsl #14
umov.b w11, v0[15]
orr w10, w10, w11, lsl #15
orr w10, w10, w9
sub x9, x8, #32
tst w10, #0xffff
b.eq LBB0_1
LBB0_3:
cmp x8, #0
cset w0, eq
add sp, sp, #16
ret
```
And performance takes a nose-dive. This is very annoying, because this kind of issue means that I can't rely on functions that appear to codegen well continuing to do so when called :(. I know we have little control over this, but it's... kind of a huge issue for using `std::simd` to optimize functions if we can't rely on it behaving consistently.
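For reference, one possible way to sidestep this (not a fix) is to keep the chunk helper out-of-line so it retains its standalone codegen; a minimal sketch, assuming the same nightly `portable_simd` setup as above (the `_noinline` names are hypothetical):

```rust
#![feature(portable_simd)]
use std::simd::*;

const CHUNK: usize = 32;

// Hypothetical workaround sketch: forcing the helper to stay a real call
// should keep its standalone (umaxv-based) reduction, at the cost of a
// function call per chunk.
#[inline(never)]
pub fn all_ascii_chunk_noinline(s: &[u8; CHUNK]) -> bool {
    const ALL_HI: Simd<u8, CHUNK> = Simd::from_array([0x80; CHUNK]);
    const ZERO: Simd<u8, CHUNK> = Simd::from_array([0; CHUNK]);
    (Simd::<u8, CHUNK>::from_array(*s) & ALL_HI)
        .simd_eq(ZERO)
        .all()
}

pub fn all_ascii_noinline(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk_noinline(chunk))
}
```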
This is possibly related to the bad aarch64 scalar reductions I saw before, although it doesn't seem like it, because `all_ascii_chunk` is fine.