Description
Godbolt: https://rust.godbolt.org/z/hhMWb6Eja
On aarch64 I have this code:
```rust
// Note: Doesn't happen for 16, everything is great then.
const CHUNK: usize = 32;

pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    use std::simd::*;
    const ALL_HI: Simd<u8, CHUNK> = Simd::from_array([0x80; CHUNK]);
    const ZERO: Simd<u8, CHUNK> = Simd::from_array([0; CHUNK]);
    (Simd::<u8, CHUNK>::from_array(*s) & ALL_HI)
        .simd_eq(ZERO)
        .all()
}

pub fn all_ascii(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk(chunk))
}
```
Wonderfully, `all_ascii_chunk` compiles to essentially what I want (I mean, it's not perfect, but I certainly wouldn't file a bug about it):
```asm
example::all_ascii_chunk:
ldp q1, q0, [x0]
orr.16b v0, v1, v0
cmlt.16b v0, v0, #0
umaxv.16b b0, v0
fmov w8, s0
mvn w8, w8
and w0, w8, #0x1
ret
```
Unfortunately, when it gets called in a loop from `all_ascii`, we... seem to completely lose our ability to do something reasonable, and get this monstrosity:
```asm
example::all_ascii:
sub sp, sp, #16
lsl x9, x1, #5
LBB0_1:
mov x8, x9
cbz x9, LBB0_3
ldp q0, q1, [x0], #32
cmlt.16b v1, v1, #0
umov.b w9, v1[1]
umov.b w10, v1[0]
and w9, w9, #0x1
and w10, w10, #0x1
bfi w10, w9, #1, #1
umov.b w9, v1[2]
and w9, w9, #0x1
bfi w10, w9, #2, #1
umov.b w9, v1[3]
and w9, w9, #0x1
umov.b w11, v1[4]
bfi w10, w9, #3, #1
and w9, w11, #0x1
bfi w10, w9, #4, #1
umov.b w9, v1[5]
and w9, w9, #0x1
bfi w10, w9, #5, #1
umov.b w9, v1[6]
and w9, w9, #0x1
umov.b w11, v1[7]
orr w9, w10, w9, lsl #6
and w10, w11, #0x1
orr w9, w9, w10, lsl #7
umov.b w10, v1[8]
and w10, w10, #0x1
orr w9, w9, w10, lsl #8
umov.b w10, v1[9]
and w10, w10, #0x1
umov.b w11, v1[10]
orr w9, w9, w10, lsl #9
and w10, w11, #0x1
orr w9, w9, w10, lsl #10
umov.b w10, v1[11]
and w10, w10, #0x1
orr w9, w9, w10, lsl #11
umov.b w10, v1[12]
and w10, w10, #0x1
umov.b w11, v1[13]
orr w9, w9, w10, lsl #12
and w10, w11, #0x1
orr w9, w9, w10, lsl #13
umov.b w10, v1[14]
and w10, w10, #0x1
orr w9, w9, w10, lsl #14
umov.b w10, v1[15]
cmlt.16b v0, v0, #0
umov.b w11, v0[1]
orr w9, w9, w10, lsl #15
and w10, w11, #0x1
umov.b w11, v0[0]
and w11, w11, #0x1
bfi w11, w10, #1, #1
umov.b w10, v0[2]
and w10, w10, #0x1
bfi w11, w10, #2, #1
umov.b w10, v0[3]
and w10, w10, #0x1
bfi w11, w10, #3, #1
umov.b w10, v0[4]
and w10, w10, #0x1
bfi w11, w10, #4, #1
umov.b w10, v0[5]
and w10, w10, #0x1
bfi w11, w10, #5, #1
umov.b w10, v0[6]
and w10, w10, #0x1
orr w10, w11, w10, lsl #6
umov.b w11, v0[7]
and w11, w11, #0x1
orr w10, w10, w11, lsl #7
umov.b w11, v0[8]
and w11, w11, #0x1
orr w10, w10, w11, lsl #8
umov.b w11, v0[9]
and w11, w11, #0x1
orr w10, w10, w11, lsl #9
umov.b w11, v0[10]
and w11, w11, #0x1
orr w10, w10, w11, lsl #10
umov.b w11, v0[11]
and w11, w11, #0x1
orr w10, w10, w11, lsl #11
umov.b w11, v0[12]
and w11, w11, #0x1
orr w10, w10, w11, lsl #12
umov.b w11, v0[13]
and w11, w11, #0x1
orr w10, w10, w11, lsl #13
umov.b w11, v0[14]
and w11, w11, #0x1
orr w10, w10, w11, lsl #14
umov.b w11, v0[15]
orr w10, w10, w11, lsl #15
orr w10, w10, w9
sub x9, x8, #32
tst w10, #0xffff
b.eq LBB0_1
LBB0_3:
cmp x8, #0
cset w0, eq
add sp, sp, #16
ret
```
And performance takes a nose-dive. This is very annoying, because this kind of issue means that I can't rely on functions that appear to codegen well continuing to do so when called :(. I know we have little control over this, but it's... kind of a huge issue for using `std::simd` to optimize functions if we can't rely on it behaving consistently.
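For reference, one possible way to sidestep this (not a fix) is to keep the chunk helper out-of-line so it retains its standalone codegen; a minimal sketch, assuming the same nightly `portable_simd` setup as above (the `_noinline` names are hypothetical):

```rust
#![feature(portable_simd)]
use std::simd::*;

const CHUNK: usize = 32;

// Hypothetical workaround sketch: forcing the helper to stay a real call
// should keep its standalone (umaxv-based) reduction, at the cost of a
// function call per chunk.
#[inline(never)]
pub fn all_ascii_chunk_noinline(s: &[u8; CHUNK]) -> bool {
    const ALL_HI: Simd<u8, CHUNK> = Simd::from_array([0x80; CHUNK]);
    const ZERO: Simd<u8, CHUNK> = Simd::from_array([0; CHUNK]);
    (Simd::<u8, CHUNK>::from_array(*s) & ALL_HI)
        .simd_eq(ZERO)
        .all()
}

pub fn all_ascii_noinline(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk_noinline(chunk))
}
```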
This is possibly related to the bad aarch64 scalar reductions I saw before, although it doesn't seem like it, because `all_ascii_chunk` is fine.