
Simd-using functions sometimes scalarize after inlining, even if they use vector ops on their own #321

Opened by @thomcc

Godbolt: https://rust.godbolt.org/z/hhMWb6Eja

On aarch64 I have this code:

#![feature(portable_simd)] // nightly-only; needed for std::simd

// Note: Doesn't happen for 16, everything is great then.
const CHUNK: usize = 32;

pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    use std::simd::*;
    const ALL_HI: Simd<u8, CHUNK> = Simd::from_array([0x80; CHUNK]);
    const ZERO: Simd<u8, CHUNK> = Simd::from_array([0; CHUNK]);
    (Simd::<u8, CHUNK>::from_array(*s) & ALL_HI)
        .simd_eq(ZERO)
        .all()
}

pub fn all_ascii(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk(chunk))
}
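
For context, the scalar equivalent of what this computes is just "every byte has its high bit clear, i.e. is ASCII" (the function name here is mine, for illustration only):

// Scalar reference (illustrative, not in the godbolt link): true iff
// every byte in every chunk is ASCII, i.e. has its high bit clear.
pub fn all_ascii_ref(s: &[[u8; CHUNK]]) -> bool {
    s.iter().flatten().all(|&b| b & 0x80 == 0)
}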

Wonderfully, all_ascii_chunk compiles to essentially what I want (I mean, it's not perfect, but I certainly wouldn't file a bug about it):

example::all_ascii_chunk:
        ldp     q1, q0, [x0]
        orr.16b v0, v1, v0
        cmlt.16b        v0, v0, #0
        umaxv.16b       b0, v0
        fmov    w8, s0
        mvn     w8, w8
        and     w0, w8, #0x1
        ret

Unfortunately, when it gets called in a loop from all_ascii, we... seem to completely lose our ability to do something reasonable, and get this monstrosity:

example::all_ascii:
        sub     sp, sp, #16
        lsl     x9, x1, #5
LBB0_1:
        mov     x8, x9
        cbz     x9, LBB0_3
        ldp     q0, q1, [x0], #32
        cmlt.16b        v1, v1, #0
        umov.b  w9, v1[1]
        umov.b  w10, v1[0]
        and     w9, w9, #0x1
        and     w10, w10, #0x1
        bfi     w10, w9, #1, #1
        umov.b  w9, v1[2]
        and     w9, w9, #0x1
        bfi     w10, w9, #2, #1
        umov.b  w9, v1[3]
        and     w9, w9, #0x1
        umov.b  w11, v1[4]
        bfi     w10, w9, #3, #1
        and     w9, w11, #0x1
        bfi     w10, w9, #4, #1
        umov.b  w9, v1[5]
        and     w9, w9, #0x1
        bfi     w10, w9, #5, #1
        umov.b  w9, v1[6]
        and     w9, w9, #0x1
        umov.b  w11, v1[7]
        orr     w9, w10, w9, lsl #6
        and     w10, w11, #0x1
        orr     w9, w9, w10, lsl #7
        umov.b  w10, v1[8]
        and     w10, w10, #0x1
        orr     w9, w9, w10, lsl #8
        umov.b  w10, v1[9]
        and     w10, w10, #0x1
        umov.b  w11, v1[10]
        orr     w9, w9, w10, lsl #9
        and     w10, w11, #0x1
        orr     w9, w9, w10, lsl #10
        umov.b  w10, v1[11]
        and     w10, w10, #0x1
        orr     w9, w9, w10, lsl #11
        umov.b  w10, v1[12]
        and     w10, w10, #0x1
        umov.b  w11, v1[13]
        orr     w9, w9, w10, lsl #12
        and     w10, w11, #0x1
        orr     w9, w9, w10, lsl #13
        umov.b  w10, v1[14]
        and     w10, w10, #0x1
        orr     w9, w9, w10, lsl #14
        umov.b  w10, v1[15]
        cmlt.16b        v0, v0, #0
        umov.b  w11, v0[1]
        orr     w9, w9, w10, lsl #15
        and     w10, w11, #0x1
        umov.b  w11, v0[0]
        and     w11, w11, #0x1
        bfi     w11, w10, #1, #1
        umov.b  w10, v0[2]
        and     w10, w10, #0x1
        bfi     w11, w10, #2, #1
        umov.b  w10, v0[3]
        and     w10, w10, #0x1
        bfi     w11, w10, #3, #1
        umov.b  w10, v0[4]
        and     w10, w10, #0x1
        bfi     w11, w10, #4, #1
        umov.b  w10, v0[5]
        and     w10, w10, #0x1
        bfi     w11, w10, #5, #1
        umov.b  w10, v0[6]
        and     w10, w10, #0x1
        orr     w10, w11, w10, lsl #6
        umov.b  w11, v0[7]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #7
        umov.b  w11, v0[8]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #8
        umov.b  w11, v0[9]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #9
        umov.b  w11, v0[10]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #10
        umov.b  w11, v0[11]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #11
        umov.b  w11, v0[12]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #12
        umov.b  w11, v0[13]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #13
        umov.b  w11, v0[14]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #14
        umov.b  w11, v0[15]
        orr     w10, w10, w11, lsl #15
        orr     w10, w10, w9
        sub     x9, x8, #32
        tst     w10, #0xffff
        b.eq    LBB0_1
LBB0_3:
        cmp     x8, #0
        cset    w0, eq
        add     sp, sp, #16
        ret

And performance takes a nose-dive. This is very annoying, because this kind of issue means I can't rely on functions that appear to codegen well continuing to do so when called :(. I know we have little control over this, but it's... kind of a huge issue for using std::simd to optimize functions if we can't rely on it behaving consistently.
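
One possible mitigation, sketched below (untested against this exact build, so treat it as an assumption, not a fix): keep the chunk function out of line, so LLVM can't re-optimize and scalarize the vector ops after inlining into the caller. This trades a call per chunk for the good standalone codegen.

// Hypothetical workaround sketch: #[inline(never)] guarantees the
// standalone (good) codegen of all_ascii_chunk is what actually runs;
// whether the per-chunk call overhead is worth it would need measuring.
#[inline(never)]
fn all_ascii_chunk_noinline(s: &[u8; CHUNK]) -> bool {
    all_ascii_chunk(s)
}

pub fn all_ascii_workaround(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(all_ascii_chunk_noinline)
}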

This is possibly related to the bad aarch64 scalar reductions I saw before, although it doesn't seem like it because all_ascii_chunk is fine.

Labels: I-scalarize (Impact: code that should be vectorized, isn't)