x86: use SSE2 to pass float and SIMD types #135408

Merged 1 commit on Feb 19, 2025
118 changes: 74 additions & 44 deletions compiler/rustc_target/src/callconv/mod.rs
@@ -7,7 +7,7 @@ use rustc_abi::{
};
use rustc_macros::HashStable_Generic;

use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, WasmCAbi};
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, RustcAbi, WasmCAbi};

mod aarch64;
mod amdgpu;
@@ -386,6 +386,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
/// Pass this argument directly instead. Should NOT be used!
/// Only exists because of past ABI mistakes that will take time to fix
/// (see <https://github.com/rust-lang/rust/issues/115666>).
#[track_caller]
pub fn make_direct_deprecated(&mut self) {
match self.mode {
PassMode::Indirect { .. } => {
@@ -398,6 +399,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {

/// Pass this argument indirectly, by passing a (thin or wide) pointer to the argument instead.
/// This is valid for both sized and unsized arguments.
#[track_caller]
pub fn make_indirect(&mut self) {
match self.mode {
PassMode::Direct(_) | PassMode::Pair(_, _) => {
@@ -412,6 +414,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {

/// Same as `make_indirect`, but for arguments that are ignored. Only needed for ABIs that pass
/// ZSTs indirectly.
#[track_caller]
pub fn make_indirect_from_ignore(&mut self) {
match self.mode {
PassMode::Ignore => {
@@ -716,27 +719,46 @@ impl<'a, Ty> FnAbi<'a, Ty> {
C: HasDataLayout + HasTargetSpec,
{
let spec = cx.target_spec();
match &spec.arch[..] {
match &*spec.arch {
"x86" => x86::compute_rust_abi_info(cx, self, abi),
"riscv32" | "riscv64" => riscv::compute_rust_abi_info(cx, self, abi),
"loongarch64" => loongarch::compute_rust_abi_info(cx, self, abi),
"aarch64" => aarch64::compute_rust_abi_info(cx, self),
_ => {}
};

// Decides whether we can pass the given SIMD argument via `PassMode::Direct`.
// May only return `true` if the target will always pass those arguments the same way,
// no matter what the user does with `-Ctarget-feature`! In other words, whatever
// target features are required to pass a SIMD value in registers must be listed in
// the `abi_required_features` for the current target and ABI.
let can_pass_simd_directly = |arg: &ArgAbi<'_, Ty>| match &*spec.arch {
// On x86, if we have SSE2 (which we have by default for x86_64), we can always pass up
// to 128-bit-sized vectors.
"x86" if spec.rustc_abi == Some(RustcAbi::X86Sse2) => arg.layout.size.bits() <= 128,
"x86_64" if spec.rustc_abi != Some(RustcAbi::X86Softfloat) => {
arg.layout.size.bits() <= 128
}
// So far, we haven't implemented this logic for any other target.
_ => false,
};

for (arg_idx, arg) in self
.args
.iter_mut()
.enumerate()
.map(|(idx, arg)| (Some(idx), arg))
.chain(iter::once((None, &mut self.ret)))
{
if arg.is_ignore() {
// If the logic above already picked a specific type to cast the argument to, leave that
// in place.
if matches!(arg.mode, PassMode::Ignore | PassMode::Cast { .. }) {
continue;
}

if arg_idx.is_none()
&& arg.layout.size > Primitive::Pointer(AddressSpace::DATA).size(cx) * 2
&& !matches!(arg.layout.backend_repr, BackendRepr::Vector { .. })
{
// Return values larger than 2 registers using a return area
// pointer. LLVM and Cranelift disagree about how to return
@@ -746,7 +768,8 @@ impl<'a, Ty> FnAbi<'a, Ty> {
// return value independently and decide to pass it in a
// register or not, which would result in the return value
// being passed partially in registers and partially through a
// return area pointer.
// return area pointer. For large IR-level values such as `i128`,
// cranelift will even split up the value into smaller chunks.
//
// While Cranelift may need to be fixed as the LLVM behavior is
// generally more correct with respect to the surface language,
@@ -776,53 +799,60 @@ impl<'a, Ty> FnAbi<'a, Ty> {
// rustc_target already ensure any return value which doesn't
// fit in the available amount of return registers is passed in
// the right way for the current target.
//
// The adjustment is not necessary nor desired for types with a vector
// representation; those are handled below.
arg.make_indirect();
continue;
}

match arg.layout.backend_repr {
BackendRepr::Memory { .. } => {}

// This is a fun case! The gist of what this is doing is
// that we want callers and callees to always agree on the
// ABI of how they pass SIMD arguments. If we were to *not*
// make these arguments indirect then they'd be immediates
// in LLVM, which means that they'd use whatever the
// appropriate ABI is for the callee and the caller. That
// means, for example, if the caller doesn't have AVX
// enabled but the callee does, then passing an AVX argument
// across this boundary would cause corrupt data to show up.
//
// This problem is fixed by unconditionally passing SIMD
// arguments through memory between callers and callees
// which should get them all to agree on ABI regardless of
// target feature sets. Some more information about this
// issue can be found in #44367.
//
// Note that the intrinsic ABI is exempt here as
// that's how we connect up to LLVM and it's unstable
// anyway, we control all calls to it in libstd.
BackendRepr::Vector { .. }
if abi != ExternAbi::RustIntrinsic && spec.simd_types_indirect =>
{
arg.make_indirect();
continue;
BackendRepr::Memory { .. } => {
// Compute `Aggregate` ABI.

let is_indirect_not_on_stack =
matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
assert!(is_indirect_not_on_stack);

let size = arg.layout.size;
if arg.layout.is_sized()
&& size <= Primitive::Pointer(AddressSpace::DATA).size(cx)
{
// We want to pass small aggregates as immediates, but using
// an LLVM aggregate type for this leads to bad optimizations,
// so we pick an appropriately sized integer type instead.
arg.cast_to(Reg { kind: RegKind::Integer, size });
}
}

_ => continue,
}
// Compute `Aggregate` ABI.

let is_indirect_not_on_stack =
matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
assert!(is_indirect_not_on_stack);

let size = arg.layout.size;
if !arg.layout.is_unsized() && size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
// We want to pass small aggregates as immediates, but using
// an LLVM aggregate type for this leads to bad optimizations,
// so we pick an appropriately sized integer type instead.
arg.cast_to(Reg { kind: RegKind::Integer, size });
BackendRepr::Vector { .. } => {
// This is a fun case! The gist of what this is doing is
// that we want callers and callees to always agree on the
// ABI of how they pass SIMD arguments. If we were to *not*
// make these arguments indirect then they'd be immediates
// in LLVM, which means that they'd use whatever the
// appropriate ABI is for the callee and the caller. That
// means, for example, if the caller doesn't have AVX
// enabled but the callee does, then passing an AVX argument
// across this boundary would cause corrupt data to show up.
//
// This problem is fixed by unconditionally passing SIMD
// arguments through memory between callers and callees
// which should get them all to agree on ABI regardless of
// target feature sets. Some more information about this
// issue can be found in #44367.
//
// Note that the intrinsic ABI is exempt here as those are not
// real functions anyway, and the backend expects very specific types.
if abi != ExternAbi::RustIntrinsic
&& spec.simd_types_indirect
&& !can_pass_simd_directly(arg)
{
arg.make_indirect();
}
}

_ => {}
}
}
}
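
For orientation, here is a hedged standalone sketch of the rule the new `can_pass_simd_directly` closure encodes (plain Rust, not rustc internals; the `RustcAbi` variants and arch strings mirror the diff above, while the free function and the asserts are purely illustrative):

#[derive(PartialEq)]
enum RustcAbi {
    X86Sse2,
    X86Softfloat,
}

// Mirrors the closure above: SIMD values up to 128 bits may stay in registers
// only when the target ABI itself guarantees the required feature set.
fn can_pass_simd_directly(arch: &str, rustc_abi: Option<&RustcAbi>, size_bits: u64) -> bool {
    match arch {
        // 32-bit x86 only qualifies when SSE2 is part of the ABI contract.
        "x86" => rustc_abi == Some(&RustcAbi::X86Sse2) && size_bits <= 128,
        // x86_64 has SSE2 by default; only the softfloat ABI opts out.
        "x86_64" => rustc_abi != Some(&RustcAbi::X86Softfloat) && size_bits <= 128,
        // No other architecture implements this logic yet.
        _ => false,
    }
}

fn main() {
    assert!(can_pass_simd_directly("x86", Some(&RustcAbi::X86Sse2), 128));
    assert!(can_pass_simd_directly("x86_64", None, 128));
    assert!(!can_pass_simd_directly("x86", None, 128)); // no SSE2 guarantee: stay indirect
    assert!(!can_pass_simd_directly("x86_64", Some(&RustcAbi::X86Softfloat), 128));
}
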
14 changes: 11 additions & 3 deletions compiler/rustc_target/src/callconv/x86.rs
@@ -4,7 +4,7 @@ use rustc_abi::{
};

use crate::callconv::{ArgAttribute, FnAbi, PassMode};
use crate::spec::HasTargetSpec;
use crate::spec::{HasTargetSpec, RustcAbi};

#[derive(PartialEq)]
pub(crate) enum Flavor {
@@ -236,8 +236,16 @@
_ => false, // anyway not passed via registers on x86
};
if has_float {
if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
// Same size or smaller than pointer, return in a register.
if cx.target_spec().rustc_abi == Some(RustcAbi::X86Sse2)
&& fn_abi.ret.layout.backend_repr.is_scalar()
&& fn_abi.ret.layout.size.bits() <= 128
{
// This is a single scalar that fits into an SSE register, and the target uses the
// SSE ABI. We prefer this over integer registers as float scalars need to be in SSE
// registers for float operations, so that's the best place to pass them around.
fn_abi.ret.cast_to(Reg { kind: RegKind::Vector, size: fn_abi.ret.layout.size });
} else if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
// Same size or smaller than pointer, return in an integer register.
fn_abi.ret.cast_to(Reg { kind: RegKind::Integer, size: fn_abi.ret.layout.size });
} else {
// Larger than a pointer, return indirectly.
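
As a hedged illustration of the new branch order for float-containing return values on 32-bit x86 (standalone pseudo-logic with invented names, not the rustc types used above):

enum RetKind {
    SseRegister,     // scalar float/vector result comes back in %xmm0
    IntegerRegister, // small non-SSE result comes back in a GPR such as %eax
    Indirect,        // anything larger is written through a hidden return pointer
}

fn classify_float_return(sse2_abi: bool, is_scalar: bool, size_bits: u64, ptr_bits: u64) -> RetKind {
    if sse2_abi && is_scalar && size_bits <= 128 {
        // Float scalars have to live in SSE registers for arithmetic anyway,
        // so returning them there avoids a round-trip through integer registers.
        RetKind::SseRegister
    } else if size_bits <= ptr_bits {
        RetKind::IntegerRegister
    } else {
        RetKind::Indirect
    }
}

Under this ordering, f32 and f64 returns on an SSE2-guaranteed i686 target take the first branch, which is what the updated assembly tests below expect.
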
7 changes: 4 additions & 3 deletions tests/assembly/closure-inherit-target-feature.rs
@@ -8,8 +8,9 @@

use std::arch::x86_64::{__m128, _mm_blend_ps};

// Use an explicit return pointer to prevent tail call optimization.
#[no_mangle]
pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128, ret: *mut __m128) {
let f = {
// check that _mm_blend_ps is not being inlined into the closure
// CHECK-LABEL: {{sse41_blend_nofeature.*closure.*:}}
@@ -18,9 +19,9 @@ pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
// CHECK-NOT: blendps
// CHECK: ret
#[inline(never)]
|x, y| _mm_blend_ps(x, y, 0b0101)
|x, y, ret: *mut __m128| unsafe { *ret = _mm_blend_ps(x, y, 0b0101) }
};
f(x, y)
f(x, y, ret);
}

#[no_mangle]
42 changes: 18 additions & 24 deletions tests/assembly/x86-return-float.rs
@@ -33,19 +33,18 @@ use minicore::*;
// CHECK-LABEL: return_f32:
#[no_mangle]
pub fn return_f32(x: f32) -> f32 {
// CHECK: movl {{.*}}(%ebp), %eax
// CHECK-NOT: ax
// CHECK: retl
// CHECK: movss {{.*}}(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}

// CHECK-LABEL: return_f64:
#[no_mangle]
pub fn return_f64(x: f64) -> f64 {
// CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
// CHECK-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL:.*]]
// CHECK-NEXT: movsd %[[VAL]], (%[[PTR]])
// CHECK: retl
// CHECK: movsd {{.*}}(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}

@@ -157,7 +156,7 @@ pub unsafe fn call_f32(x: &mut f32) {
}
// CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
// CHECK: calll {{()|_}}get_f32
// CHECK-NEXT: movl %eax, (%[[PTR]])
// CHECK-NEXT: movss %xmm0, (%[[PTR]])
*x = get_f32();
}

@@ -169,8 +168,7 @@ pub unsafe fn call_f64(x: &mut f64) {
}
// CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
// CHECK: calll {{()|_}}get_f64
// CHECK: movsd {{.*}}(%{{ebp|esp}}), %[[VAL:.*]]
// CHECK-NEXT: movsd %[[VAL:.*]], (%[[PTR]])
// CHECK-NEXT: movlps %xmm0, (%[[PTR]])
*x = get_f64();
}

@@ -315,25 +313,21 @@ pub unsafe fn call_other_f64(x: &mut (usize, f64)) {
#[no_mangle]
pub fn return_f16(x: f16) -> f16 {
// CHECK: pushl %ebp
// CHECK: movl %esp, %ebp
// CHECK: movzwl 8(%ebp), %eax
// CHECK: popl %ebp
// CHECK: retl
// CHECK-NEXT: movl %esp, %ebp
// CHECK-NEXT: pinsrw $0, 8(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}

// CHECK-LABEL: return_f128:
#[no_mangle]
pub fn return_f128(x: f128) -> f128 {
// CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+4]](%ebp), %[[VAL1:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+12]](%ebp), %[[VAL3:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+16]](%ebp), %[[VAL4:.*]]
// CHECK-NEXT: movl %[[VAL4:.*]] 12(%[[PTR]])
// CHECK-NEXT: movl %[[VAL3:.*]] 8(%[[PTR]])
// CHECK-NEXT: movl %[[VAL2:.*]] 4(%[[PTR]])
// CHECK-NEXT: movl %[[VAL1:.*]] (%[[PTR]])
// CHECK: retl
// CHECK: pushl %ebp
// CHECK-NEXT: movl %esp, %ebp
// linux-NEXT: movaps 8(%ebp), %xmm0
// win-NEXT: movups 8(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}
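
A hedged way to observe the same change outside the minicore harness (the file name and rustc invocation are assumptions, not part of the test above): building the snippet below with `rustc --crate-type=lib -O --emit=asm --target i686-unknown-linux-gnu repro.rs` should now show both functions returning in %xmm0 instead of via %eax or a hidden out-pointer.

// Rust-ABI functions, so they follow the calling convention adjusted by this PR.
#[no_mangle]
pub fn ret_f32(x: f32) -> f32 {
    x
}

#[no_mangle]
pub fn ret_f64(x: f64) -> f64 {
    x
}
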
36 changes: 36 additions & 0 deletions tests/codegen/abi-x86-sse.rs
@@ -0,0 +1,36 @@
//@ compile-flags: -Z merge-functions=disabled

//@ revisions: x86-64
//@[x86-64] compile-flags: --target x86_64-unknown-linux-gnu
//@[x86-64] needs-llvm-components: x86

//@ revisions: x86-32
//@[x86-32] compile-flags: --target i686-unknown-linux-gnu
//@[x86-32] needs-llvm-components: x86

//@ revisions: x86-32-nosse
//@[x86-32-nosse] compile-flags: --target i586-unknown-linux-gnu
//@[x86-32-nosse] needs-llvm-components: x86

#![feature(no_core, lang_items, rustc_attrs, repr_simd)]
#![no_core]
#![crate_type = "lib"]

#[lang = "sized"]
trait Sized {}

#[lang = "copy"]
trait Copy {}

// Ensure this type is passed without ptr indirection on targets that
// require SSE2.
#[repr(simd)]
pub struct Sse([f32; 4]);

// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// x86-32-nosse: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
#[no_mangle]
pub fn sse_id(x: Sse) -> Sse {
x
}
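
A hedged companion check using the stable `__m128` type from std instead of the no_core `Sse` struct above; with `--emit=llvm-ir` on an SSE2-capable x86 target, the by-value argument and return are expected to lower to `<4 x float>` rather than a pointer (this snippet and invocation are illustrative, not part of the test suite):

use std::arch::x86_64::__m128;

// Passing and returning a 128-bit vector by value across a Rust-ABI call.
#[no_mangle]
pub fn sse_passthrough(x: __m128) -> __m128 {
    x
}
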