Skip to content

Commit 803feb5

Browse files
committed
x86-sse2 ABI: use SSE registers for floats and SIMD
1 parent de91711 commit 803feb5

14 files changed

+273
-151
lines changed

compiler/rustc_target/src/callconv/mod.rs

+74-44
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ use rustc_abi::{
77
};
88
use rustc_macros::HashStable_Generic;
99

10-
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, WasmCAbi};
10+
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, RustcAbi, WasmCAbi};
1111

1212
mod aarch64;
1313
mod amdgpu;
@@ -386,6 +386,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
386386
/// Pass this argument directly instead. Should NOT be used!
387387
/// Only exists because of past ABI mistakes that will take time to fix
388388
/// (see <https://github.com/rust-lang/rust/issues/115666>).
389+
#[track_caller]
389390
pub fn make_direct_deprecated(&mut self) {
390391
match self.mode {
391392
PassMode::Indirect { .. } => {
@@ -398,6 +399,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
398399

399400
/// Pass this argument indirectly, by passing a (thin or wide) pointer to the argument instead.
400401
/// This is valid for both sized and unsized arguments.
402+
#[track_caller]
401403
pub fn make_indirect(&mut self) {
402404
match self.mode {
403405
PassMode::Direct(_) | PassMode::Pair(_, _) => {
@@ -412,6 +414,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
412414

413415
/// Same as `make_indirect`, but for arguments that are ignored. Only needed for ABIs that pass
414416
/// ZSTs indirectly.
417+
#[track_caller]
415418
pub fn make_indirect_from_ignore(&mut self) {
416419
match self.mode {
417420
PassMode::Ignore => {
@@ -716,27 +719,46 @@ impl<'a, Ty> FnAbi<'a, Ty> {
716719
C: HasDataLayout + HasTargetSpec,
717720
{
718721
let spec = cx.target_spec();
719-
match &spec.arch[..] {
722+
match &*spec.arch {
720723
"x86" => x86::compute_rust_abi_info(cx, self, abi),
721724
"riscv32" | "riscv64" => riscv::compute_rust_abi_info(cx, self, abi),
722725
"loongarch64" => loongarch::compute_rust_abi_info(cx, self, abi),
723726
"aarch64" => aarch64::compute_rust_abi_info(cx, self),
724727
_ => {}
725728
};
726729

730+
// Decides whether we can pass the given SIMD argument via `PassMode::Direct`.
731+
// May only return `true` if the target will always pass those arguments the same way,
732+
// no matter what the user does with `-Ctarget-feature`! In other words, whatever
733+
// target features are required to pass a SIMD value in registers must be listed in
734+
// the `abi_required_features` for the current target and ABI.
735+
let can_pass_simd_directly = |arg: &ArgAbi<'_, Ty>| match &*spec.arch {
736+
// On x86, if we have SSE2 (which we have by default for x86_64), we can always pass up
737+
// to 128-bit-sized vectors.
738+
"x86" if spec.rustc_abi == Some(RustcAbi::X86Sse2) => arg.layout.size.bits() <= 128,
739+
"x86_64" if spec.rustc_abi != Some(RustcAbi::X86Softfloat) => {
740+
arg.layout.size.bits() <= 128
741+
}
742+
// So far, we haven't implemented this logic for any other target.
743+
_ => false,
744+
};
745+
727746
for (arg_idx, arg) in self
728747
.args
729748
.iter_mut()
730749
.enumerate()
731750
.map(|(idx, arg)| (Some(idx), arg))
732751
.chain(iter::once((None, &mut self.ret)))
733752
{
734-
if arg.is_ignore() {
753+
// If the logic above already picked a specific type to cast the argument to, leave that
754+
// in place.
755+
if matches!(arg.mode, PassMode::Ignore | PassMode::Cast { .. }) {
735756
continue;
736757
}
737758

738759
if arg_idx.is_none()
739760
&& arg.layout.size > Primitive::Pointer(AddressSpace::DATA).size(cx) * 2
761+
&& !matches!(arg.layout.backend_repr, BackendRepr::Vector { .. })
740762
{
741763
// Return values larger than 2 registers using a return area
742764
// pointer. LLVM and Cranelift disagree about how to return
@@ -746,7 +768,8 @@ impl<'a, Ty> FnAbi<'a, Ty> {
746768
// return value independently and decide to pass it in a
747769
// register or not, which would result in the return value
748770
// being passed partially in registers and partially through a
749-
// return area pointer.
771+
// return area pointer. For large IR-level values such as `i128`,
772+
// cranelift will even split up the value into smaller chunks.
750773
//
751774
// While Cranelift may need to be fixed as the LLVM behavior is
752775
// generally more correct with respect to the surface language,
@@ -776,53 +799,60 @@ impl<'a, Ty> FnAbi<'a, Ty> {
776799
// rustc_target already ensure any return value which doesn't
777800
// fit in the available amount of return registers is passed in
778801
// the right way for the current target.
802+
//
803+
// The adjustment is not necessary nor desired for types with a vector
804+
// representation; those are handled below.
779805
arg.make_indirect();
780806
continue;
781807
}
782808

783809
match arg.layout.backend_repr {
784-
BackendRepr::Memory { .. } => {}
785-
786-
// This is a fun case! The gist of what this is doing is
787-
// that we want callers and callees to always agree on the
788-
// ABI of how they pass SIMD arguments. If we were to *not*
789-
// make these arguments indirect then they'd be immediates
790-
// in LLVM, which means that they'd used whatever the
791-
// appropriate ABI is for the callee and the caller. That
792-
// means, for example, if the caller doesn't have AVX
793-
// enabled but the callee does, then passing an AVX argument
794-
// across this boundary would cause corrupt data to show up.
795-
//
796-
// This problem is fixed by unconditionally passing SIMD
797-
// arguments through memory between callers and callees
798-
// which should get them all to agree on ABI regardless of
799-
// target feature sets. Some more information about this
800-
// issue can be found in #44367.
801-
//
802-
// Note that the intrinsic ABI is exempt here as
803-
// that's how we connect up to LLVM and it's unstable
804-
// anyway, we control all calls to it in libstd.
805-
BackendRepr::Vector { .. }
806-
if abi != ExternAbi::RustIntrinsic && spec.simd_types_indirect =>
807-
{
808-
arg.make_indirect();
809-
continue;
810+
BackendRepr::Memory { .. } => {
811+
// Compute `Aggregate` ABI.
812+
813+
let is_indirect_not_on_stack =
814+
matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
815+
assert!(is_indirect_not_on_stack);
816+
817+
let size = arg.layout.size;
818+
if arg.layout.is_sized()
819+
&& size <= Primitive::Pointer(AddressSpace::DATA).size(cx)
820+
{
821+
// We want to pass small aggregates as immediates, but using
822+
// an LLVM aggregate type for this leads to bad optimizations,
823+
// so we pick an appropriately sized integer type instead.
824+
arg.cast_to(Reg { kind: RegKind::Integer, size });
825+
}
810826
}
811827

812-
_ => continue,
813-
}
814-
// Compute `Aggregate` ABI.
815-
816-
let is_indirect_not_on_stack =
817-
matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
818-
assert!(is_indirect_not_on_stack);
819-
820-
let size = arg.layout.size;
821-
if !arg.layout.is_unsized() && size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
822-
// We want to pass small aggregates as immediates, but using
823-
// an LLVM aggregate type for this leads to bad optimizations,
824-
// so we pick an appropriately sized integer type instead.
825-
arg.cast_to(Reg { kind: RegKind::Integer, size });
828+
BackendRepr::Vector { .. } => {
829+
// This is a fun case! The gist of what this is doing is
830+
// that we want callers and callees to always agree on the
831+
// ABI of how they pass SIMD arguments. If we were to *not*
832+
// make these arguments indirect then they'd be immediates
833+
// in LLVM, which means that they'd use whatever the
834+
// appropriate ABI is for the callee and the caller. That
835+
// means, for example, if the caller doesn't have AVX
836+
// enabled but the callee does, then passing an AVX argument
837+
// across this boundary would cause corrupt data to show up.
838+
//
839+
// This problem is fixed by unconditionally passing SIMD
840+
// arguments through memory between callers and callees
841+
// which should get them all to agree on ABI regardless of
842+
// target feature sets. Some more information about this
843+
// issue can be found in #44367.
844+
//
845+
// Note that the intrinsic ABI is exempt here as those are not
846+
// real functions anyway, and the backend expects very specific types.
847+
if abi != ExternAbi::RustIntrinsic
848+
&& spec.simd_types_indirect
849+
&& !can_pass_simd_directly(arg)
850+
{
851+
arg.make_indirect();
852+
}
853+
}
854+
855+
_ => {}
826856
}
827857
}
828858
}

compiler/rustc_target/src/callconv/x86.rs

+11-3
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ use rustc_abi::{
44
};
55

66
use crate::callconv::{ArgAttribute, FnAbi, PassMode};
7-
use crate::spec::HasTargetSpec;
7+
use crate::spec::{HasTargetSpec, RustcAbi};
88

99
#[derive(PartialEq)]
1010
pub(crate) enum Flavor {
@@ -236,8 +236,16 @@ where
236236
_ => false, // anyway not passed via registers on x86
237237
};
238238
if has_float {
239-
if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
240-
// Same size or smaller than pointer, return in a register.
239+
if cx.target_spec().rustc_abi == Some(RustcAbi::X86Sse2)
240+
&& fn_abi.ret.layout.backend_repr.is_scalar()
241+
&& fn_abi.ret.layout.size.bits() <= 128
242+
{
243+
// This is a single scalar that fits into an SSE register, and the target uses the
244+
// SSE ABI. We prefer this over integer registers as float scalars need to be in SSE
245+
// registers for float operations, so that's the best place to pass them around.
246+
fn_abi.ret.cast_to(Reg { kind: RegKind::Vector, size: fn_abi.ret.layout.size });
247+
} else if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
248+
// Same size or smaller than pointer, return in an integer register.
241249
fn_abi.ret.cast_to(Reg { kind: RegKind::Integer, size: fn_abi.ret.layout.size });
242250
} else {
243251
// Larger than a pointer, return indirectly.

tests/assembly/closure-inherit-target-feature.rs

+4-3
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,9 @@
88

99
use std::arch::x86_64::{__m128, _mm_blend_ps};
1010

11+
// Use an explicit return pointer to prevent tail call optimization.
1112
#[no_mangle]
12-
pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
13+
pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128, ret: *mut __m128) {
1314
let f = {
1415
// check that _mm_blend_ps is not being inlined into the closure
1516
// CHECK-LABEL: {{sse41_blend_nofeature.*closure.*:}}
@@ -18,9 +19,9 @@ pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
1819
// CHECK-NOT: blendps
1920
// CHECK: ret
2021
#[inline(never)]
21-
|x, y| _mm_blend_ps(x, y, 0b0101)
22+
|x, y, ret: *mut __m128| unsafe { *ret = _mm_blend_ps(x, y, 0b0101) }
2223
};
23-
f(x, y)
24+
f(x, y, ret);
2425
}
2526

2627
#[no_mangle]

tests/assembly/x86-return-float.rs

+18-24
Original file line number | Diff line number | Diff line change
@@ -33,19 +33,18 @@ use minicore::*;
3333
// CHECK-LABEL: return_f32:
3434
#[no_mangle]
3535
pub fn return_f32(x: f32) -> f32 {
36-
// CHECK: movl {{.*}}(%ebp), %eax
37-
// CHECK-NOT: ax
38-
// CHECK: retl
36+
// CHECK: movss {{.*}}(%ebp), %xmm0
37+
// CHECK-NEXT: popl %ebp
38+
// CHECK-NEXT: retl
3939
x
4040
}
4141

4242
// CHECK-LABEL: return_f64:
4343
#[no_mangle]
4444
pub fn return_f64(x: f64) -> f64 {
45-
// CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
46-
// CHECK-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL:.*]]
47-
// CHECK-NEXT: movsd %[[VAL]], (%[[PTR]])
48-
// CHECK: retl
45+
// CHECK: movsd {{.*}}(%ebp), %xmm0
46+
// CHECK-NEXT: popl %ebp
47+
// CHECK-NEXT: retl
4948
x
5049
}
5150

@@ -157,7 +156,7 @@ pub unsafe fn call_f32(x: &mut f32) {
157156
}
158157
// CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
159158
// CHECK: calll {{()|_}}get_f32
160-
// CHECK-NEXT: movl %eax, (%[[PTR]])
159+
// CHECK-NEXT: movss %xmm0, (%[[PTR]])
161160
*x = get_f32();
162161
}
163162

@@ -169,8 +168,7 @@ pub unsafe fn call_f64(x: &mut f64) {
169168
}
170169
// CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
171170
// CHECK: calll {{()|_}}get_f64
172-
// CHECK: movsd {{.*}}(%{{ebp|esp}}), %[[VAL:.*]]
173-
// CHECK-NEXT: movsd %[[VAL:.*]], (%[[PTR]])
171+
// CHECK-NEXT: movlps %xmm0, (%[[PTR]])
174172
*x = get_f64();
175173
}
176174

@@ -315,25 +313,21 @@ pub unsafe fn call_other_f64(x: &mut (usize, f64)) {
315313
#[no_mangle]
316314
pub fn return_f16(x: f16) -> f16 {
317315
// CHECK: pushl %ebp
318-
// CHECK: movl %esp, %ebp
319-
// CHECK: movzwl 8(%ebp), %eax
320-
// CHECK: popl %ebp
321-
// CHECK: retl
316+
// CHECK-NEXT: movl %esp, %ebp
317+
// CHECK-NEXT: pinsrw $0, 8(%ebp), %xmm0
318+
// CHECK-NEXT: popl %ebp
319+
// CHECK-NEXT: retl
322320
x
323321
}
324322

325323
// CHECK-LABEL: return_f128:
326324
#[no_mangle]
327325
pub fn return_f128(x: f128) -> f128 {
328-
// CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
329-
// CHECK-NEXT: movl [[#%d,OFFSET+4]](%ebp), %[[VAL1:.*]]
330-
// CHECK-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
331-
// CHECK-NEXT: movl [[#%d,OFFSET+12]](%ebp), %[[VAL3:.*]]
332-
// CHECK-NEXT: movl [[#%d,OFFSET+16]](%ebp), %[[VAL4:.*]]
333-
// CHECK-NEXT: movl %[[VAL4:.*]] 12(%[[PTR]])
334-
// CHECK-NEXT: movl %[[VAL3:.*]] 8(%[[PTR]])
335-
// CHECK-NEXT: movl %[[VAL2:.*]] 4(%[[PTR]])
336-
// CHECK-NEXT: movl %[[VAL1:.*]] (%[[PTR]])
337-
// CHECK: retl
326+
// CHECK: pushl %ebp
327+
// CHECK-NEXT: movl %esp, %ebp
328+
// linux-NEXT: movaps 8(%ebp), %xmm0
329+
// win-NEXT: movups 8(%ebp), %xmm0
330+
// CHECK-NEXT: popl %ebp
331+
// CHECK-NEXT: retl
338332
x
339333
}

tests/codegen/abi-x86-sse.rs

+36
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,36 @@
1+
//@ compile-flags: -Z merge-functions=disabled
2+
3+
//@ revisions: x86-64
4+
//@[x86-64] compile-flags: --target x86_64-unknown-linux-gnu
5+
//@[x86-64] needs-llvm-components: x86
6+
7+
//@ revisions: x86-32
8+
//@[x86-32] compile-flags: --target i686-unknown-linux-gnu
9+
//@[x86-32] needs-llvm-components: x86
10+
11+
//@ revisions: x86-32-nosse
12+
//@[x86-32-nosse] compile-flags: --target i586-unknown-linux-gnu
13+
//@[x86-32-nosse] needs-llvm-components: x86
14+
15+
#![feature(no_core, lang_items, rustc_attrs, repr_simd)]
16+
#![no_core]
17+
#![crate_type = "lib"]
18+
19+
#[lang = "sized"]
20+
trait Sized {}
21+
22+
#[lang = "copy"]
23+
trait Copy {}
24+
25+
// Ensure this type is passed without ptr indirection on targets that
26+
// require SSE2.
27+
#[repr(simd)]
28+
pub struct Sse([f32; 4]);
29+
30+
// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
31+
// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
32+
// x86-32-nosse: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
33+
#[no_mangle]
34+
pub fn sse_id(x: Sse) -> Sse {
35+
x
36+
}

0 commit comments

Comments (0)