x86 (32/64): go back to passing SIMD vectors by-ptr #141309

Merged (2 commits) on Jun 5, 2025
17 changes: 12 additions & 5 deletions compiler/rustc_monomorphize/src/mono_checks/abi_check.rs
@@ -35,8 +35,6 @@ fn do_check_simd_vector_abi<'tcx>(
is_call: bool,
loc: impl Fn() -> (Span, HirId),
) {
// We check this on all functions, including those using the "Rust" ABI.
// For the "Rust" ABI it would be a bug if the lint ever triggered, but better safe than sorry.
let feature_def = tcx.sess.target.features_for_correct_vector_abi();
let codegen_attrs = tcx.codegen_fn_attrs(def_id);
let have_feature = |feat: Symbol| {
@@ -123,8 +121,9 @@ fn do_check_wasm_abi<'tcx>(
is_call: bool,
loc: impl Fn() -> (Span, HirId),
) {
// Only proceed for `extern "C" fn` on wasm32-unknown-unknown (same check as what `adjust_for_foreign_abi` uses to call `compute_wasm_abi_info`),
// and only proceed if `wasm_c_abi_opt` indicates we should emit the lint.
// Only proceed for `extern "C" fn` on wasm32-unknown-unknown (same check as what
// `adjust_for_foreign_abi` uses to call `compute_wasm_abi_info`), and only proceed if
// `wasm_c_abi_opt` indicates we should emit the lint.
if !(tcx.sess.target.arch == "wasm32"
&& tcx.sess.target.os == "unknown"
&& tcx.wasm_c_abi_opt() == WasmCAbi::Legacy { with_lint: true }
@@ -157,8 +156,15 @@ fn check_instance_abi<'tcx>(tcx: TyCtxt<'tcx>, instance: Instance<'tcx>) {
else {
// An error will be reported during codegen if we cannot determine the ABI of this
// function.
tcx.dcx().delayed_bug("ABI computation failure should lead to compilation failure");
return;
};
// Unlike the call-site check, we do also check "Rust" ABI functions here.
// This should never trigger, *except* if we start making use of vector registers
// for the "Rust" ABI and the user disables those vector registers (which should trigger a
// warning as that's clearly disabling a "required" target feature for this target).
// Using such a function is where disabling the vector register actually can start leading
// to soundness issues, so erroring here seems good.
let loc = || {
let def_id = instance.def_id();
(
@@ -179,7 +185,8 @@ fn check_call_site_abi<'tcx>(
loc: impl Fn() -> (Span, HirId) + Copy,
) {
if callee.fn_sig(tcx).abi().is_rustic_abi() {
// we directly handle the soundness of Rust ABIs
// We directly handle the soundness of Rust ABIs -- so let's skip the majority of
// call sites to avoid a perf regression.
return;
}
let typing_env = ty::TypingEnv::fully_monomorphized();
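For context, not part of the diff: the checks above back rustc's vector-ABI diagnostics (the `abi_unsupported_vector_types` future-incompatibility lint, at the time of writing). A minimal sketch of the situation they guard against, with illustrative names (`F32x8`, `consume`) and assuming a nightly compiler for `#[repr(simd)]`:

#![feature(repr_simd)]

#[repr(simd)]
pub struct F32x8([f32; 8]); // a 256-bit vector type

// Without AVX enabled (e.g. no `-C target-feature=+avx`), passing `F32x8` by
// value over the "C" ABI has no well-defined register convention; this is the
// kind of definition (and, via `check_call_site_abi`, call site) that
// `do_check_simd_vector_abi` flags.
#[no_mangle]
pub extern "C" fn consume(v: F32x8) -> F32x8 {
    v
}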
27 changes: 5 additions & 22 deletions compiler/rustc_target/src/callconv/mod.rs
@@ -7,7 +7,7 @@ use rustc_abi::{
use rustc_macros::HashStable_Generic;

pub use crate::spec::AbiMap;
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, RustcAbi, WasmCAbi};
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, WasmCAbi};

mod aarch64;
mod amdgpu;
@@ -696,24 +696,6 @@ impl<'a, Ty> FnAbi<'a, Ty> {
_ => {}
};

// Decides whether we can pass the given SIMD argument via `PassMode::Direct`.
// May only return `true` if the target will always pass those arguments the same way,
// no matter what the user does with `-Ctarget-feature`! In other words, whatever
// target features are required to pass a SIMD value in registers must be listed in
// the `abi_required_features` for the current target and ABI.
let can_pass_simd_directly = |arg: &ArgAbi<'_, Ty>| match &*spec.arch {
// On x86, if we have SSE2 (which we have by default for x86_64), we can always pass up
// to 128-bit-sized vectors.
"x86" if spec.rustc_abi == Some(RustcAbi::X86Sse2) => arg.layout.size.bits() <= 128,
"x86_64" if spec.rustc_abi != Some(RustcAbi::X86Softfloat) => {
// FIXME once https://github.com/bytecodealliance/wasmtime/issues/10254 is fixed
// accept vectors up to 128bit rather than vectors of exactly 128bit.
arg.layout.size.bits() == 128
}
// So far, we haven't implemented this logic for any other target.
_ => false,
};

for (arg_idx, arg) in self
.args
.iter_mut()
@@ -813,9 +795,10 @@ impl<'a, Ty> FnAbi<'a, Ty> {
// target feature sets. Some more information about this
// issue can be found in #44367.
//
// Note that the intrinsic ABI is exempt here as those are not
// real functions anyway, and the backend expects very specific types.
if spec.simd_types_indirect && !can_pass_simd_directly(arg) {
// We *could* do better in some cases, e.g. on x86_64 targets where SSE2 is
// required. However, it turns out that that makes LLVM worse at optimizing this
// code, so we pass things indirectly even there. See #139029 for more on that.
if spec.simd_types_indirect {
arg.make_indirect();
}
}
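To make the effect of unconditionally calling `make_indirect()` concrete, here is a rough sketch (not part of the diff; `F32x4` and `id` are illustrative names, and the IR in the comments is approximate) of how a 128-bit vector argument and return on x86-64 is lowered before and after this change; the `abi-x86-sse.rs` test below checks the new form:

#![feature(repr_simd)]

#[repr(simd)]
pub struct F32x4([f32; 4]);

// Before this PR (with SSE2 guaranteed), roughly:
//   define <4 x float> @id(<4 x float> %v)
// After this PR, SIMD values go back through memory:
//   define void @id(ptr sret([16 x i8]) align 16 %_0, ptr align 16 %v)
#[no_mangle]
pub fn id(v: F32x4) -> F32x4 {
    v
}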
5 changes: 3 additions & 2 deletions tests/codegen/abi-x86-sse.rs
@@ -27,8 +27,9 @@ trait Copy {}
#[repr(simd)]
pub struct Sse([f32; 4]);

// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// FIXME: due to #139029 we are passing them all indirectly.
// x86-64: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
// x86-32: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
// x86-32-nosse: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
#[no_mangle]
pub fn sse_id(x: Sse) -> Sse {
20 changes: 5 additions & 15 deletions tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
@@ -1,14 +1,8 @@
//
//@ compile-flags: -C no-prepopulate-passes
// LLVM IR isn't very portable and the one tested here depends on the ABI
// which is different between x86 (where we use SSE registers) and others.
// `x86-64` and `x86-32-sse2` are identical, but compiletest does not support
// taking the union of multiple `only` annotations.
//@ revisions: x86-64 x86-32-sse2 other
//@[x86-64] only-x86_64
//@[x86-32-sse2] only-rustc_abi-x86-sse2
//@[other] ignore-rustc_abi-x86-sse2
//@[other] ignore-x86_64
// 32bit MSVC does not align things properly so we suppress high alignment annotations (#112480)
//@ ignore-i686-pc-windows-msvc
//@ ignore-i686-pc-windows-gnu

#![crate_type = "lib"]
#![allow(non_camel_case_types)]
@@ -47,9 +41,7 @@ pub fn build_array_s(x: [f32; 4]) -> S<4> {
#[no_mangle]
pub fn build_array_transmute_s(x: [f32; 4]) -> S<4> {
// CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
// x86-32: ret <4 x float> %[[VAL:.+]]
// x86-64: ret <4 x float> %[[VAL:.+]]
// other: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
// CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}

@@ -64,8 +56,6 @@ pub fn build_array_t(x: [f32; 4]) -> T {
#[no_mangle]
pub fn build_array_transmute_t(x: [f32; 4]) -> T {
// CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
// x86-32: ret <4 x float> %[[VAL:.+]]
// x86-64: ret <4 x float> %[[VAL:.+]]
// other: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
// CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}
16 changes: 9 additions & 7 deletions tests/codegen/simd/packed-simd.rs
@@ -30,16 +30,18 @@ fn load<T, const N: usize>(v: PackedSimd<T, N>) -> FullSimd<T, N> {
}
}

// CHECK-LABEL: define <3 x float> @square_packed_full(ptr{{[a-z_ ]*}} align 4 {{[^,]*}})
// CHECK-LABEL: square_packed_full
// CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align (8|16)]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
// CHECK-SAME: ptr{{[a-z_ ]*}} align 4
#[no_mangle]
pub fn square_packed_full(x: PackedSimd<f32, 3>) -> FullSimd<f32, 3> {
// The unoptimized version of this is not very interesting to check
// since `load` does not get inlined.
// opt3-NEXT: start:
// opt3-NEXT: load <3 x float>
// CHECK-NEXT: start
// noopt: alloca [[RET_TYPE]], [[RET_ALIGN]]
// CHECK: load <3 x float>
let x = load(x);
// opt3-NEXT: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
// opt3-NEXT: ret <3 x float> [[VREG:%[a-z0-9_]+]]
// CHECK: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
// CHECK-NEXT: store <3 x float> [[VREG]], ptr [[RET_VREG]], [[RET_ALIGN]]
// CHECK-NEXT: ret void
unsafe { intrinsics::simd_mul(x, x) }
}

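For readers without the full test file: the two wrapper types are, approximately (this is a sketch, not the verbatim test source), a `#[repr(simd, packed)]` and a plain `#[repr(simd)]` array wrapper. The packed layout keeps only the element alignment, which is why the argument above is `align 4`, while the `sret` return slot keeps the full vector alignment:

#![feature(repr_simd)]

// Packed: size is exactly N * size_of::<T>() and alignment is align_of::<T>(),
// so a PackedSimd<f32, 3> is 12 bytes with align 4.
#[repr(simd, packed)]
pub struct PackedSimd<T, const N: usize>([T; N]);

// Full: padded and aligned like a hardware vector register.
#[repr(simd)]
pub struct FullSimd<T, const N: usize>([T; N]);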