diff --git a/compiler/rustc_codegen_cranelift/src/abi/mod.rs b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
index 2466bfe60c7ab..983584f58aba3 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
@@ -122,7 +122,7 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
         &mut self,
         name: &str,
         params: Vec<AbiParam>,
-        returns: Vec<AbiParam>,
+        mut returns: Vec<AbiParam>,
         args: &[Value],
     ) -> Cow<'_, [Value]> {
         // Pass i128 arguments by-ref on Windows.
@@ -146,15 +146,19 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
             (params, args.into())
         };
 
-        // Return i128 using a return area pointer on Windows and s390x.
-        let adjust_ret_param =
-            if self.tcx.sess.target.is_like_windows || self.tcx.sess.target.arch == "s390x" {
-                returns.len() == 1 && returns[0].value_type == types::I128
-            } else {
-                false
-            };
+        let ret_single_i128 = returns.len() == 1 && returns[0].value_type == types::I128;
+        if ret_single_i128 && self.tcx.sess.target.is_like_windows {
+            // Return i128 using the vector ABI on Windows
+            returns[0].value_type = types::I64X2;
+
+            let ret = self.lib_call_unadjusted(name, params, returns, &args)[0];
 
-        if adjust_ret_param {
+            // FIXME(bytecodealliance/wasmtime#6104) use bitcast instead of store to get from i64x2 to i128
+            let ret_ptr = self.create_stack_slot(16, 16);
+            ret_ptr.store(self, ret, MemFlags::trusted());
+            Cow::Owned(vec![ret_ptr.load(self, types::I128, MemFlags::trusted())])
+        } else if ret_single_i128 && self.tcx.sess.target.arch == "s390x" {
+            // Return i128 using a return area pointer on s390x.
             let mut params = params;
             let mut args = args.to_vec();
 
diff --git a/compiler/rustc_codegen_cranelift/src/cast.rs b/compiler/rustc_codegen_cranelift/src/cast.rs
index 0b5cb1547fc69..4463631c524be 100644
--- a/compiler/rustc_codegen_cranelift/src/cast.rs
+++ b/compiler/rustc_codegen_cranelift/src/cast.rs
@@ -96,25 +96,9 @@ pub(crate) fn clif_int_or_float_cast(
                 },
             );
 
-            if fx.tcx.sess.target.is_like_windows {
-                let ret = fx.lib_call(
-                    &name,
-                    vec![AbiParam::new(from_ty)],
-                    vec![AbiParam::new(types::I64X2)],
-                    &[from],
-                )[0];
-                // FIXME(bytecodealliance/wasmtime#6104) use bitcast instead of store to get from i64x2 to i128
-                let ret_ptr = fx.create_stack_slot(16, 16);
-                ret_ptr.store(fx, ret, MemFlags::trusted());
-                ret_ptr.load(fx, types::I128, MemFlags::trusted())
-            } else {
-                fx.lib_call(
-                    &name,
-                    vec![AbiParam::new(from_ty)],
-                    vec![AbiParam::new(types::I128)],
-                    &[from],
-                )[0]
-            }
+            fx.lib_call(&name, vec![AbiParam::new(from_ty)], vec![AbiParam::new(types::I128)], &[
+                from,
+            ])[0]
         } else if to_ty == types::I8 || to_ty == types::I16 {
             // FIXME implement fcvt_to_*int_sat.i8/i16
             let val = if to_signed {
diff --git a/compiler/rustc_codegen_cranelift/src/codegen_i128.rs b/compiler/rustc_codegen_cranelift/src/codegen_i128.rs
index dcfd7ddabbc42..df5a79086fa3e 100644
--- a/compiler/rustc_codegen_cranelift/src/codegen_i128.rs
+++ b/compiler/rustc_codegen_cranelift/src/codegen_i128.rs
@@ -33,28 +33,14 @@ pub(crate) fn maybe_codegen<'tcx>(
                 (BinOp::Rem, true) => "__modti3",
                 _ => unreachable!(),
             };
-            if fx.tcx.sess.target.is_like_windows {
-                let args = [lhs.load_scalar(fx), rhs.load_scalar(fx)];
-                let ret = fx.lib_call(
-                    name,
-                    vec![AbiParam::new(types::I128), AbiParam::new(types::I128)],
-                    vec![AbiParam::new(types::I64X2)],
-                    &args,
-                )[0];
-                // FIXME(bytecodealliance/wasmtime#6104) use bitcast instead of store to get from i64x2 to i128
-                let ret_place = CPlace::new_stack_slot(fx, lhs.layout());
-                ret_place.to_ptr().store(fx, ret, MemFlags::trusted());
-                Some(ret_place.to_cvalue(fx))
-            } else {
-                let args = [lhs.load_scalar(fx), rhs.load_scalar(fx)];
-                let ret_val = fx.lib_call(
-                    name,
-                    vec![AbiParam::new(types::I128), AbiParam::new(types::I128)],
-                    vec![AbiParam::new(types::I128)],
-                    &args,
-                )[0];
-                Some(CValue::by_val(ret_val, lhs.layout()))
-            }
+            let args = [lhs.load_scalar(fx), rhs.load_scalar(fx)];
+            let ret_val = fx.lib_call(
+                name,
+                vec![AbiParam::new(types::I128), AbiParam::new(types::I128)],
+                vec![AbiParam::new(types::I128)],
+                &args,
+            )[0];
+            Some(CValue::by_val(ret_val, lhs.layout()))
         }
         BinOp::Lt | BinOp::Le | BinOp::Eq | BinOp::Ge | BinOp::Gt | BinOp::Ne | BinOp::Cmp => None,
         BinOp::Shl | BinOp::ShlUnchecked | BinOp::Shr | BinOp::ShrUnchecked => None,
diff --git a/compiler/rustc_target/src/callconv/x86_win64.rs b/compiler/rustc_target/src/callconv/x86_win64.rs
index 0944bda26875d..816564d2fed8b 100644
--- a/compiler/rustc_target/src/callconv/x86_win64.rs
+++ b/compiler/rustc_target/src/callconv/x86_win64.rs
@@ -1,4 +1,4 @@
-use rustc_abi::{BackendRepr, Float, Primitive};
+use rustc_abi::{BackendRepr, Float, Integer, Primitive, RegKind, Size};
 
 use crate::abi::call::{ArgAbi, FnAbi, Reg};
 use crate::spec::HasTargetSpec;
@@ -6,7 +6,7 @@ use crate::spec::HasTargetSpec;
 // Win64 ABI: https://docs.microsoft.com/en-us/cpp/build/parameter-passing
 
 pub(crate) fn compute_abi_info<Ty>(_cx: &impl HasTargetSpec, fn_abi: &mut FnAbi<'_, Ty>) {
-    let fixup = |a: &mut ArgAbi<'_, Ty>| {
+    let fixup = |a: &mut ArgAbi<'_, Ty>, is_ret: bool| {
         match a.layout.backend_repr {
             BackendRepr::Uninhabited | BackendRepr::Memory { sized: false } => {}
             BackendRepr::ScalarPair(..) | BackendRepr::Memory { sized: true } => {
@@ -23,11 +23,16 @@ pub(crate) fn compute_abi_info<Ty>(_cx: &impl HasTargetSpec, fn_abi: &mut FnAbi<
                 // (probably what clang calls "illegal vectors").
             }
             BackendRepr::Scalar(scalar) => {
-                // Match what LLVM does for `f128` so that `compiler-builtins` builtins match up
-                // with what LLVM expects.
-                if a.layout.size.bytes() > 8
+                if is_ret && matches!(scalar.primitive(), Primitive::Int(Integer::I128, _)) {
+                    // `i128` is returned in xmm0 by Clang and GCC
+                    // FIXME(#134288): This may change for the `-msvc` targets in the future.
+                    let reg = Reg { kind: RegKind::Vector, size: Size::from_bits(128) };
+                    a.cast_to(reg);
+                } else if a.layout.size.bytes() > 8
                     && !matches!(scalar.primitive(), Primitive::Float(Float::F128))
                 {
+                    // Match what LLVM does for `f128` so that `compiler-builtins` builtins match up
+                    // with what LLVM expects.
                     a.make_indirect();
                 } else {
                     a.extend_integer_width_to(32);
@@ -37,8 +42,9 @@ pub(crate) fn compute_abi_info<Ty>(_cx: &impl HasTargetSpec, fn_abi: &mut FnAbi<
     };
 
     if !fn_abi.ret.is_ignore() {
-        fixup(&mut fn_abi.ret);
+        fixup(&mut fn_abi.ret, true);
     }
+
     for arg in fn_abi.args.iter_mut() {
         if arg.is_ignore() && arg.layout.is_zst() {
             // Windows ABIs do not talk about ZST since such types do not exist in MSVC.
@@ -49,7 +55,7 @@ pub(crate) fn compute_abi_info<Ty>(_cx: &impl HasTargetSpec, fn_abi: &mut FnAbi<
             arg.make_indirect_from_ignore();
             continue;
         }
-        fixup(arg);
+        fixup(arg, false);
     }
     // FIXME: We should likely also do something about ZST return types, similar to above.
     // However, that's non-trivial due to `()`.
diff --git a/tests/codegen/i128-x86-callconv.rs b/tests/codegen/i128-x86-callconv.rs
new file mode 100644
index 0000000000000..9a9c9002fc026
--- /dev/null
+++ b/tests/codegen/i128-x86-callconv.rs
@@ -0,0 +1,79 @@
+//! Verify that Rust implements the expected calling convention for `i128`/`u128`.
+
+// Eliminate intermediate instructions during `nop` tests
+//@ compile-flags: -Copt-level=1
+
+//@ add-core-stubs
+//@ revisions: MSVC MINGW
+//@ [MSVC] needs-llvm-components: x86
+//@ [MINGW] needs-llvm-components: x86
+//@ [MSVC] compile-flags: --target x86_64-pc-windows-msvc
+//@ [MINGW] compile-flags: --target x86_64-pc-windows-gnu
+//@ [MSVC] filecheck-flags: --check-prefix=WIN
+//@ [MINGW] filecheck-flags: --check-prefix=WIN
+
+#![crate_type = "lib"]
+#![no_std]
+#![no_core]
+#![feature(no_core, lang_items)]
+
+extern crate minicore;
+
+extern "C" {
+    fn extern_call(arg0: i128);
+    fn extern_ret() -> i128;
+}
+
+#[no_mangle]
+pub extern "C" fn pass(_arg0: u32, arg1: i128) {
+    // CHECK-LABEL: @pass(
+    // i128 is passed indirectly on Windows. It should load the pointer to the stack and pass
+    // a pointer to that allocation.
+    // WIN-SAME: %_arg0, ptr{{.*}} %arg1)
+    // WIN: [[PASS:%[_0-9]+]] = alloca [16 x i8], align 16
+    // WIN: [[LOADED:%[_0-9]+]] = load i128, ptr %arg1
+    // WIN: store i128 [[LOADED]], ptr [[PASS]]
+    // WIN: call void @extern_call
+    unsafe { extern_call(arg1) };
+}
+
+// Check that we produce the correct return ABI
+#[no_mangle]
+pub extern "C" fn ret(_arg0: u32, arg1: i128) -> i128 {
+    // CHECK-LABEL: @ret(
+    // i128 is returned in xmm0 on Windows
+    // FIXME(#134288): This may change for the `-msvc` targets in the future.
+    // WIN-SAME: i32{{.*}} %_arg0, ptr{{.*}} %arg1)
+    // WIN: [[LOADED:%[_0-9]+]] = load <16 x i8>, ptr %arg1
+    // WIN-NEXT: ret <16 x i8> [[LOADED]]
+    arg1
+}
+
+// Check that we consume the correct return ABI
+#[no_mangle]
+pub extern "C" fn forward(dst: *mut i128) {
+    // CHECK-LABEL: @forward
+    // WIN-SAME: ptr{{.*}} %dst)
+    // WIN: [[RETURNED:%[_0-9]+]] = tail call <16 x i8> @extern_ret()
+    // WIN: store <16 x i8> [[RETURNED]], ptr %dst
+    // WIN: ret void
+    unsafe { *dst = extern_ret() };
+}
+
+#[repr(C)]
+struct RetAggregate {
+    a: i32,
+    b: i128,
+}
+
+#[no_mangle]
+pub extern "C" fn ret_aggregate(_arg0: u32, arg1: i128) -> RetAggregate {
+    // CHECK-LABEL: @ret_aggregate(
+    // Aggregates should also be returned indirectly
+    // WIN-SAME: ptr{{.*}}sret([32 x i8]){{.*}}[[RET:%[_0-9]+]], i32{{.*}}%_arg0, ptr{{.*}}%arg1)
+    // WIN: [[LOADED:%[_0-9]+]] = load i128, ptr %arg1
+    // WIN: [[GEP:%[_0-9]+]] = getelementptr{{.*}}, ptr [[RET]]
+    // WIN: store i128 [[LOADED]], ptr [[GEP]]
+    // WIN: ret void
+    RetAggregate { a: 1, b: arg1 }
+}