diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 481f75f337d63..c6aae89f1e51e 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -560,6 +560,12 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 return Ok(());
             }
 
+            sym::amdgpu_dispatch_ptr => {
+                let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]);
+                // Relying on `LLVMBuildPointerCast` to produce an addrspacecast
+                self.pointercast(val, self.type_ptr())
+            }
+
             _ if name.as_str().starts_with("simd_") => {
                 // Unpack non-power-of-2 #[repr(packed, simd)] arguments.
                 // This gives them the expected layout of a regular #[repr(simd)] vector.
diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
index f4fae40d8828f..f5ee9406f4bf1 100644
--- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
@@ -112,6 +112,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                 | sym::unreachable
                 | sym::cold_path
                 | sym::breakpoint
+                | sym::amdgpu_dispatch_ptr
                 | sym::assert_zero_valid
                 | sym::assert_mem_uninitialized_valid
                 | sym::assert_inhabited
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index c84c1a8ca16d8..d3d167f6e2544 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -70,6 +70,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
         | sym::add_with_overflow
         | sym::aggregate_raw_ptr
         | sym::align_of
+        | sym::amdgpu_dispatch_ptr
         | sym::assert_inhabited
         | sym::assert_mem_uninitialized_valid
         | sym::assert_zero_valid
@@ -286,6 +287,7 @@ pub(crate) fn check_intrinsic_type(
     let (n_tps, n_cts, inputs, output) = match intrinsic_name {
         sym::autodiff => (4, 0, vec![param(0), param(1), param(2)], param(3)),
         sym::abort => (0, 0, vec![], tcx.types.never),
+        sym::amdgpu_dispatch_ptr => (0, 0, vec![], Ty::new_imm_ptr(tcx, tcx.types.unit)),
         sym::unreachable => (0, 0, vec![], tcx.types.never),
         sym::breakpoint => (0, 0, vec![], tcx.types.unit),
         sym::size_of | sym::align_of | sym::variant_count => (1, 0, vec![], tcx.types.usize),
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index 4080c1cd59ec7..98db0df593bec 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -458,6 +458,7 @@ symbols! {
         alu32,
         always,
         amdgpu,
+        amdgpu_dispatch_ptr,
         analysis,
         and,
         and_then,
diff --git a/library/core/src/intrinsics/gpu.rs b/library/core/src/intrinsics/gpu.rs
new file mode 100644
index 0000000000000..9e7624841d0c6
--- /dev/null
+++ b/library/core/src/intrinsics/gpu.rs
@@ -0,0 +1,23 @@
+//! Intrinsics for GPU targets.
+//!
+//! Intrinsics in this module are intended for use on GPU targets.
+//! They can be target specific but in general GPU targets are similar.
+
+#![unstable(feature = "gpu_intrinsics", issue = "none")]
+
+/// Returns a pointer to the HSA kernel dispatch packet.
+///
+/// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet.
+/// The dispatch packet contains the workgroup size, launch size and other data.
+/// The content is defined by the [HSA Platform System Architecture Specification],
+/// which is implemented e.g. in AMD's [hsa.h].
+/// The intrinsic returns a unit pointer so that rustc does not need to know the packet struct.
+/// The pointer is valid for the whole lifetime of the program.
+///
+/// [HSA Platform System Architecture Specification]: https://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf
+/// [hsa.h]: https://github.com/ROCm/rocm-systems/blob/rocm-7.1.0/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa.h#L2959
+#[rustc_nounwind]
+#[rustc_intrinsic]
+#[cfg(target_arch = "amdgpu")]
+#[must_use = "returns a pointer that does nothing unless used"]
+pub fn amdgpu_dispatch_ptr() -> *const ();
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index 20f34036b25c9..8d112b4c5d187 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -60,6 +60,7 @@ use crate::{mem, ptr};
 
 mod bounds;
 pub mod fallback;
+pub mod gpu;
 pub mod mir;
 pub mod simd;
 
diff --git a/library/coretests/tests/floats/f128.rs b/library/coretests/tests/floats/f128.rs
deleted file mode 100644
index 8e4f0c9899e1c..0000000000000
--- a/library/coretests/tests/floats/f128.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-// FIXME(f16_f128): only tested on platforms that have symbols and aren't buggy
-#![cfg(target_has_reliable_f128)]
-
-use super::assert_biteq;
-
-// Note these tolerances make sense around zero, but not for more extreme exponents.
-
-/// Default tolerances. Works for values that should be near precise but not exact. Roughly
-/// the precision carried by `100 * 100`.
-#[allow(unused)]
-const TOL: f128 = 1e-12;
-
-/// For operations that are near exact, usually not involving math of different
-/// signs.
-#[allow(unused)]
-const TOL_PRECISE: f128 = 1e-28;
-
-// FIXME(f16_f128,miri): many of these have to be disabled since miri does not yet support
-// the intrinsics.
-
-#[test]
-fn test_from() {
-    assert_biteq!(f128::from(false), 0.0);
-    assert_biteq!(f128::from(true), 1.0);
-    assert_biteq!(f128::from(u8::MIN), 0.0);
-    assert_biteq!(f128::from(42_u8), 42.0);
-    assert_biteq!(f128::from(u8::MAX), 255.0);
-    assert_biteq!(f128::from(i8::MIN), -128.0);
-    assert_biteq!(f128::from(42_i8), 42.0);
-    assert_biteq!(f128::from(i8::MAX), 127.0);
-    assert_biteq!(f128::from(u16::MIN), 0.0);
-    assert_biteq!(f128::from(42_u16), 42.0);
-    assert_biteq!(f128::from(u16::MAX), 65535.0);
-    assert_biteq!(f128::from(i16::MIN), -32768.0);
-    assert_biteq!(f128::from(42_i16), 42.0);
-    assert_biteq!(f128::from(i16::MAX), 32767.0);
-    assert_biteq!(f128::from(u32::MIN), 0.0);
-    assert_biteq!(f128::from(42_u32), 42.0);
-    assert_biteq!(f128::from(u32::MAX), 4294967295.0);
-    assert_biteq!(f128::from(i32::MIN), -2147483648.0);
-    assert_biteq!(f128::from(42_i32), 42.0);
-    assert_biteq!(f128::from(i32::MAX), 2147483647.0);
-    // FIXME(f16_f128): Uncomment these tests once the From<{u64,i64}> impls are added.
-    // assert_eq!(f128::from(u64::MIN), 0.0);
-    // assert_eq!(f128::from(42_u64), 42.0);
-    // assert_eq!(f128::from(u64::MAX), 18446744073709551615.0);
-    // assert_eq!(f128::from(i64::MIN), -9223372036854775808.0);
-    // assert_eq!(f128::from(42_i64), 42.0);
-    // assert_eq!(f128::from(i64::MAX), 9223372036854775807.0);
-}
diff --git a/library/coretests/tests/floats/f16.rs b/library/coretests/tests/floats/f16.rs
deleted file mode 100644
index 3cff4259de54f..0000000000000
--- a/library/coretests/tests/floats/f16.rs
+++ /dev/null
@@ -1,35 +0,0 @@
-// FIXME(f16_f128): only tested on platforms that have symbols and aren't buggy
-#![cfg(target_has_reliable_f16)]
-
-use super::assert_biteq;
-
-/// Tolerance for results on the order of 10.0e-2
-#[allow(unused)]
-const TOL_N2: f16 = 0.0001;
-
-/// Tolerance for results on the order of 10.0e+0
-#[allow(unused)]
-const TOL_0: f16 = 0.01;
-
-/// Tolerance for results on the order of 10.0e+2
-#[allow(unused)]
-const TOL_P2: f16 = 0.5;
-
-/// Tolerance for results on the order of 10.0e+4
-#[allow(unused)]
-const TOL_P4: f16 = 10.0;
-
-// FIXME(f16_f128,miri): many of these have to be disabled since miri does not yet support
-// the intrinsics.
-
-#[test]
-fn test_from() {
-    assert_biteq!(f16::from(false), 0.0);
-    assert_biteq!(f16::from(true), 1.0);
-    assert_biteq!(f16::from(u8::MIN), 0.0);
-    assert_biteq!(f16::from(42_u8), 42.0);
-    assert_biteq!(f16::from(u8::MAX), 255.0);
-    assert_biteq!(f16::from(i8::MIN), -128.0);
-    assert_biteq!(f16::from(42_i8), 42.0);
-    assert_biteq!(f16::from(i8::MAX), 127.0);
-}
diff --git a/library/coretests/tests/floats/mod.rs b/library/coretests/tests/floats/mod.rs
index 63d5b8fb2c6e9..87e21b21f310d 100644
--- a/library/coretests/tests/floats/mod.rs
+++ b/library/coretests/tests/floats/mod.rs
@@ -375,9 +375,6 @@ macro_rules! float_test {
     };
 }
 
-mod f128;
-mod f16;
-
 float_test! {
     name: num,
     attrs: {
@@ -1582,3 +1579,78 @@ float_test! {
         assert_biteq!((flt(-3.2)).mul_add(2.4, neg_inf), neg_inf);
     }
 }
+
+float_test! {
+    name: from,
+    attrs: {
+        f16: #[cfg(any(miri, target_has_reliable_f16))],
+        f128: #[cfg(any(miri, target_has_reliable_f128))],
+    },
+    test<Float> {
+        assert_biteq!(Float::from(false), Float::ZERO);
+        assert_biteq!(Float::from(true), Float::ONE);
+
+        assert_biteq!(Float::from(u8::MIN), Float::ZERO);
+        assert_biteq!(Float::from(42_u8), 42.0);
+        assert_biteq!(Float::from(u8::MAX), 255.0);
+
+        assert_biteq!(Float::from(i8::MIN), -128.0);
+        assert_biteq!(Float::from(42_i8), 42.0);
+        assert_biteq!(Float::from(i8::MAX), 127.0);
+    }
+}
+
+float_test! {
+    name: from_u16_i16,
+    attrs: {
+        f16: #[cfg(false)],
+        const f16: #[cfg(false)],
+        f128: #[cfg(any(miri, target_has_reliable_f128))],
+    },
+    test<Float> {
+        assert_biteq!(Float::from(u16::MIN), Float::ZERO);
+        assert_biteq!(Float::from(42_u16), 42.0);
+        assert_biteq!(Float::from(u16::MAX), 65535.0);
+        assert_biteq!(Float::from(i16::MIN), -32768.0);
+        assert_biteq!(Float::from(42_i16), 42.0);
+        assert_biteq!(Float::from(i16::MAX), 32767.0);
+    }
+}
+
+float_test! {
+    name: from_u32_i32,
+    attrs: {
+        f16: #[cfg(false)],
+        const f16: #[cfg(false)],
+        f32: #[cfg(false)],
+        const f32: #[cfg(false)],
+        f128: #[cfg(any(miri, target_has_reliable_f128))],
+    },
+    test<Float> {
+        assert_biteq!(Float::from(u32::MIN), Float::ZERO);
+        assert_biteq!(Float::from(42_u32), 42.0);
+        assert_biteq!(Float::from(u32::MAX), 4294967295.0);
+        assert_biteq!(Float::from(i32::MIN), -2147483648.0);
+        assert_biteq!(Float::from(42_i32), 42.0);
+        assert_biteq!(Float::from(i32::MAX), 2147483647.0);
+    }
+}
+
+// FIXME(f16_f128): Uncomment and adapt these tests once the From<{u64,i64}> impls are added.
+// float_test! {
+//     name: from_u64_i64,
+//     attrs: {
+//         f16: #[cfg(false)],
+//         f32: #[cfg(false)],
+//         f64: #[cfg(false)],
+//         f128: #[cfg(any(miri, target_has_reliable_f128))],
+//     },
+//     test<Float> {
+//         assert_biteq!(Float::from(u64::MIN), Float::ZERO);
+//         assert_biteq!(Float::from(42_u64), 42.0);
+//         assert_biteq!(Float::from(u64::MAX), 18446744073709551615.0);
+//         assert_biteq!(Float::from(i64::MIN), -9223372036854775808.0);
+//         assert_biteq!(Float::from(42_i64), 42.0);
+//         assert_biteq!(Float::from(i64::MAX), 9223372036854775807.0);
+//     }
+// }
diff --git a/tests/codegen-llvm/amdgpu-dispatch-ptr.rs b/tests/codegen-llvm/amdgpu-dispatch-ptr.rs
new file mode 100644
index 0000000000000..00bde96c3d596
--- /dev/null
+++ b/tests/codegen-llvm/amdgpu-dispatch-ptr.rs
@@ -0,0 +1,27 @@
+// Tests the amdgpu_dispatch_ptr intrinsic.
+
+//@ compile-flags: --crate-type=rlib --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
+//@ needs-llvm-components: amdgpu
+//@ add-minicore
+#![feature(intrinsics, no_core, rustc_attrs)]
+#![no_core]
+
+extern crate minicore;
+
+pub struct DispatchPacket {
+    pub header: u16,
+    pub setup: u16,
+    pub workgroup_size_x: u16, // and more
+}
+
+#[rustc_intrinsic]
+#[rustc_nounwind]
+fn amdgpu_dispatch_ptr() -> *const ();
+
+// CHECK-LABEL: @get_dispatch_data
+// CHECK: %[[ORIG_PTR:[^ ]+]] = {{(tail )?}}call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: %[[PTR:[^ ]+]] = addrspacecast ptr addrspace(4) %[[ORIG_PTR]] to ptr
+#[unsafe(no_mangle)]
+pub fn get_dispatch_data() -> &'static DispatchPacket {
+    unsafe { &*(amdgpu_dispatch_ptr() as *const _) }
+}