diff --git a/libm/src/lib.rs b/libm/src/lib.rs
index 85ed5e2c9..e44e19130 100644
--- a/libm/src/lib.rs
+++ b/libm/src/lib.rs
@@ -23,6 +23,7 @@
 #![allow(clippy::unreadable_literal)]
 #![allow(clippy::zero_divided_by_zero)]
 #![forbid(unsafe_op_in_unsafe_fn)]
+#![feature(funnel_shifts)]
 
 mod libm_helper;
 mod math;
diff --git a/libm/src/math/support/big.rs b/libm/src/math/support/big.rs
index c316d93f5..379342eb2 100644
--- a/libm/src/math/support/big.rs
+++ b/libm/src/math/support/big.rs
@@ -6,8 +6,11 @@ mod tests;
 use core::{fmt, ops};
 
 use super::{DInt, HInt, Int, MinInt};
+use crate::support::Word;
 
 const U128_LO_MASK: u128 = u64::MAX as u128;
+const U128_WORDS: usize = (u128::BITS / Word::BITS) as usize;
+const U256_WORDS: usize = U128_WORDS * 2;
 
 /// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
@@ -31,6 +34,29 @@ impl u256 {
             hi: self.hi as i128,
         }
     }
+
+    /// Break the value into `Word`-sized limbs, ordered from least to most significant.
+    fn to_words(self) -> [Word; U256_WORDS] {
+        // With 64-bit words the layout is: [lo.lo(), lo.hi(), hi.lo(), hi.hi()].
+        core::array::from_fn(|idx| {
+            // The first `U128_WORDS` limbs are taken from `lo`, the rest from `hi`.
+            let half = if idx < U128_WORDS { self.lo } else { self.hi };
+            // Bit position of this limb within its 128-bit half.
+            let offset = (idx % U128_WORDS) as u32 * Word::BITS;
+            (half >> offset) as Word
+        })
+    }
+
+    /// Reassemble a value from least-significant-first words; inverse of [`Self::to_words`].
+    fn from_words(words: [Word; U256_WORDS]) -> Self {
+        let mut ret = u256::ZERO;
+        for i in 0..U128_WORDS {
+            let shift = i as u32 * Word::BITS; // limb stride; must match `to_words`
+            ret.lo |= (words[i] as u128) << shift;
+            ret.hi |= (words[i + U128_WORDS] as u128) << shift;
+        }
+        ret
+    }
 }
 
 /// A 256-bit signed integer represented as two 128-bit native-endian limbs.
@@ -58,6 +84,16 @@ impl i256 {
             hi: self.hi as u128,
         }
     }
+
+    /// Break the bit pattern into words, least significant first (same layout as `u256`).
+    fn to_words(self) -> [Word; U256_WORDS] {
+        u256::to_words(self.unsigned())
+    }
+
+    /// Rebuild a value from least-significant-first words; inverse of [`Self::to_words`].
+    fn from_words(words: [Word; U256_WORDS]) -> Self {
+        u256::signed(u256::from_words(words))
+    }
 }
 
 impl MinInt for u256 {
@@ -129,60 +165,100 @@ macro_rules! impl_common {
                 Self { lo, hi }
             }
         }
+    };
+}
 
-        impl ops::Shl<u32> for $ty {
-            type Output = Self;
+impl ops::Shr<u32> for u256 {
+    type Output = Self;
 
-            fn shl(mut self, rhs: u32) -> Self::Output {
-                debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
+    // Logical (zero-filling) right shift; `rhs < Self::BITS` is only checked in debug builds.
+    fn shr(self, rhs: u32) -> Self::Output {
+        debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
 
-                let half_bits = Self::BITS / 2;
-                let low_mask = half_bits - 1;
-                let s = rhs & low_mask;
+        if rhs < 128 {
+            let lo = u128::funnel_shr(self.hi, self.lo, rhs); // bits leaving `hi` enter `lo`
+            let hi = self.hi >> rhs;
+            Self { lo, hi }
+        } else {
+            let lo = self.hi >> (rhs - 128); // the old `lo` is shifted out entirely
+            Self { lo, hi: 0 }
+        }
+    }
+}
 
-                let lo = self.lo;
-                let hi = self.hi;
+impl ops::Shr<u32> for i256 {
+    type Output = Self;
 
-                self.lo = lo << s;
+    // Arithmetic (sign-extending) right shift, computed word-by-word on a widened array.
+    fn shr(self, rhs: u32) -> Self::Output {
+        debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
 
-                if rhs & half_bits == 0 {
-                    self.hi = (lo >> (low_mask ^ s) >> 1) as _;
-                    self.hi |= hi << s;
-                } else {
-                    self.hi = self.lo as _;
-                    self.lo = 0;
-                }
-                self
+        // Widen to twice the word count: input in the low half, upper half starts as zeros.
+        let mut words = [Word::ZERO; U256_WORDS * 2];
+        words[..U256_WORDS].copy_from_slice(&self.to_words());
+
+        if i256::SIGNED {
+            // Branchlessly fill the upper words with copies of the sign bit: all ones if
+            // the input is negative, all zeros otherwise.
+            let top_word = words[U256_WORDS - 1].signed() >> (Word::BITS - 1);
+            for x in &mut words[U256_WORDS..] {
+                *x = top_word.unsigned();
             }
         }
 
-        impl ops::Shr<u32> for $ty {
-            type Output = Self;
-
-            fn shr(mut self, rhs: u32) -> Self::Output {
-                debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
+        let shift = rhs & 255; // limit to 255 so the word indexing below stays in bounds
+        let word_shift = (shift / Word::BITS) as usize;
+        let bit_shift = shift % Word::BITS;
+
+        let mut ret: [Word; U256_WORDS] = [0; _];
+
+        // Each output word is a coarse (word-sized) shift plus a small bit shift. The loop
+        // has a constant trip count and is expected (not guaranteed) to be unrolled.
+        for i in 0..U256_WORDS {
+            if i < (U256_WORDS - 1) {
+                let hi = words[word_shift + i + 1];
+                let lo = words[word_shift + i];
+
+                ret[i] = <Word as HInt>::funnel_shr(hi, lo, bit_shift);
+            } else if i256::SIGNED {
+                // The top output word has no higher neighbor to funnel sign bits in from,
+                // so use an arithmetic shift on it to preserve the sign.
+                let mut x = words[word_shift + i].signed();
+                x >>= bit_shift;
+                ret[i] = x.unsigned();
+            } else {
+                ret[i] = words[word_shift + i] >> bit_shift;
+            }
+        }
 
-                let half_bits = Self::BITS / 2;
-                let low_mask = half_bits - 1;
-                let s = rhs & low_mask;
+        i256::from_words(ret)
+    }
+}
 
-                let lo = self.lo;
-                let hi = self.hi;
+impl ops::Shl<u32> for u256 {
+    type Output = Self;
 
-                self.hi = hi >> s;
+    #[inline(never)] // NOTE(review): blocks inlining; confirm this is not a debugging leftover
+    fn shl(self, rhs: u32) -> Self::Output {
+        debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
 
-                #[allow(unused_comparisons)]
-                if rhs & half_bits == 0 {
-                    self.lo = (hi << (low_mask ^ s) << 1) as _;
-                    self.lo |= lo >> s;
-                } else {
-                    self.lo = self.hi as _;
-                    self.hi = if hi < 0 { !0 } else { 0 };
-                }
-                self
-            }
+        if rhs < 128 {
+            let hi = u128::funnel_shl(self.hi, self.lo, rhs); // bits leaving `lo` enter `hi`
+            let lo = self.lo << rhs;
+            Self { lo, hi }
+        } else {
+            let hi = self.lo << (rhs - 128); // the old `hi` is shifted out entirely
+            Self { lo: 0, hi }
         }
-    };
+    }
+}
+
+impl ops::Shl<u32> for i256 {
+    type Output = Self;
+    /// Left shift on the raw bit pattern, delegated to the `u256` implementation.
+    fn shl(self, rhs: u32) -> Self::Output {
+        u256::signed(self.unsigned() << rhs)
+    }
+}
 
 impl_common!(i256);
diff --git a/libm/src/math/support/int_traits.rs b/libm/src/math/support/int_traits.rs
index f113f9d62..52a7a2280 100644
--- a/libm/src/math/support/int_traits.rs
+++ b/libm/src/math/support/int_traits.rs
@@ -347,6 +347,26 @@ pub trait HInt: Int {
     fn zero_widen_mul(self, rhs: Self) -> Self::D;
     /// Widening multiplication. This cannot overflow.
     fn widen_mul(self, rhs: Self) -> Self::D;
+
+    // FIXME(msrv): Use funnel shifts from `core` as a trait on `Int` when available.
+
+    /// Funnel shift left: `self` supplies the high half and `right` the low half of a
+    /// double-width value, which is shifted left by `shift`; the high half is returned.
+    #[allow(unused)]
+    fn funnel_shl(self, right: Self, shift: u32) -> Self {
+        assert!(!Self::SIGNED, "unsupported for signed integers");
+        assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
+        (Self::D::from_lo_hi(right, self) << shift).hi()
+    }
+
+    /// Funnel shift right: `self` supplies the high half and `right` the low half of a
+    /// double-width value, which is shifted right by `shift`; the low half is returned.
+    #[allow(unused)]
+    fn funnel_shr(self, right: Self, shift: u32) -> Self {
+        assert!(!Self::SIGNED, "unsupported for signed integers");
+        assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
+        (Self::D::from_lo_hi(right, self) >> shift).lo()
+    }
 }
 
 macro_rules! impl_d_int {
diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs
index f28c02104..f55e62a4a 100644
--- a/libm/src/math/support/mod.rs
+++ b/libm/src/math/support/mod.rs
@@ -30,6 +30,21 @@ pub use hex_float::{DisplayHex, Hex, hf32, hf64};
 pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
 pub use modular::linear_mul_reduction;
 
+cfg_if! {
+    if #[cfg(target_pointer_width = "16")] {
+        /// Fixed-width stand-in for `usize` on 16-bit targets (exists for using `DInt`/`HInt`).
+        pub type Word = u16;
+    } else if #[cfg(target_pointer_width = "32")] {
+        /// Fixed-width stand-in for `usize` on 32-bit targets (exists for using `DInt`/`HInt`).
+        pub type Word = u32;
+    } else if #[cfg(target_pointer_width = "64")] {
+        /// Fixed-width stand-in for `usize` on 64-bit targets (exists for using `DInt`/`HInt`).
+        pub type Word = u64;
+    } else {
+        compile_error!("unsupported pointer width");
+    }
+}
+
 /// Hint to the compiler that the current path is cold.
 pub fn cold_path() {
     #[cfg(intrinsics_enabled)]
@@ -68,3 +83,14 @@ pub unsafe fn unchecked_div_isize(x: isize, y: isize) -> isize {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// `Word` stands in for `usize`, so the two must agree in size and alignment.
+    #[test]
+    fn word_size() {
+        assert_eq!(core::mem::size_of::<Word>(), core::mem::size_of::<usize>());
+        assert_eq!(core::mem::align_of::<Word>(), core::mem::align_of::<usize>());
+    }
+}