diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b84bc29..3aac993 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,8 +11,7 @@ jobs: strategy: matrix: rust: - - 1.57.0 - - stable + - nightly steps: - name: Checkout uses: actions/checkout@v2 @@ -23,16 +22,13 @@ jobs: toolchain: ${{ matrix.rust }} override: true - - name: Build with minimal features (no_std) - run: cargo build --verbose --no-default-features --features no-std-float + # - name: Build with minimal features (no_std) + # run: cargo build --verbose --no-default-features --features no-std-float - name: Run tests for tiny-skia-path working-directory: path run: cargo test --verbose - - name: Run tests without SIMD - run: cargo test --verbose --no-default-features --features png-format - - name: Run tests with SSE2 env: RUSTFLAGS: -Ctarget-feature=+sse2 @@ -62,7 +58,7 @@ jobs: - name: Install toolchain uses: actions-rs/toolchain@v1 with: - toolchain: stable + toolchain: nightly override: true target: wasm32-wasi @@ -71,11 +67,8 @@ jobs: curl https://wasmtime.dev/install.sh -sSf | bash echo "$HOME/.wasmtime/bin" >> $GITHUB_PATH - - name: Build with minimal features (no_std) - run: cargo build --target wasm32-wasi --verbose --no-default-features --features no-std-float - - - name: Run tests without SIMD - run: cargo test --target wasm32-wasi --verbose --no-default-features --features png-format + # - name: Build with minimal features (no_std) + # run: cargo build --target wasm32-wasi --verbose --no-default-features --features no-std-float - name: Run tests with SIMD128 env: @@ -91,18 +84,15 @@ jobs: - name: Install toolchain uses: actions-rs/toolchain@v1 with: - toolchain: stable + toolchain: nightly override: true target: aarch64-unknown-linux-gnu - name: Install cross run: cargo install cross - - name: Build with minimal features (no_std) - run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float - - - name: Run tests without SIMD - run: cross test --target aarch64-unknown-linux-gnu --verbose --no-default-features --features png-format + # - name: Build with minimal features (no_std) + # run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float - name: Run tests with Neon run: cross test --target aarch64-unknown-linux-gnu diff --git a/Cargo.toml b/Cargo.toml index e80a97c..52de6e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,17 +24,12 @@ png = { version = "0.17", optional = true } tiny-skia-path = { version = "0.10.0", path = "path", default-features = false } [features] -default = ["std", "simd", "png-format"] +default = ["std", "png-format"] # Enables the use of the standard library. Deactivate this and activate the no-std-float # feature to compile for targets that don't have std. std = ["tiny-skia-path/std"] no-std-float = ["tiny-skia-path/no-std-float"] -# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128) -# and AArch64 (Neon). -# Has no effect on other targets. Present mainly for testing. -simd = [] - # Allows loading and saving `Pixmap` as PNG. png-format = ["std", "png"] diff --git a/path/src/f32x2_t.rs b/path/src/f32x2_t.rs deleted file mode 100644 index e471f42..0000000 --- a/path/src/f32x2_t.rs +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#[cfg(all(not(feature = "std"), feature = "no-std-float"))] -use crate::NoStdFloat; - -// Right now, there are no visible benefits of using SIMD for f32x2. So we don't. -/// A pair of f32 numbers. -/// -/// Mainly for internal use. Do not rely on it! -#[allow(non_camel_case_types)] -#[derive(Copy, Clone, Default, PartialEq, Debug)] -pub struct f32x2(pub [f32; 2]); - -impl f32x2 { - /// Creates a new pair. - pub fn new(a: f32, b: f32) -> f32x2 { - f32x2([a, b]) - } - - /// Creates a new pair from a single value. - pub fn splat(x: f32) -> f32x2 { - f32x2([x, x]) - } - - /// Returns an absolute value. - pub fn abs(self) -> f32x2 { - f32x2([self.x().abs(), self.y().abs()]) - } - - /// Returns a minimum value. - pub fn min(self, other: f32x2) -> f32x2 { - f32x2([pmin(self.x(), other.x()), pmin(self.y(), other.y())]) - } - - /// Returns a maximum value. - pub fn max(self, other: f32x2) -> f32x2 { - f32x2([pmax(self.x(), other.x()), pmax(self.y(), other.y())]) - } - - /// Returns a maximum of both values. - pub fn max_component(self) -> f32 { - pmax(self.x(), self.y()) - } - - /// Returns the first value. - pub fn x(&self) -> f32 { - self.0[0] - } - - /// Returns the second value. - pub fn y(&self) -> f32 { - self.0[1] - } -} - -impl core::ops::Add for f32x2 { - type Output = f32x2; - - fn add(self, other: f32x2) -> f32x2 { - f32x2([self.x() + other.x(), self.y() + other.y()]) - } -} - -impl core::ops::Sub for f32x2 { - type Output = f32x2; - - fn sub(self, other: f32x2) -> f32x2 { - f32x2([self.x() - other.x(), self.y() - other.y()]) - } -} - -impl core::ops::Mul for f32x2 { - type Output = f32x2; - - fn mul(self, other: f32x2) -> f32x2 { - f32x2([self.x() * other.x(), self.y() * other.y()]) - } -} - -impl core::ops::Div for f32x2 { - type Output = f32x2; - - fn div(self, other: f32x2) -> f32x2 { - f32x2([self.x() / other.x(), self.y() / other.y()]) - } -} - -// A faster and more forgiving f32 min/max implementation. -// -// Unlike std one, we do not care about NaN. - -fn pmax(a: f32, b: f32) -> f32 { - if a < b { - b - } else { - a - } -} - -fn pmin(a: f32, b: f32) -> f32 { - if b < a { - b - } else { - a - } -} diff --git a/path/src/f32x4_t.rs b/path/src/f32x4_t.rs deleted file mode 100644 index e591b55..0000000 --- a/path/src/f32x4_t.rs +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Right now, there are no visible benefits of using SIMD for f32x4. So we don't. 
-#[derive(Default, Clone, Copy, PartialEq, Debug)] -#[repr(C, align(16))] -pub struct f32x4(pub [f32; 4]); - -impl f32x4 { - pub fn max(self, rhs: Self) -> Self { - Self([ - self.0[0].max(rhs.0[0]), - self.0[1].max(rhs.0[1]), - self.0[2].max(rhs.0[2]), - self.0[3].max(rhs.0[3]), - ]) - } - - pub fn min(self, rhs: Self) -> Self { - Self([ - self.0[0].min(rhs.0[0]), - self.0[1].min(rhs.0[1]), - self.0[2].min(rhs.0[2]), - self.0[3].min(rhs.0[3]), - ]) - } -} - -impl core::ops::Add for f32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self([ - self.0[0] + rhs.0[0], - self.0[1] + rhs.0[1], - self.0[2] + rhs.0[2], - self.0[3] + rhs.0[3], - ]) - } -} - -impl core::ops::AddAssign for f32x4 { - fn add_assign(&mut self, rhs: f32x4) { - *self = *self + rhs; - } -} - -impl core::ops::Sub for f32x4 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self([ - self.0[0] - rhs.0[0], - self.0[1] - rhs.0[1], - self.0[2] - rhs.0[2], - self.0[3] - rhs.0[3], - ]) - } -} - -impl core::ops::Mul for f32x4 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - Self([ - self.0[0] * rhs.0[0], - self.0[1] * rhs.0[1], - self.0[2] * rhs.0[2], - self.0[3] * rhs.0[3], - ]) - } -} - -impl core::ops::MulAssign for f32x4 { - fn mul_assign(&mut self, rhs: f32x4) { - *self = *self * rhs; - } -} diff --git a/path/src/lib.rs b/path/src/lib.rs index e80a70c..23eb068 100644 --- a/path/src/lib.rs +++ b/path/src/lib.rs @@ -12,6 +12,7 @@ //! //! Note that all types use single precision floats (`f32`), just like [Skia](https://skia.org/). +#![feature(portable_simd)] #![no_std] #![warn(missing_docs)] #![warn(missing_copy_implementations)] @@ -36,8 +37,6 @@ extern crate std; extern crate alloc; mod dash; -mod f32x2_t; -mod f32x4_t; mod floating_point; mod path; mod path_builder; @@ -49,7 +48,6 @@ mod stroker; mod transform; pub use dash::StrokeDash; -pub use f32x2_t::f32x2; pub use floating_point::*; pub use path::*; pub use path_builder::*; @@ -86,16 +84,6 @@ impl Point { Point { x, y } } - /// Creates a new `Point` from `f32x2`. - pub fn from_f32x2(r: f32x2) -> Self { - Point::from_xy(r.x(), r.y()) - } - - /// Converts a `Point` into a `f32x2`. - pub fn to_f32x2(&self) -> f32x2 { - f32x2::new(self.x, self.y) - } - /// Creates a point at 0x0 position. pub fn zero() -> Self { Point { x: 0.0, y: 0.0 } diff --git a/path/src/path_geometry.rs b/path/src/path_geometry.rs index d4c3746..ebd101c 100644 --- a/path/src/path_geometry.rs +++ b/path/src/path_geometry.rs @@ -10,9 +10,10 @@ #![allow(missing_docs)] +use core::simd::f32x2; + use crate::{Point, Transform}; -use crate::f32x2_t::f32x2; use crate::floating_point::FLOAT_PI; use crate::scalar::{Scalar, SCALAR_NEARLY_ZERO, SCALAR_ROOT_2_OVER_2}; @@ -22,6 +23,21 @@ use crate::path_builder::PathDirection; #[cfg(all(not(feature = "std"), feature = "no-std-float"))] use crate::NoStdFloat; +trait PointExt { + fn from_f32x2(r: f32x2) -> Self; + fn to_f32x2(&self) -> f32x2; +} + +impl PointExt for Point { + fn from_f32x2(r: f32x2) -> Self { + Point::from_xy(r.as_array()[0], r.as_array()[1]) + } + + fn to_f32x2(&self) -> f32x2 { + f32x2::from_array([self.x, self.y]) + } +} + // use for : eval(t) == A * t^2 + B * t + C #[derive(Clone, Copy, Default, Debug)] pub struct QuadCoeff { diff --git a/path/src/rect.rs b/path/src/rect.rs index d199f9d..06b7f1c 100644 --- a/path/src/rect.rs +++ b/path/src/rect.rs @@ -345,7 +345,7 @@ impl Rect { /// /// Returns None if count is zero or if Point array contains an infinity or NaN. 
pub fn from_points(points: &[Point]) -> Option<Self> { - use crate::f32x4_t::f32x4; + use core::simd::{f32x4, SimdFloat}; if points.is_empty() { return None; } @@ -356,13 +356,13 @@ impl Rect { let mut max; if points.len() & 1 != 0 { let pt = points[0]; - min = f32x4([pt.x, pt.y, pt.x, pt.y]); + min = f32x4::from_array([pt.x, pt.y, pt.x, pt.y]); max = min; offset += 1; } else { let pt0 = points[0]; let pt1 = points[1]; - min = f32x4([pt0.x, pt0.y, pt1.x, pt1.y]); + min = f32x4::from_array([pt0.x, pt0.y, pt1.x, pt1.y]); max = min; offset += 2; } @@ -371,17 +371,17 @@ impl Rect { while offset != points.len() { let pt0 = points[offset + 0]; let pt1 = points[offset + 1]; - let xy = f32x4([pt0.x, pt0.y, pt1.x, pt1.y]); + let xy = f32x4::from_array([pt0.x, pt0.y, pt1.x, pt1.y]); accum *= xy; - min = min.min(xy); - max = max.max(xy); + min = min.simd_min(xy); + max = max.simd_max(xy); offset += 2; } let all_finite = accum * f32x4::default() == f32x4::default(); - let min: [f32; 4] = min.0; - let max: [f32; 4] = max.0; + let min: &[f32; 4] = min.as_array(); + let max: &[f32; 4] = max.as_array(); if all_finite { Rect::from_ltrb( min[0].min(min[2]), diff --git a/src/lib.rs b/src/lib.rs index 38de1cb..f11f8a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ and a user should manage the world transform, clipping mask and style manually. See the `examples/` directory for usage examples. */ +#![feature(portable_simd)] #![no_std] #![warn(missing_docs)] #![warn(missing_copy_implementations)] @@ -52,7 +53,6 @@ mod pipeline; mod pixmap; mod scan; mod shaders; -mod wide; mod painter; // Keep it under `pixmap` for a better order in the docs. diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs index bcb113c..66a3ecb 100644 --- a/src/pipeline/highp.rs +++ b/src/pipeline/highp.rs @@ -15,11 +15,12 @@ For some reason, we are almost 2x slower. Maybe because Skia uses clang's vector and we're using a manual implementation. */ +use std::simd::{f32x8, i32x8, u32x8, StdFloat, SimdFloat, SimdPartialOrd, SimdPartialEq}; + use crate::{PremultipliedColorU8, SpreadMode, PixmapRef}; use crate::geom::ScreenIntRect; use crate::pixmap::SubPixmapMut; -use crate::wide::{f32x8, i32x8, u32x8}; pub const STAGE_WIDTH: usize = 8; @@ -125,6 +126,16 @@ pub fn fn_ptr(f: StageFn) -> *const () { f as *const () } +trait F32x8Ext { + fn normalize(self) -> Self; +} + +impl F32x8Ext for f32x8 { + fn normalize(self) -> Self { + self.simd_max(f32x8::default()).simd_min(f32x8::splat(1.0)) + } +} + #[inline(never)] pub fn start( functions: &[StageFn], @@ -209,19 +220,19 @@ fn move_destination_to_source(p: &mut Pipeline) { } fn clamp_0(p: &mut Pipeline) { - p.r = p.r.max(f32x8::default()); - p.g = p.g.max(f32x8::default()); - p.b = p.b.max(f32x8::default()); - p.a = p.a.max(f32x8::default()); + p.r = p.r.simd_max(f32x8::default()); + p.g = p.g.simd_max(f32x8::default()); + p.b = p.b.simd_max(f32x8::default()); + p.a = p.a.simd_max(f32x8::default()); p.next_stage(); } fn clamp_a(p: &mut Pipeline) { - p.r = p.r.min(f32x8::splat(1.0)); - p.g = p.g.min(f32x8::splat(1.0)); - p.b = p.b.min(f32x8::splat(1.0)); - p.a = p.a.min(f32x8::splat(1.0)); + p.r = p.r.simd_min(f32x8::splat(1.0)); + p.g = p.g.simd_min(f32x8::splat(1.0)); + p.b = p.b.simd_min(f32x8::splat(1.0)); + p.a = p.a.simd_min(f32x8::splat(1.0)); p.next_stage(); } @@ -301,10 +312,10 @@ fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 { // Exclusive -> inclusive.
let w = ulp_sub(pixmap.width() as f32); let h = ulp_sub(pixmap.height() as f32); - x = x.max(f32x8::default()).min(f32x8::splat(w)); - y = y.max(f32x8::default()).min(f32x8::splat(h)); + x = x.simd_max(f32x8::default()).simd_min(f32x8::splat(w)); + y = y.simd_max(f32x8::default()).simd_min(f32x8::splat(h)); - (y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast() + (y.trunc().cast::<i32>() * i32x8::splat(pixmap.width() as i32) + x.trunc().cast::<i32>()).cast() } #[inline(always)] @@ -405,15 +416,15 @@ blend_fn!(source_in, |s, _, _, da| s * da); blend_fn!(destination_in, |_, d, sa, _| d * sa); blend_fn!(source_out, |s, _, _, da| s * inv(da)); blend_fn!(destination_out, |_, d, sa, _| d * inv(sa)); -blend_fn!(source_over, |s, d, sa, _| mad(d, inv(sa), s)); -blend_fn!(destination_over, |s, d, _, da| mad(s, inv(da), d)); +blend_fn!(source_over, |s, d: f32x8, sa, _| d.mul_add(inv(sa), s)); +blend_fn!(destination_over, |s: f32x8, d, _, da| s.mul_add(inv(da), d)); blend_fn!(modulate, |s, d, _, _| s * d); blend_fn!(multiply, |s, d, sa, da| s * inv(da) + d * inv(sa) + s * d); blend_fn!(screen, |s, d, _, _| s + d - s * d); blend_fn!(xor, |s, d, sa, da| s * inv(da) + d * inv(sa)); // Wants a type for some reason. -blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).min(f32x8::splat(1.0))); +blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).simd_min(f32x8::splat(1.0))); macro_rules! blend_fn2 { ($name:ident, $f:expr) => { fn $name(p: &mut Pipeline) { p.r = $f(p.r, p.dr, p.a, p.da); p.g = $f(p.g, p.dg, p.a, p.da); p.b = $f(p.b, p.db, p.a, p.da); - p.a = mad(p.da, inv(p.a), p.a); + p.a = p.da.mul_add(inv(p.a), p.a); p.next_stage(); } }; } -blend_fn2!(darken, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).max(d * sa)); -blend_fn2!(lighten, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).min(d * sa)); -blend_fn2!(difference, |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).min(d * sa))); +blend_fn2!(darken, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).simd_max(d * sa)); +blend_fn2!(lighten, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).simd_min(d * sa)); +blend_fn2!(difference, |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).simd_min(d * sa))); blend_fn2!(exclusion, |s: f32x8, d, _, _| s + d - two(s * d)); blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| - d.cmp_eq(da).blend( + d.simd_eq(da).select( d + s * inv(da), - s.cmp_eq(f32x8::default()).blend( + s.simd_eq(f32x8::default()).select( d * inv(sa), - sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa) + sa * (da - da.simd_min((da - d) * sa * s.recip())) + s * inv(da) + d * inv(sa) ) ) ); blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| - d.cmp_eq(f32x8::default()).blend( + d.simd_eq(f32x8::default()).select( s * inv(da), - s.cmp_eq(sa).blend( + s.simd_eq(sa).select( s + d * inv(sa), - sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa) + sa * da.simd_min((d * sa) * (sa - s).recip()) + s * inv(da) + d * inv(sa) ) ) ); blend_fn2!(hard_light, |s: f32x8, d: f32x8, sa, da| - s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend( + s * inv(da) + d * inv(sa) + two(s).simd_le(sa).select( two(s * d), sa * da - two((da - d) * (sa - s)) ) ); blend_fn2!(overlay, |s: f32x8, d: f32x8, sa, da| - s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend( + s * inv(da) + d * inv(sa) + two(d).simd_le(da).select( two(s * d), sa * da - two((da - d) * (sa - s)) ) ); blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| { - let m =
da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default()); + let m = da.simd_gt(f32x8::default()).select(d / da, f32x8::default()); let s2 = two(s); let m4 = two(two(m)); @@ -481,9 +492,9 @@ blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| { let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m; let lite_dst = m.sqrt() - m; let lite_src = d * sa + da * (s2 - sa) - * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3? + * two(two(d)).simd_le(da).select(dark_dst, lite_dst); // 2 or 3? - s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) // 1 or (2 or 3)? + s * inv(da) + d * inv(sa) + s2.simd_le(sa).select(dark_src, lite_src) // 1 or (2 or 3)? }); // We're basing our implementation of non-separable blend modes on @@ -600,7 +611,7 @@ fn luminosity_k( #[inline(always)] fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 { - r.max(g.max(b)) - r.min(g.min(b)) + r.simd_max(g.simd_max(b)) - r.simd_min(g.simd_min(b)) } #[inline(always)] @@ -610,13 +621,13 @@ fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 { #[inline(always)] fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) { - let mn = r.min(g.min(*b)); - let mx = r.max(g.max(*b)); + let mn = r.simd_min(g.simd_min(*b)); + let mx = r.simd_max(g.simd_max(*b)); let sat = mx - mn; // Map min channel to 0, max channel to s, and scale the middle proportionally. - let scale = |c| sat.cmp_eq(f32x8::default()) - .blend(f32x8::default(), (c - mn) * s / sat); + let scale = |c| sat.simd_eq(f32x8::default()) + .select(f32x8::default(), (c - mn) * s / sat); *r = scale(*r); *g = scale(*g); @@ -633,14 +644,14 @@ fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) { #[inline(always)] fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) { - let mn = r.min(g.min(*b)); - let mx = r.max(g.max(*b)); + let mn = r.simd_min(g.simd_min(*b)); + let mx = r.simd_max(g.simd_max(*b)); let l = lum(*r, *g, *b); let clip = |mut c| { - c = mx.cmp_ge(f32x8::default()).blend(c, l + (c - l) * l / (l - mn)); - c = mx.cmp_gt(a).blend(l + (c - l) * (a - l) / (mx - l), c); - c = c.max(f32x8::default()); // Sometimes without this we may dip just a little negative. + c = mx.simd_ge(f32x8::default()).select(c, l + (c - l) * l / (l - mn)); + c = mx.simd_gt(a).select(l + (c - l) * (a - l) / (mx - l), c); + c = c.simd_max(f32x8::default()); // Sometimes without this we may dip just a little negative. 
c }; @@ -652,10 +663,10 @@ fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) { pub fn source_over_rgba(p: &mut Pipeline) { let pixels = p.pixmap_dst.slice4_at_xy(p.dx, p.dy); load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); - p.r = mad(p.dr, inv(p.a), p.r); - p.g = mad(p.dg, inv(p.a), p.g); - p.b = mad(p.db, inv(p.a), p.b); - p.a = mad(p.da, inv(p.a), p.a); + p.r = p.dr.mul_add(inv(p.a), p.r); + p.g = p.dg.mul_add(inv(p.a), p.g); + p.b = p.db.mul_add(inv(p.a), p.b); + p.a = p.da.mul_add(inv(p.a), p.a); store_8888(&p.r, &p.g, &p.b, &p.a, pixels); p.next_stage(); @@ -664,10 +675,10 @@ pub fn source_over_rgba(p: &mut Pipeline) { pub fn source_over_rgba_tail(p: &mut Pipeline) { let pixels = p.pixmap_dst.slice_at_xy(p.dx, p.dy); load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); - p.r = mad(p.dr, inv(p.a), p.r); - p.g = mad(p.dg, inv(p.a), p.g); - p.b = mad(p.db, inv(p.a), p.b); - p.a = mad(p.da, inv(p.a), p.a); + p.r = p.dr.mul_add(inv(p.a), p.r); + p.g = p.dg.mul_add(inv(p.a), p.g); + p.b = p.db.mul_add(inv(p.a), p.b); + p.a = p.da.mul_add(inv(p.a), p.a); store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels); p.next_stage(); @@ -676,8 +687,8 @@ pub fn source_over_rgba_tail(p: &mut Pipeline) { fn transform(p: &mut Pipeline) { let ts = &p.ctx.transform; - let tr = mad(p.r, f32x8::splat(ts.sx), mad(p.g, f32x8::splat(ts.kx), f32x8::splat(ts.tx))); - let tg = mad(p.r, f32x8::splat(ts.ky), mad(p.g, f32x8::splat(ts.sy), f32x8::splat(ts.ty))); + let tr = p.r.mul_add(f32x8::splat(ts.sx), p.g.mul_add(f32x8::splat(ts.kx), f32x8::splat(ts.tx))); + let tg = p.r.mul_add(f32x8::splat(ts.ky), p.g.mul_add(f32x8::splat(ts.sy), f32x8::splat(ts.ty))); p.r = tr; p.g = tg; @@ -757,11 +768,9 @@ fn bicubic(p: &mut Pipeline) { #[inline(always)] fn bicubic_near(t: f32x8) -> f32x8 { // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 - mad( - t, - mad(t, - mad( - f32x8::splat(-21.0/18.0), + t.mul_add( + t.mul_add( + f32x8::splat(-21.0/18.0).mul_add( t, f32x8::splat(27.0/18.0), ), @@ -774,7 +783,7 @@ fn bicubic_near(t: f32x8) -> f32x8 { #[inline(always)] fn bicubic_far(t: f32x8) -> f32x8 { // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) - (t * t) * mad(f32x8::splat(7.0/18.0), t, f32x8::splat(-6.0/18.0)) + (t * t) * f32x8::splat(7.0/18.0).mul_add(t, f32x8::splat(-6.0/18.0)) } #[inline(always)] @@ -803,10 +812,10 @@ fn sampler_2x2( sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa); let w = wx[i] * wy[j]; - *r = mad(w, rr, *r); - *g = mad(w, gg, *g); - *b = mad(w, bb, *b); - *a = mad(w, aa, *a); + *r = w.mul_add(rr, *r); + *g = w.mul_add(gg, *g); + *b = w.mul_add(bb, *b); + *a = w.mul_add(aa, *a); x += one; } @@ -841,10 +850,10 @@ fn sampler_4x4( sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa); let w = wx[i] * wy[j]; - *r = mad(w, rr, *r); - *g = mad(w, gg, *g); - *b = mad(w, bb, *b); - *a = mad(w, aa, *a); + *r = w.mul_add(rr, *r); + *g = w.mul_add(gg, *g); + *b = w.mul_add(bb, *b); + *a = w.mul_add(aa, *a); x += one; } @@ -904,7 +913,7 @@ fn gradient(p: &mut Pipeline) { let mut idx = u32x8::default(); for i in 1..ctx.len { let tt = ctx.t_values[i].get(); - let n: u32x8 = bytemuck::cast([ + let n = u32x8::from_array([ (t[0] >= tt) as u32, (t[1] >= tt) as u32, (t[2] >= tt) as u32, @@ -925,7 +934,7 @@ fn gradient_lookup( ctx: &super::GradientCtx, idx: &u32x8, t: f32x8, r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8, ) { - let idx: [u32; 8] = bytemuck::cast(*idx); + let idx: &[u32; 8] = 
idx.as_array(); macro_rules! gather { ($d:expr, $c:ident) => { // Surprisingly, but bound checking doesn't affect the performance. // And since `idx` can contain any number, we should leave it in place. @@ -954,20 +963,20 @@ fn gradient_lookup( let bb = gather!(&ctx.biases, b); let ba = gather!(&ctx.biases, a); - *r = mad(t, fr, br); - *g = mad(t, fg, bg); - *b = mad(t, fb, bb); - *a = mad(t, fa, ba); + *r = t.mul_add(fr, br); + *g = t.mul_add(fg, bg); + *b = t.mul_add(fb, bb); + *a = t.mul_add(fa, ba); } fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { let ctx = &p.ctx.evenly_spaced_2_stop_gradient; let t = p.r; - p.r = mad(t, f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r)); - p.g = mad(t, f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g)); - p.b = mad(t, f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b)); - p.a = mad(t, f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a)); + p.r = t.mul_add(f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r)); + p.g = t.mul_add(f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g)); + p.b = t.mul_add(f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b)); + p.a = t.mul_add(f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a)); p.next_stage(); } @@ -1012,12 +1021,12 @@ fn mask_2pt_conical_degenerates(p: &mut Pipeline) { let ctx = &mut p.ctx.two_point_conical_gradient; let t = p.r; - let is_degenerate = t.cmp_le(f32x8::default()) | t.cmp_ne(t); - p.r = is_degenerate.blend(f32x8::default(), t); + let is_degenerate = t.simd_le(f32x8::default()) | t.simd_ne(t); + p.r = is_degenerate.select(f32x8::default(), t); - let is_not_degenerate = !is_degenerate.to_u32x8_bitcast(); - let is_not_degenerate: [u32; 8] = bytemuck::cast(is_not_degenerate); - ctx.mask = bytemuck::cast([ + let is_not_degenerate = !is_degenerate.to_int().cast::<u32>(); + let is_not_degenerate = is_not_degenerate.as_array(); + ctx.mask = u32x8::from_array([ if is_not_degenerate[0] != 0 { !0 } else { 0 }, if is_not_degenerate[1] != 0 { !0 } else { 0 }, if is_not_degenerate[2] != 0 { !0 } else { 0 }, @@ -1034,10 +1043,10 @@ fn apply_vector_mask(p: &mut Pipeline) { let ctx = &p.ctx.two_point_conical_gradient; - p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); - p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); - p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); - p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); + p.r = (p.r.cast::<u32>() & ctx.mask).cast::<f32>(); + p.g = (p.g.cast::<u32>() & ctx.mask).cast::<f32>(); + p.b = (p.b.cast::<u32>() & ctx.mask).cast::<f32>(); + p.a = (p.a.cast::<u32>() & ctx.mask).cast::<f32>(); p.next_stage(); } @@ -1145,7 +1154,7 @@ fn store_8888_tail( #[inline(always)] fn unnorm(v: &f32x8) -> i32x8 { - (v.max(f32x8::default()).min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round_int() + (v.simd_max(f32x8::default()).simd_min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round().cast() } #[inline(always)] @@ -1158,12 +1167,7 @@ fn two(v: f32x8) -> f32x8 { v + v } -#[inline(always)] -fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 { - f * m + a -} - #[inline(always)] fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 { - mad(to - from, t, from) + (to - from).mul_add(t, from) } diff --git a/src/pipeline/lowp.rs b/src/pipeline/lowp.rs index df0a1d3..a7f4de0 100644 --- a/src/pipeline/lowp.rs +++ b/src/pipeline/lowp.rs @@ -28,10 +28,11 @@ we are still 40-60% behind Skia built for Haswell. On ARM AArch64 the story is different and explicit SIMD make our code up to 2-3x faster.
*/ +use std::simd::{u16x16, f32x16, StdFloat, SimdFloat, SimdPartialOrd}; + use crate::PremultipliedColorU8; use crate::pixmap::SubPixmapMut; -use crate::wide::{f32x8, u16x16, f32x16}; use crate::geom::ScreenIntRect; pub const STAGE_WIDTH: usize = 16; @@ -142,6 +143,16 @@ pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool { core::ptr::eq(f1 as *const (), f2 as *const ()) } +trait F32x16Ext { + fn normalize(self) -> Self; +} + +impl F32x16Ext for f32x16 { + fn normalize(self) -> Self { + self.simd_max(f32x16::default()).simd_min(f32x16::splat(1.0)) + } +} + #[inline(never)] pub fn start( functions: &[StageFn], @@ -234,10 +245,10 @@ fn uniform_color(p: &mut Pipeline) { } fn seed_shader(p: &mut Pipeline) { - let iota = f32x16( - f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]), - ); + let iota = f32x16::from_array([ + 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, + 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5, + ]); let x = f32x16::splat(p.dx as f32) + iota; let y = f32x16::splat(p.dy as f32 + 0.5); @@ -285,7 +296,7 @@ pub fn load_dst_u8_tail(p: &mut Pipeline) { pub fn store_u8(p: &mut Pipeline) { let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy); - let a = p.a.as_slice(); + let a = p.a.as_array(); data[ 0] = a[ 0] as u8; data[ 1] = a[ 1] as u8; @@ -309,7 +320,7 @@ pub fn store_u8(p: &mut Pipeline) { pub fn store_u8_tail(p: &mut Pipeline) { let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy); - let a = p.a.as_slice(); + let a = p.a.as_array(); // This is better than `for i in 0..tail`, because this way the compiler // knows that we have only 16 steps and slices access is guarantee to be valid. @@ -331,7 +342,7 @@ fn load_mask_u8(p: &mut Pipeline) { let mut c = u16x16::default(); for i in 0..p.tail { - c.0[i] = u16::from(p.mask_ctx.data[offset + i]); + c.as_mut_array()[i] = u16::from(p.mask_ctx.data[offset + i]); } p.r = u16x16::splat(0); @@ -347,7 +358,7 @@ fn mask_u8(p: &mut Pipeline) { let mut c = u16x16::default(); for i in 0..p.tail { - c.0[i] = u16::from(p.mask_ctx.data[offset + i]); + c.as_mut_array()[i] = u16::from(p.mask_ctx.data[offset + i]); } if c == u16x16::default() { @@ -365,7 +376,7 @@ fn mask_u8(p: &mut Pipeline) { fn scale_u8(p: &mut Pipeline) { // Load u8xTail and cast it to u16x16. let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); - let c = u16x16([ + let c = u16x16::from_array([ u16::from(data[0]), u16::from(data[1]), 0, @@ -395,7 +406,7 @@ fn scale_u8(p: &mut Pipeline) { fn lerp_u8(p: &mut Pipeline) { // Load u8xTail and cast it to u16x16. let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); - let c = u16x16([ + let c = u16x16::from_array([ u16::from(data[0]), u16::from(data[1]), 0, @@ -470,7 +481,7 @@ blend_fn!(screen, |s, d, _, _| s + d - div255(s * d)); blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa))); // Wants a type for some reason. -blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255))); +blend_fn!(plus, |s: u16x16, d: u16x16, _, _| (s + d).min(u16x16::splat(255))); macro_rules! blend_fn2 { @@ -487,25 +498,30 @@ macro_rules! 
blend_fn2 { }; } -blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa)))); -blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa)))); +blend_fn2!(darken, |s: u16x16, d: u16x16, sa: u16x16, da: u16x16| s + d - div255((s * da).max(d * sa))); +blend_fn2!(lighten, |s: u16x16, d: u16x16, sa: u16x16, da: u16x16| s + d - div255((s * da).min(d * sa))); blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d)); -blend_fn2!(difference, |s: u16x16, d, sa, da| - s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa)))); +blend_fn2!(difference, |s: u16x16, d, sa, da: u16x16| + s + d - u16x16::splat(2) * div255((s * da).min(d * sa))); blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| { div255(s * inv(da) + d * inv(sa) - + (s+s).cmp_le(&sa).blend( + + blend((s+s).simd_le(sa).to_int().cast(), u16x16::splat(2) * s * d, sa * da - u16x16::splat(2) * (sa-s)*(da-d) ) ) }); +#[inline] +fn blend(a: u16x16, t: u16x16, e: u16x16) -> u16x16 { + (t & a) | (e & !a) +} + blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| { div255(s * inv(da) + d * inv(sa) - + (d+d).cmp_le(&da).blend( + + blend((d+d).simd_le(da).to_int().cast(), u16x16::splat(2) * s * d, sa * da - u16x16::splat(2) * (sa-s)*(da-d) ) @@ -542,8 +558,8 @@ fn transform(p: &mut Pipeline) { let x = join(&p.r, &p.g); let y = join(&p.b, &p.a); - let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx))); - let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty))); + let nx = x.mul_add(f32x16::splat(ts.sx), y.mul_add(f32x16::splat(ts.kx), f32x16::splat(ts.tx))); + let ny = x.mul_add(f32x16::splat(ts.ky), y.mul_add(f32x16::splat(ts.sy), f32x16::splat(ts.ty))); split(&nx, &mut p.r, &mut p.g); split(&ny, &mut p.b, &mut p.a); @@ -588,24 +604,24 @@ fn gradient(p: &mut Pipeline) { let mut idx = u16x16::splat(0); for i in 1..ctx.len { let tt = ctx.t_values[i].get(); - let t0: [f32; 8] = t.0.into(); - let t1: [f32; 8] = t.1.into(); - idx.0[ 0] += (t0[0] >= tt) as u16; - idx.0[ 1] += (t0[1] >= tt) as u16; - idx.0[ 2] += (t0[2] >= tt) as u16; - idx.0[ 3] += (t0[3] >= tt) as u16; - idx.0[ 4] += (t0[4] >= tt) as u16; - idx.0[ 5] += (t0[5] >= tt) as u16; - idx.0[ 6] += (t0[6] >= tt) as u16; - idx.0[ 7] += (t0[7] >= tt) as u16; - idx.0[ 8] += (t1[0] >= tt) as u16; - idx.0[ 9] += (t1[1] >= tt) as u16; - idx.0[10] += (t1[2] >= tt) as u16; - idx.0[11] += (t1[3] >= tt) as u16; - idx.0[12] += (t1[4] >= tt) as u16; - idx.0[13] += (t1[5] >= tt) as u16; - idx.0[14] += (t1[6] >= tt) as u16; - idx.0[15] += (t1[7] >= tt) as u16; + let t = t.as_array(); + let idx = idx.as_mut_array(); + idx[ 0] += (t[ 0] >= tt) as u16; + idx[ 1] += (t[ 1] >= tt) as u16; + idx[ 2] += (t[ 2] >= tt) as u16; + idx[ 3] += (t[ 3] >= tt) as u16; + idx[ 4] += (t[ 4] >= tt) as u16; + idx[ 5] += (t[ 5] >= tt) as u16; + idx[ 6] += (t[ 6] >= tt) as u16; + idx[ 7] += (t[ 7] >= tt) as u16; + idx[ 8] += (t[ 8] >= tt) as u16; + idx[ 9] += (t[ 9] >= tt) as u16; + idx[10] += (t[10] >= tt) as u16; + idx[11] += (t[11] >= tt) as u16; + idx[12] += (t[12] >= tt) as u16; + idx[13] += (t[13] >= tt) as u16; + idx[14] += (t[14] >= tt) as u16; + idx[15] += (t[15] >= tt) as u16; } gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a); @@ -617,10 +633,10 @@ fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { let t = join(&p.r, &p.g); round_f32_to_u16( - mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)), - mad(t, f32x16::splat(ctx.factor.g), 
f32x16::splat(ctx.bias.g)), - mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)), - mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)), + t.mul_add(f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)), + t.mul_add(f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)), + t.mul_add(f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)), + t.mul_add(f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)), &mut p.r, &mut p.g, &mut p.b, &mut p.a, ); @@ -643,32 +659,29 @@ fn gradient_lookup( ctx: &super::GradientCtx, idx: &u16x16, t: f32x16, r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, ) { + let idx = idx.as_array(); macro_rules! gather { ($d:expr, $c:ident) => { // Surprisingly, but bound checking doesn't affect the performance. // And since `idx` can contain any number, we should leave it in place. - f32x16( - f32x8::from([ - $d[idx.0[ 0] as usize].$c, - $d[idx.0[ 1] as usize].$c, - $d[idx.0[ 2] as usize].$c, - $d[idx.0[ 3] as usize].$c, - $d[idx.0[ 4] as usize].$c, - $d[idx.0[ 5] as usize].$c, - $d[idx.0[ 6] as usize].$c, - $d[idx.0[ 7] as usize].$c, - ]), - f32x8::from([ - $d[idx.0[ 8] as usize].$c, - $d[idx.0[ 9] as usize].$c, - $d[idx.0[10] as usize].$c, - $d[idx.0[11] as usize].$c, - $d[idx.0[12] as usize].$c, - $d[idx.0[13] as usize].$c, - $d[idx.0[14] as usize].$c, - $d[idx.0[15] as usize].$c, - ]), - ) + f32x16::from_array([ + $d[idx[ 0] as usize].$c, + $d[idx[ 1] as usize].$c, + $d[idx[ 2] as usize].$c, + $d[idx[ 3] as usize].$c, + $d[idx[ 4] as usize].$c, + $d[idx[ 5] as usize].$c, + $d[idx[ 6] as usize].$c, + $d[idx[ 7] as usize].$c, + $d[idx[ 8] as usize].$c, + $d[idx[ 9] as usize].$c, + $d[idx[10] as usize].$c, + $d[idx[11] as usize].$c, + $d[idx[12] as usize].$c, + $d[idx[13] as usize].$c, + $d[idx[14] as usize].$c, + $d[idx[15] as usize].$c, + ]) }; } @@ -683,10 +696,10 @@ fn gradient_lookup( let ba = gather!(&ctx.biases, a); round_f32_to_u16( - mad(t, fr, br), - mad(t, fg, bg), - mad(t, fb, bb), - mad(t, fa, ba), + t.mul_add(fr, br), + t.mul_add(fg, bg), + t.mul_add(fb, bb), + t.mul_add(fa, ba), r, g, b, a, ); } @@ -704,10 +717,42 @@ fn round_f32_to_u16( let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); let af = af * f32x16::splat(255.0) + f32x16::splat(0.5); - rf.save_to_u16x16(r); - gf.save_to_u16x16(g); - bf.save_to_u16x16(b); - af.save_to_u16x16(a); + save_to_u16x16(rf, r); + save_to_u16x16(gf, g); + save_to_u16x16(bf, b); + save_to_u16x16(af, a); +} + +// TODO: optimize +// This method is too heavy and shouldn't be inlined. +fn save_to_u16x16(src: f32x16, dst: &mut u16x16) { + // Do not use to_i32x8, because it involves rounding, + // and Skia cast's without it. 
+ + // let n0: [f32; 8] = self.0.into(); + // let n1: [f32; 8] = self.1.into(); + let n = src.as_array(); + let dst = dst.as_mut_array(); + + dst[0] = n[0] as u16; + dst[1] = n[1] as u16; + dst[2] = n[2] as u16; + dst[3] = n[3] as u16; + + dst[4] = n[4] as u16; + dst[5] = n[5] as u16; + dst[6] = n[6] as u16; + dst[7] = n[7] as u16; + + dst[8] = n[8] as u16; + dst[9] = n[9] as u16; + dst[10] = n[10] as u16; + dst[11] = n[11] as u16; + + dst[12] = n[12] as u16; + dst[13] = n[13] as u16; + dst[14] = n[14] as u16; + dst[15] = n[15] as u16; } pub fn just_return(_: &mut Pipeline) { @@ -723,28 +768,28 @@ fn load_8888( data: &[PremultipliedColorU8; STAGE_WIDTH], r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, ) { - *r = u16x16([ + *r = u16x16::from_array([ data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, ]); - *g = u16x16([ + *g = u16x16::from_array([ data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, ]); - *b = u16x16([ + *b = u16x16::from_array([ data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, ]); - *a = u16x16([ + *a = u16x16::from_array([ data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, @@ -769,10 +814,10 @@ fn store_8888( r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, data: &mut [PremultipliedColorU8; STAGE_WIDTH], ) { - let r = r.as_slice(); - let g = g.as_slice(); - let b = b.as_slice(); - let a = a.as_slice(); + let r = r.as_array(); + let g = g.as_array(); + let b = b.as_array(); + let a = a.as_array(); data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); @@ -797,10 +842,10 @@ fn store_8888_tail( r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, tail: usize, data: &mut [PremultipliedColorU8], ) { - let r = r.as_slice(); - let g = g.as_slice(); - let b = b.as_slice(); - let a = a.as_slice(); + let r = r.as_array(); + let g = g.as_array(); + let b = b.as_array(); + let a = a.as_array(); // This is better than `for i in 0..tail`, because this way the compiler // knows that we have only 16 steps and slices access is guarantee to be valid. 
@@ -818,7 +863,7 @@ fn store_8888_tail( #[inline(always)] fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { - *a = u16x16([ + *a = u16x16::from_array([ data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, @@ -830,7 +875,7 @@ fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { fn div255(v: u16x16) -> u16x16 { // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available, // but it doesn't affect performance much and breaks reproducible result. Ignore it. - // NOTE: the compiler does not replace the devision with a shift. + // NOTE: the compiler does not replace the division with a shift. (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256) } @@ -852,9 +897,9 @@ fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 { #[inline(always)] fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) { // We're splitting f32x16 (512bit) into two u16x16 (256 bit). - let data: [u8; 64] = bytemuck::cast(*v); - let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0); - let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0); + let data: [u8; 64] = bytemuck::cast(*v.as_array()); + let d0: &mut [u8; 32] = bytemuck::cast_mut(lo.as_mut_array()); + let d1: &mut [u8; 32] = bytemuck::cast_mut(hi.as_mut_array()); d0.copy_from_slice(&data[0..32]); d1.copy_from_slice(&data[32..64]); @@ -864,20 +909,14 @@ fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) { fn join(lo: &u16x16, hi: &u16x16) -> f32x16 { // We're joining two u16x16 (256 bit) into f32x16 (512bit). - let d0: [u8; 32] = bytemuck::cast(lo.0); - let d1: [u8; 32] = bytemuck::cast(hi.0); + let d0: [u8; 32] = bytemuck::cast(*lo.as_array()); + let d1: [u8; 32] = bytemuck::cast(*hi.as_array()); let mut v = f32x16::default(); - let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v); + let data: &mut [u8; 64] = bytemuck::cast_mut(v.as_mut_array()); data[0..32].copy_from_slice(&d0); data[32..64].copy_from_slice(&d1); v } - -#[inline(always)] -fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 { - // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it. - f * m + a -} diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index ee2b252..d7c9318 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -45,6 +45,7 @@ and should be optimized out in the future. */ use alloc::vec::Vec; +use core::simd::u32x8; use arrayvec::ArrayVec; @@ -57,7 +58,6 @@ pub use blitter::RasterPipelineBlitter; use crate::geom::ScreenIntRect; use crate::pixmap::SubPixmapMut; -use crate::wide::u32x8; mod blitter; #[rustfmt::skip] mod highp; @@ -137,7 +137,7 @@ pub const STAGES_COUNT: usize = Stage::ApplyVectorMask as usize + 1; impl<'a> PixmapRef<'a> { #[inline(always)] pub(crate) fn gather(&self, index: u32x8) -> [PremultipliedColorU8; highp::STAGE_WIDTH] { - let index: [u32; 8] = bytemuck::cast(index); + let index: &[u32; 8] = index.as_array(); let pixels = self.pixels(); [ pixels[index[0] as usize], diff --git a/src/scan/hairline.rs b/src/scan/hairline.rs index 150078b..6abd3d2 100644 --- a/src/scan/hairline.rs +++ b/src/scan/hairline.rs @@ -5,8 +5,9 @@ // found in the LICENSE file. 
use core::convert::TryInto; +use core::simd::{f32x2, SimdFloat}; -use tiny_skia_path::{f32x2, PathVerb, SaturateCast, Scalar}; +use tiny_skia_path::{PathVerb, SaturateCast, Scalar}; use crate::{IntRect, LineCap, Path, PathSegment, Point, Rect}; @@ -27,6 +28,48 @@ pub type LineProc = fn(&[Point], Option<&ScreenIntRect>, &mut dyn Blitter); const MAX_CUBIC_SUBDIVIDE_LEVEL: u8 = 9; const MAX_QUAD_SUBDIVIDE_LEVEL: u8 = 5; +trait F32x2Ext { + fn x(self) -> f32; + fn y(self) -> f32; + fn max_component(self) -> f32; +} + +impl F32x2Ext for f32x2 { + fn x(self) -> f32 { + self.as_array()[0] + } + + fn y(self) -> f32 { + self.as_array()[1] + } + + fn max_component(self) -> f32 { + let a = self.x(); + let b = self.y(); + // This is faster than `f32::max`. Unlike std one, we do not care about NaN. + if a < b { + b + } else { + a + } + } +} + +trait PointExt { + fn from_f32x2(r: f32x2) -> Self; + fn to_f32x2(&self) -> f32x2; +} + +impl PointExt for Point { + fn from_f32x2(r: f32x2) -> Self { + Point::from_xy(r.as_array()[0], r.as_array()[1]) + } + + fn to_f32x2(&self) -> f32x2 { + f32x2::from_array([self.x, self.y]) + } +} + pub fn stroke_path( path: &Path, line_cap: LineCap, @@ -429,8 +472,8 @@ fn compute_nocheck_quad_bounds(points: &[Point; 3]) -> Option { let mut max = min; for i in 1..3 { let pair = points[i].to_f32x2(); - min = min.min(pair); - max = max.max(pair); + min = min.simd_min(pair); + max = max.simd_max(pair); } Rect::from_ltrb(min.x(), min.y(), max.x(), max.y()) @@ -564,8 +607,8 @@ fn compute_nocheck_cubic_bounds(points: &[Point; 4]) -> Option { let mut max = min; for i in 1..4 { let pair = points[i].to_f32x2(); - min = min.min(pair); - max = max.max(pair); + min = min.simd_min(pair); + max = max.simd_max(pair); } Rect::from_ltrb(min.x(), min.y(), max.x(), max.y()) @@ -631,7 +674,7 @@ fn compute_cubic_segments(points: &[Point; 4]) -> usize { let p13 = one_third * p3 + two_third * p0; let p23 = one_third * p0 + two_third * p3; - let diff = (p1 - p13).abs().max((p2 - p23).abs()).max_component(); + let diff = (p1 - p13).abs().simd_max((p2 - p23).abs()).max_component(); let mut tol = 1.0 / 8.0; for i in 0..MAX_CUBIC_SUBDIVIDE_LEVEL { diff --git a/src/shaders/linear_gradient.rs b/src/shaders/linear_gradient.rs index 8cf1b41..81b47c3 100644 --- a/src/shaders/linear_gradient.rs +++ b/src/shaders/linear_gradient.rs @@ -110,10 +110,10 @@ fn points_to_unit_ts(start: Point, end: Point) -> Option { } fn average_gradient_color(points: &[GradientStop]) -> Color { - use crate::wide::f32x4; + use core::simd::f32x4; fn load_color(c: Color) -> f32x4 { - f32x4::from([c.red(), c.green(), c.blue(), c.alpha()]) + f32x4::from_array([c.red(), c.green(), c.blue(), c.alpha()]) } fn store_color(c: f32x4) -> Color { diff --git a/src/shaders/radial_gradient.rs b/src/shaders/radial_gradient.rs index 3c7f441..ce1104d 100644 --- a/src/shaders/radial_gradient.rs +++ b/src/shaders/radial_gradient.rs @@ -13,7 +13,6 @@ use crate::{GradientStop, Point, Shader, SpreadMode, Transform}; use super::gradient::{Gradient, DEGENERATE_THRESHOLD}; use crate::pipeline; use crate::pipeline::RasterPipelineBuilder; -use crate::wide::u32x8; #[cfg(all(not(feature = "std"), feature = "no-std-float"))] use tiny_skia_path::NoStdFloat; @@ -142,7 +141,7 @@ impl RadialGradient { }; p.ctx.two_point_conical_gradient = pipeline::TwoPointConicalGradientCtx { - mask: u32x8::default(), + mask: core::simd::u32x8::default(), p0, }; diff --git a/src/wide/f32x16_t.rs b/src/wide/f32x16_t.rs deleted file mode 100644 index 3cd76a1..0000000 --- 
a/src/wide/f32x16_t.rs +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -use super::{f32x8, u16x16}; - -#[derive(Copy, Clone, Debug)] -#[repr(C, align(32))] -pub struct f32x16(pub f32x8, pub f32x8); - -unsafe impl bytemuck::Zeroable for f32x16 {} -unsafe impl bytemuck::Pod for f32x16 {} - -impl Default for f32x16 { - fn default() -> Self { - Self::splat(0.0) - } -} - -impl f32x16 { - pub fn splat(n: f32) -> Self { - Self(f32x8::splat(n), f32x8::splat(n)) - } - - #[inline] - pub fn abs(&self) -> Self { - // Yes, Skia does it in the same way. - let abs = |x| bytemuck::cast::(bytemuck::cast::(x) & 0x7fffffff); - - let n0: [f32; 8] = self.0.into(); - let n1: [f32; 8] = self.1.into(); - Self( - f32x8::from([ - abs(n0[0]), - abs(n0[1]), - abs(n0[2]), - abs(n0[3]), - abs(n0[4]), - abs(n0[5]), - abs(n0[6]), - abs(n0[7]), - ]), - f32x8::from([ - abs(n1[0]), - abs(n1[1]), - abs(n1[2]), - abs(n1[3]), - abs(n1[4]), - abs(n1[5]), - abs(n1[6]), - abs(n1[7]), - ]), - ) - } - - pub fn cmp_gt(self, rhs: &Self) -> Self { - Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1)) - } - - pub fn blend(self, t: Self, f: Self) -> Self { - Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1)) - } - - pub fn normalize(&self) -> Self { - Self(self.0.normalize(), self.1.normalize()) - } - - pub fn floor(&self) -> Self { - // Yes, Skia does it in the same way. - let roundtrip = self.round(); - roundtrip - - roundtrip - .cmp_gt(self) - .blend(f32x16::splat(1.0), f32x16::splat(0.0)) - } - - pub fn sqrt(&self) -> Self { - Self(self.0.sqrt(), self.1.sqrt()) - } - - pub fn round(&self) -> Self { - Self(self.0.round(), self.1.round()) - } - - // This method is too heavy and shouldn't be inlined. - pub fn save_to_u16x16(&self, dst: &mut u16x16) { - // Do not use to_i32x8, because it involves rounding, - // and Skia cast's without it. - - let n0: [f32; 8] = self.0.into(); - let n1: [f32; 8] = self.1.into(); - - dst.0[0] = n0[0] as u16; - dst.0[1] = n0[1] as u16; - dst.0[2] = n0[2] as u16; - dst.0[3] = n0[3] as u16; - - dst.0[4] = n0[4] as u16; - dst.0[5] = n0[5] as u16; - dst.0[6] = n0[6] as u16; - dst.0[7] = n0[7] as u16; - - dst.0[8] = n1[0] as u16; - dst.0[9] = n1[1] as u16; - dst.0[10] = n1[2] as u16; - dst.0[11] = n1[3] as u16; - - dst.0[12] = n1[4] as u16; - dst.0[13] = n1[5] as u16; - dst.0[14] = n1[6] as u16; - dst.0[15] = n1[7] as u16; - } -} - -impl core::ops::Add for f32x16 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self(self.0 + rhs.0, self.1 + rhs.1) - } -} - -impl core::ops::Sub for f32x16 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self(self.0 - rhs.0, self.1 - rhs.1) - } -} - -impl core::ops::Mul for f32x16 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - Self(self.0 * rhs.0, self.1 * rhs.1) - } -} diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs deleted file mode 100644 index 21d5140..0000000 --- a/src/wide/f32x4_t.rs +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -#[cfg(all(not(feature = "std"), feature = "no-std-float"))] -use tiny_skia_path::NoStdFloat; - -use super::i32x4; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct f32x4(__m128); - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - use core::arch::wasm32::*; - - // repr(transparent) allows for directly passing the v128 on the WASM stack. - #[derive(Clone, Copy, Debug)] - #[repr(transparent)] - pub struct f32x4(v128); - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - use core::arch::aarch64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct f32x4(float32x4_t); - } else { - use super::FasterMinMax; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct f32x4([f32; 4]); - } -} - -unsafe impl bytemuck::Zeroable for f32x4 {} -unsafe impl bytemuck::Pod for f32x4 {} - -impl Default for f32x4 { - fn default() -> Self { - Self::splat(0.0) - } -} - -impl f32x4 { - pub fn splat(n: f32) -> Self { - Self::from([n, n, n, n]) - } - - pub fn floor(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_floor(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vrndmq_f32(self.0) }) - } else { - let roundtrip: f32x4 = cast(self.trunc_int().to_f32x4()); - roundtrip - roundtrip.cmp_gt(self).blend(f32x4::splat(1.0), f32x4::default()) - } - } - } - - pub fn abs(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_abs(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vabsq_f32(self.0) }) - } else { - let non_sign_bits = f32x4::splat(f32::from_bits(i32::MAX as u32)); - self & non_sign_bits - } - } - } - - pub fn max(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_max_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_pmax(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmaxq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].faster_max(rhs.0[0]), - self.0[1].faster_max(rhs.0[1]), - self.0[2].faster_max(rhs.0[2]), - self.0[3].faster_max(rhs.0[3]), - ]) - } - } - } - - pub fn min(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_min_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_pmin(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vminq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].faster_min(rhs.0[0]), - self.0[1].faster_min(rhs.0[1]), - self.0[2].faster_min(rhs.0[2]), - self.0[3].faster_min(rhs.0[3]), - ]) - } - } - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpeq_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_eq(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vceqq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] == rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] == rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] == rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] == rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_ne(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpneq_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_ne(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vmvnq_u32(vceqq_f32(self.0, rhs.0)) })) - } else { - Self([ - if self.0[0] != rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] != rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] != rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] != rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_ge(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpge_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_ge(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcgeq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] >= rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] >= rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] >= rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] >= rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpgt_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_gt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcgtq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] > rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] > rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] > rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] > rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_le(self, rhs: Self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmple_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_le(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcleq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] <= rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] <= rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] <= rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] <= rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmplt_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_lt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcltq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] < rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] < rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] < rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] < rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - #[inline] - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self(unsafe { _mm_blendv_ps(f.0, t.0, self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_bitselect(t.0, f.0, self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vbslq_u32( cast(self.0), cast(t.0), cast(f.0))) }) - } else { - super::generic_bit_blend(self, t, f) - } - } - } - - pub fn round(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self( - unsafe { _mm_round_ps(self.0, _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT) }, - ) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_nearest(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vrndnq_f32(self.0) }) - } else { - use super::u32x4; - - let to_int = f32x4::splat(1.0 / f32::EPSILON); - let u: u32x4 = cast(self); - let e: i32x4 = cast(u.shr::<23>() & u32x4::splat(0xff)); - let mut y: f32x4; - - let no_op_magic = i32x4::splat(0x7f + 23); - let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic)); - let no_op_val: f32x4 = self; - - let zero_magic = i32x4::splat(0x7f - 1); - let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic)); - let zero_val: f32x4 = self * f32x4::splat(0.0); - - let neg_bit: f32x4 = cast(cast::(u).cmp_lt(i32x4::default())); - let x: f32x4 = neg_bit.blend(-self, self); - y = x + to_int - to_int - x; - y = y.cmp_gt(f32x4::splat(0.5)).blend( - y + x - f32x4::splat(-1.0), - y.cmp_lt(f32x4::splat(-0.5)).blend(y + x + f32x4::splat(1.0), y + x), - ); - y = neg_bit.blend(-y, y); - - no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y)) - } - } - } - - pub fn round_int(self) -> i32x4 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - i32x4(unsafe { _mm_cvtps_epi32(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - i32x4(i32x4_trunc_sat_f32x4(self.round().0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - i32x4(unsafe { vcvtnq_s32_f32(self.0) } ) - } else { - let rounded: [f32; 4] = cast(self.round()); - cast([ - rounded[0] as i32, - rounded[1] as i32, - rounded[2] as i32, - rounded[3] as i32, - ]) - } - } - } - - pub fn trunc_int(self) -> i32x4 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - i32x4(unsafe { _mm_cvttps_epi32(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - i32x4(i32x4_trunc_sat_f32x4(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - i32x4(unsafe { vcvtq_s32_f32(self.0) }) - } else { - cast([ - self.0[0] as i32, - self.0[1] as i32, - self.0[2] as i32, - self.0[3] as i32, - ]) - } - } - } - - pub fn recip_fast(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_rcp_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_div(f32x4_splat(1.0), self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - unsafe { - let a = vrecpeq_f32(self.0); - let a = vmulq_f32(vrecpsq_f32(self.0, a), a); - Self(a) - } - } else { - Self::from([ - 1.0 / self.0[0], - 1.0 / self.0[1], - 1.0 / self.0[2], - 1.0 / self.0[3], - ]) - } - } - } - - pub fn recip_sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_rsqrt_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.0))) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - unsafe { - let a = vrsqrteq_f32(self.0); - let a = vmulq_f32(vrsqrtsq_f32(self.0, vmulq_f32(a, a)), a); - Self(a) - } - } else { - Self::from([ - 1.0 / self.0[0].sqrt(), - 1.0 / self.0[1].sqrt(), - 1.0 / self.0[2].sqrt(), - 1.0 / self.0[3].sqrt(), - ]) - } - } - } - - pub fn sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_sqrt_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_sqrt(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vsqrtq_f32(self.0) }) - } else { - Self::from([ - self.0[0].sqrt(), - self.0[1].sqrt(), - self.0[2].sqrt(), - self.0[3].sqrt(), - ]) - } - } - } -} - -impl From<[f32; 4]> for f32x4 { - fn from(v: [f32; 4]) -> Self { - cast(v) - } -} - -impl From for [f32; 4] { - fn from(v: f32x4) -> Self { - cast(v) - } -} - -impl core::ops::Add for f32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_add_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_add(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vaddq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] + rhs.0[0], - self.0[1] + rhs.0[1], - self.0[2] + rhs.0[2], - self.0[3] + rhs.0[3], - ]) - } - } - } -} - -impl core::ops::AddAssign for f32x4 { - fn add_assign(&mut self, rhs: f32x4) { - *self = *self + rhs; - } -} - -impl core::ops::Sub for f32x4 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_sub_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_sub(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vsubq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] - rhs.0[0], - self.0[1] - rhs.0[1], - self.0[2] - rhs.0[2], - self.0[3] - rhs.0[3], - ]) - } - } - } -} - -impl core::ops::Mul for f32x4 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_mul_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_mul(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmulq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] * rhs.0[0], - self.0[1] * rhs.0[1], - self.0[2] * rhs.0[2], - self.0[3] * rhs.0[3], - ]) - } - } - } -} - -impl core::ops::MulAssign for f32x4 { - fn mul_assign(&mut self, rhs: f32x4) { - *self = *self * rhs; - } -} - -impl core::ops::Div for f32x4 { - type Output = Self; - - fn div(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_div_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_div(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vdivq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] / rhs.0[0], - self.0[1] / rhs.0[1], - self.0[2] / rhs.0[2], - self.0[3] / rhs.0[3], - ]) - } - } - } -} - -impl core::ops::BitAnd for f32x4 { - type Output = Self; - - #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_and_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_and(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vandq_u32(cast(self.0), cast(rhs.0)) })) - } else { - Self([ - f32::from_bits(self.0[0].to_bits() & rhs.0[0].to_bits()), - f32::from_bits(self.0[1].to_bits() & rhs.0[1].to_bits()), - f32::from_bits(self.0[2].to_bits() & rhs.0[2].to_bits()), - f32::from_bits(self.0[3].to_bits() & rhs.0[3].to_bits()), - ]) - } - } - } -} - -impl core::ops::BitOr for f32x4 { - type Output = Self; - - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_or_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_or(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vorrq_u32(cast(self.0), cast(rhs.0)) })) - } else { - Self([ - f32::from_bits(self.0[0].to_bits() | rhs.0[0].to_bits()), - f32::from_bits(self.0[1].to_bits() | rhs.0[1].to_bits()), - f32::from_bits(self.0[2].to_bits() | rhs.0[2].to_bits()), - f32::from_bits(self.0[3].to_bits() | rhs.0[3].to_bits()), - ]) - } - } - } -} - -impl core::ops::BitXor for f32x4 { - type Output = Self; - - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_xor_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_xor(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { veorq_u32(cast(self.0), cast(rhs.0)) })) - } else { - Self([ - f32::from_bits(self.0[0].to_bits() ^ rhs.0[0].to_bits()), - f32::from_bits(self.0[1].to_bits() ^ rhs.0[1].to_bits()), - f32::from_bits(self.0[2].to_bits() ^ rhs.0[2].to_bits()), - f32::from_bits(self.0[3].to_bits() ^ rhs.0[3].to_bits()), - ]) - } - } - } -} - -impl core::ops::Neg for f32x4 { - type Output = Self; - - fn neg(self) -> Self { - Self::default() - self - } -} - -impl core::ops::Not for f32x4 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - unsafe { - let all_bits = _mm_set1_ps(f32::from_bits(u32::MAX)); - Self(_mm_xor_ps(self.0, all_bits)) - } - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_not(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vmvnq_u32(cast(self.0)) })) - } else { - self ^ Self::splat(cast(u32::MAX)) - } - } - } -} - -impl core::cmp::PartialEq for f32x4 { - fn eq(&self, rhs: &Self) -> bool { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - unsafe { _mm_movemask_ps(_mm_cmpeq_ps(self.0, rhs.0)) == 0b1111 } - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - unsafe { vminvq_u32(vceqq_f32(self.0, rhs.0)) != 0 } - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - u32x4_all_true(f32x4_eq(self.0, rhs.0)) - } else { - self.0 == rhs.0 - } - } - } -} diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs deleted file mode 100644 index 6039334..0000000 --- a/src/wide/f32x8_t.rs +++ /dev/null @@ -1,403 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -use super::{i32x8, u32x8}; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct f32x8(__m256); - } else { - use super::f32x4; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct f32x8(pub f32x4, pub f32x4); - } -} - -unsafe impl bytemuck::Zeroable for f32x8 {} -unsafe impl bytemuck::Pod for f32x8 {} - -impl Default for f32x8 { - fn default() -> Self { - Self::splat(0.0) - } -} - -impl f32x8 { - pub fn splat(n: f32) -> Self { - cast([n, n, n, n, n, n, n, n]) - } - - pub fn floor(self) -> Self { - let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8()); - roundtrip - - roundtrip - .cmp_gt(self) - .blend(f32x8::splat(1.0), f32x8::default()) - } - - pub fn fract(self) -> Self { - self - self.floor() - } - - pub fn normalize(self) -> Self { - self.max(f32x8::default()).min(f32x8::splat(1.0)) - } - - pub fn to_i32x8_bitcast(self) -> i32x8 { - bytemuck::cast(self) - } - - pub fn to_u32x8_bitcast(self) -> u32x8 { - bytemuck::cast(self) - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_EQ_OQ) }) - } else { - Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1)) - } - } - } - - pub fn cmp_ne(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_NEQ_OQ) }) - } else { - Self(self.0.cmp_ne(rhs.0), self.1.cmp_ne(rhs.1)) - } - } - } - - pub fn cmp_ge(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_GE_OQ) }) - } else { - Self(self.0.cmp_ge(rhs.0), self.1.cmp_ge(rhs.1)) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_GT_OQ) }) - } else { - Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1)) - } - } - } - - pub fn cmp_le(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_LE_OQ) }) - } else { - Self(self.0.cmp_le(rhs.0), self.1.cmp_le(rhs.1)) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_LT_OQ) }) - } else { - Self(self.0.cmp_lt(rhs.0), self.1.cmp_lt(rhs.1)) - } - } - } - - #[inline] - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_blendv_ps(f.0, t.0, self.0) }) - } else { - Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1)) - } - } - } - - pub fn abs(self) -> Self { - let non_sign_bits = f32x8::splat(f32::from_bits(i32::MAX as u32)); - self & non_sign_bits - } - - pub fn max(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_max_ps(self.0, rhs.0) }) - } else { - Self(self.0.max(rhs.0), self.1.max(rhs.1)) - } - } - } - - pub fn min(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_min_ps(self.0, rhs.0) }) - } else { - Self(self.0.min(rhs.0), self.1.min(rhs.1)) - } - } - } - - pub fn is_finite(self) -> Self { - let shifted_exp_mask = u32x8::splat(0xFF000000); - let u: u32x8 = cast(self); - let shift_u = u.shl::<1>(); - let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask); - cast(out) - } - - pub fn round(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_round_ps(self.0, _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT) }) - } else { - Self(self.0.round(), self.1.round()) - } - } - } - - pub fn round_int(self) -> i32x8 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - cast(unsafe { _mm256_cvtps_epi32(self.0) }) - } else { - i32x8(self.0.round_int(), self.1.round_int()) - } - } - } - - pub fn trunc_int(self) -> i32x8 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - cast(unsafe { _mm256_cvttps_epi32(self.0) }) - } else { - i32x8(self.0.trunc_int(), self.1.trunc_int()) - } - } - } - - pub fn recip_fast(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_rcp_ps(self.0) }) - } else { - Self(self.0.recip_fast(), self.1.recip_fast()) - } - } - } - - pub fn recip_sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_rsqrt_ps(self.0) }) - } else { - Self(self.0.recip_sqrt(), self.1.recip_sqrt()) - } - } - } - - pub fn sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_sqrt_ps(self.0) }) - } else { - Self(self.0.sqrt(), self.1.sqrt()) - } - } - } -} - -impl From<[f32; 8]> for f32x8 { - fn from(v: [f32; 8]) -> Self { - cast(v) - } -} - -impl From for [f32; 8] { - fn from(v: f32x8) -> Self { - cast(v) - } -} - -impl core::ops::Add for f32x8 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_add_ps(self.0, rhs.0) }) - } else { - Self(self.0 + rhs.0, self.1 + rhs.1) - } - } - } -} - -impl core::ops::AddAssign for f32x8 { - fn add_assign(&mut self, rhs: f32x8) { - *self = *self + rhs; - } -} - -impl core::ops::Sub for f32x8 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_sub_ps(self.0, rhs.0) }) - } else { - Self(self.0 - rhs.0, self.1 - rhs.1) - } - } - } -} - -impl core::ops::Mul for f32x8 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_mul_ps(self.0, rhs.0) }) - } else { - Self(self.0 * rhs.0, self.1 * rhs.1) - } - } - } -} - -impl core::ops::MulAssign for f32x8 { - fn mul_assign(&mut self, rhs: f32x8) { - *self = *self * rhs; - } -} - -impl core::ops::Div for f32x8 { - type Output = Self; - - fn div(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_div_ps(self.0, rhs.0) }) - } else { - Self(self.0 / rhs.0, self.1 / rhs.1) - } - } - } -} - -impl core::ops::BitAnd for f32x8 { - type Output = Self; - - #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_and_ps(self.0, rhs.0) }) - } else { - Self(self.0 & rhs.0, self.1 & rhs.1) - } - } - } -} - -impl core::ops::BitOr for f32x8 { - type Output = Self; - - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_or_ps(self.0, rhs.0) }) - } else { - Self(self.0 | rhs.0, self.1 | rhs.1) - } - } - } -} - -impl core::ops::BitXor for f32x8 { - type Output = Self; - - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_xor_ps(self.0, rhs.0) }) - } else { - Self(self.0 ^ rhs.0, self.1 ^ rhs.1) - } - } - } -} - -impl core::ops::Neg for f32x8 { - type Output = Self; - - fn neg(self) -> Self { - Self::default() - self - } -} - -impl core::ops::Not for f32x8 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - let all_bits = unsafe { _mm256_set1_ps(f32::from_bits(u32::MAX)) }; - Self(unsafe { _mm256_xor_ps(self.0, all_bits) }) - } else { - Self(!self.0, !self.1) - } - } - } -} - -impl core::cmp::PartialEq for f32x8 { - fn eq(&self, rhs: &Self) -> bool { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - let mask = unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_EQ_OQ) }; - unsafe { _mm256_movemask_ps(mask) == 0b1111_1111 } - } else { - self.0 == rhs.0 && self.1 == rhs.1 - } - } - } -} diff --git a/src/wide/i32x4_t.rs b/src/wide/i32x4_t.rs deleted file mode 100644 index fb77a0f..0000000 --- a/src/wide/i32x4_t.rs +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -use super::f32x4; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4(pub __m128i); - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - use core::arch::wasm32::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4(pub v128); - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - use core::arch::aarch64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4(pub int32x4_t); - } else { - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4([i32; 4]); - } -} - -unsafe impl bytemuck::Zeroable for i32x4 {} -unsafe impl bytemuck::Pod for i32x4 {} - -impl Default for i32x4 { - fn default() -> Self { - Self::splat(0) - } -} - -impl i32x4 { - pub fn splat(n: i32) -> Self { - cast([n, n, n, n]) - } - - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self(unsafe { _mm_blendv_epi8(f.0, t.0, self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_bitselect(t.0, f.0, self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vbslq_s32(cast(self.0), t.0, f.0) }) - } else { - super::generic_bit_blend(self, t, f) - } - } - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cmpeq_epi32(self.0, rhs.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_eq(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vceqq_s32(self.0, rhs.0)) }) - } else { - Self([ - if self.0[0] == rhs.0[0] { -1 } else { 0 }, - if self.0[1] == rhs.0[1] { -1 } else { 0 }, - if self.0[2] == rhs.0[2] { -1 } else { 0 }, - if self.0[3] == rhs.0[3] { -1 } else { 0 }, - ]) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cmpgt_epi32(self.0, rhs.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_gt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vcgtq_s32(self.0, rhs.0)) }) - } else { - Self([ - if self.0[0] > rhs.0[0] { -1 } else { 0 }, - if self.0[1] > rhs.0[1] { -1 } else { 0 }, - if self.0[2] > rhs.0[2] { -1 } else { 0 }, - if self.0[3] > rhs.0[3] { -1 } else { 0 }, - ]) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cmplt_epi32(self.0, rhs.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_lt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vcltq_s32(self.0, rhs.0)) }) - } else { - Self([ - if self.0[0] < rhs.0[0] { -1 } else { 0 }, - if self.0[1] < rhs.0[1] { -1 } else { 0 }, - if self.0[2] < rhs.0[2] { -1 } else { 0 }, - if self.0[3] < rhs.0[3] { -1 } else { 0 }, - ]) - } - } - } - - pub fn to_f32x4(self) -> f32x4 { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cvtepi32_ps(self.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - cast(Self(f32x4_convert_i32x4(self.0))) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - cast(Self(unsafe { cast(vcvtq_f32_s32(self.0)) })) - } else { - let arr: [i32; 4] = cast(self); - cast([ - arr[0] as f32, - arr[1] as f32, - arr[2] as f32, - arr[3] as f32, - ]) - } - } - } - - pub fn to_f32x4_bitcast(self) -> f32x4 { - bytemuck::cast(self) - } -} - -impl From<[i32; 4]> for i32x4 { - fn from(v: [i32; 4]) -> Self { - cast(v) - } -} - -impl From for [i32; 4] { - fn from(v: i32x4) -> Self { - cast(v) - } -} - -impl core::ops::Add for i32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_add_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_add(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vaddq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].wrapping_add(rhs.0[0]), - self.0[1].wrapping_add(rhs.0[1]), - self.0[2].wrapping_add(rhs.0[2]), - self.0[3].wrapping_add(rhs.0[3]), - ]) - } - } - } -} - -impl core::ops::BitAnd for i32x4 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_and_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_and(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vandq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] & rhs.0[0], - self.0[1] & rhs.0[1], - self.0[2] & rhs.0[2], - self.0[3] & rhs.0[3], - ]) - } - } - } -} - -impl core::ops::Mul for i32x4 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self(unsafe { _mm_mullo_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_mul(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmulq_s32(self.0, rhs.0) }) - } else { - // Cast is required, since we have to use scalar multiplication on SSE2. - let a: [i32; 4] = cast(self); - let b: [i32; 4] = cast(rhs); - Self(cast([ - a[0].wrapping_mul(b[0]), - a[1].wrapping_mul(b[1]), - a[2].wrapping_mul(b[2]), - a[3].wrapping_mul(b[3]), - ])) - } - } - } -} - -impl core::ops::BitOr for i32x4 { - type Output = Self; - - #[inline] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_or_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_or(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vorrq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] | rhs.0[0], - self.0[1] | rhs.0[1], - self.0[2] | rhs.0[2], - self.0[3] | rhs.0[3], - ]) - } - } - } -} - -impl core::ops::BitXor for i32x4 { - type Output = Self; - - #[inline] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_xor_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_xor(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { veorq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] ^ rhs.0[0], - self.0[1] ^ rhs.0[1], - self.0[2] ^ rhs.0[2], - self.0[3] ^ rhs.0[3], - ]) - } - } - } -} diff --git a/src/wide/i32x8_t.rs b/src/wide/i32x8_t.rs deleted file mode 100644 index 52f9729..0000000 --- a/src/wide/i32x8_t.rs +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -use super::{f32x8, u32x8}; - -cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct i32x8(__m256i); - } else { - use super::i32x4; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct i32x8(pub i32x4, pub i32x4); - } -} - -unsafe impl bytemuck::Zeroable for i32x8 {} -unsafe impl bytemuck::Pod for i32x8 {} - -impl Default for i32x8 { - fn default() -> Self { - Self::splat(0) - } -} - -impl i32x8 { - pub fn splat(n: i32) -> Self { - cast([n, n, n, n, n, n, n, n]) - } - - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_blendv_epi8(f.0, t.0, self.0) }) - } else { - Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1)) - } - } - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_cmpeq_epi32(self.0, rhs.0) }) - } else { - Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1)) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_cmpgt_epi32(self.0, rhs.0) }) - } else { - Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1)) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - // There is no `_mm256_cmpLT_epi32`, therefore we have to use - // `_mm256_cmpGT_epi32` and then invert the result. - let v = unsafe { _mm256_cmpgt_epi32(self.0, rhs.0) }; - let all_bits = unsafe { _mm256_set1_epi16(-1) }; - Self(unsafe { _mm256_xor_si256(v, all_bits) }) - } else { - Self(self.0.cmp_lt(rhs.0), self.1.cmp_lt(rhs.1)) - } - } - } - - pub fn to_f32x8(self) -> f32x8 { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - cast(unsafe { _mm256_cvtepi32_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "avx"))] { - cast([self.0.to_f32x4(), self.1.to_f32x4()]) - } else { - f32x8(self.0.to_f32x4(), self.1.to_f32x4()) - } - } - } - - pub fn to_u32x8_bitcast(self) -> u32x8 { - bytemuck::cast(self) - } - - pub fn to_f32x8_bitcast(self) -> f32x8 { - bytemuck::cast(self) - } -} - -impl From<[i32; 8]> for i32x8 { - fn from(v: [i32; 8]) -> Self { - cast(v) - } -} - -impl From for [i32; 8] { - fn from(v: i32x8) -> Self { - cast(v) - } -} - -impl core::ops::Add for i32x8 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_add_epi32(self.0, rhs.0) }) - } else { - Self(self.0 + rhs.0, self.1 + rhs.1) - } - } - } -} - -impl core::ops::BitAnd for i32x8 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_and_si256(self.0, rhs.0) }) - } else { - Self(self.0 & rhs.0, self.1 & rhs.1) - } - } - } -} - -impl core::ops::Mul for i32x8 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_mullo_epi32(self.0, rhs.0) }) - } else { - Self(self.0 * rhs.0, self.1 * rhs.1) - } - } - } -} - -impl core::ops::BitOr for i32x8 { - type Output = Self; - - #[inline] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_or_si256(self.0, rhs.0) }) - } else { - Self(self.0 | rhs.0, self.1 | rhs.1) - } - } - } -} - -impl core::ops::BitXor for i32x8 { - type Output = Self; - - #[inline] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_xor_si256(self.0, rhs.0) }) - } else { - Self(self.0 ^ rhs.0, self.1 ^ rhs.1) - } - } - } -} diff --git a/src/wide/mod.rs b/src/wide/mod.rs deleted file mode 100644 index c58c239..0000000 --- a/src/wide/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// This module was written from scratch, therefore there is no Google copyright. - -// f32x16, i32x16 and u32x16 are implemented as [Tx8; 2] and not as [T; 16]. -// This way we still can use some SIMD. -// -// We doesn't use #[inline] that much in this module. -// The compiler will inline most of the methods automatically. -// The only exception is U16x16, were we have to force inlining, -// otherwise the performance will be horrible. - -#![allow(non_camel_case_types)] - -mod f32x16_t; -mod f32x4_t; -mod f32x8_t; -mod i32x4_t; -mod i32x8_t; -mod u16x16_t; -mod u32x4_t; -mod u32x8_t; - -pub use f32x16_t::f32x16; -pub use f32x4_t::f32x4; -pub use f32x8_t::f32x8; -pub use i32x4_t::i32x4; -pub use i32x8_t::i32x8; -pub use tiny_skia_path::f32x2; -pub use u16x16_t::u16x16; -pub use u32x4_t::u32x4; -pub use u32x8_t::u32x8; - -#[allow(dead_code)] -#[inline] -pub fn generic_bit_blend(mask: T, y: T, n: T) -> T -where - T: Copy + core::ops::BitXor + core::ops::BitAnd, -{ - n ^ ((n ^ y) & mask) -} - -/// A faster and more forgiving f32 min/max implementation. 
-/// -/// Unlike std one, we do not care about NaN. -#[allow(dead_code)] -pub trait FasterMinMax { - fn faster_min(self, rhs: f32) -> f32; - fn faster_max(self, rhs: f32) -> f32; -} - -#[allow(dead_code)] -impl FasterMinMax for f32 { - fn faster_min(self, rhs: f32) -> f32 { - if rhs < self { - rhs - } else { - self - } - } - - fn faster_max(self, rhs: f32) -> f32 { - if self < rhs { - rhs - } else { - self - } - } -} diff --git a/src/wide/u16x16_t.rs b/src/wide/u16x16_t.rs deleted file mode 100644 index 5e1a464..0000000 --- a/src/wide/u16x16_t.rs +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// No need to use explicit 256bit AVX2 SIMD. -// `-C target-cpu=native` will autovectorize it better than us. -// Not even sure why explicit instructions are so slow... -// -// On ARM AArch64 we can actually get up to 2x performance boost by using SIMD. -// -// We also have to inline all the methods. They are pretty large, -// but without the inlining the performance is plummeting. - -#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] -use bytemuck::cast; -#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] -use core::arch::aarch64::uint16x8_t; - -#[allow(non_camel_case_types)] -#[derive(Copy, Clone, PartialEq, Default, Debug)] -pub struct u16x16(pub [u16; 16]); - -macro_rules! impl_u16x16_op { - ($a:expr, $op:ident, $b:expr) => { - u16x16([ - $a.0[0].$op($b.0[0]), - $a.0[1].$op($b.0[1]), - $a.0[2].$op($b.0[2]), - $a.0[3].$op($b.0[3]), - $a.0[4].$op($b.0[4]), - $a.0[5].$op($b.0[5]), - $a.0[6].$op($b.0[6]), - $a.0[7].$op($b.0[7]), - $a.0[8].$op($b.0[8]), - $a.0[9].$op($b.0[9]), - $a.0[10].$op($b.0[10]), - $a.0[11].$op($b.0[11]), - $a.0[12].$op($b.0[12]), - $a.0[13].$op($b.0[13]), - $a.0[14].$op($b.0[14]), - $a.0[15].$op($b.0[15]), - ]) - }; -} - -#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] -macro_rules! impl_aarch64_call { - ($f:ident, $a:expr, $b:expr) => { - let a = $a.split(); - let b = $b.split(); - Self(bytemuck::cast([ - unsafe { core::arch::aarch64::$f(a.0, b.0) }, - unsafe { core::arch::aarch64::$f(a.1, b.1) }, - ])) - }; -} - -impl u16x16 { - #[inline] - pub fn splat(n: u16) -> Self { - Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n]) - } - - #[inline] - pub fn as_slice(&self) -> &[u16; 16] { - &self.0 - } - - #[inline] - pub fn min(&self, rhs: &Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vminq_u16, self, rhs) - } else { - impl_u16x16_op!(self, min, rhs) - } - } - } - - #[inline] - pub fn max(&self, rhs: &Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vmaxq_u16, self, rhs) - } else { - impl_u16x16_op!(self, max, rhs) - } - } - } - - #[inline] - pub fn cmp_le(&self, rhs: &Self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vcleq_u16, self, rhs) - } else { - Self([ - if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 }, - if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 }, - if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 }, - if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 }, - if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 }, - if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 }, - if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 }, - if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 }, - if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 }, - if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 }, - if self.0[10] <= rhs.0[10] { !0 } else { 0 }, - if self.0[11] <= rhs.0[11] { !0 } else { 0 }, - if self.0[12] <= rhs.0[12] { !0 } else { 0 }, - if self.0[13] <= rhs.0[13] { !0 } else { 0 }, - if self.0[14] <= rhs.0[14] { !0 } else { 0 }, - if self.0[15] <= rhs.0[15] { !0 } else { 0 }, - ]) - } - } - } - - #[inline] - pub fn blend(self, t: Self, e: Self) -> Self { - (t & self) | (e & !self) - } - - #[inline] - #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] - pub fn split(self) -> (uint16x8_t, uint16x8_t) { - let pair: [uint16x8_t; 2] = cast(self.0); - (pair[0], pair[1]) - } -} - -impl core::ops::Add for u16x16 { - type Output = Self; - - #[inline] - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vaddq_u16, self, rhs) - } else { - impl_u16x16_op!(self, add, rhs) - } - } - } -} - -impl core::ops::Sub for u16x16 { - type Output = Self; - - #[inline] - fn sub(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vsubq_u16, self, rhs) - } else { - impl_u16x16_op!(self, sub, rhs) - } - } - } -} - -impl core::ops::Mul for u16x16 { - type Output = Self; - - #[inline] - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vmulq_u16, self, rhs) - } else { - impl_u16x16_op!(self, mul, rhs) - } - } - } -} - -impl core::ops::Div for u16x16 { - type Output = Self; - - #[inline] - fn div(self, rhs: Self) -> Self::Output { - impl_u16x16_op!(self, div, rhs) - } -} - -impl core::ops::BitAnd for u16x16 { - type Output = Self; - - #[inline] - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vandq_u16, self, rhs) - } else { - impl_u16x16_op!(self, bitand, rhs) - } - } - } -} - -impl core::ops::BitOr for u16x16 { - type Output = Self; - - #[inline] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vorrq_u16, self, rhs) - } else { - impl_u16x16_op!(self, bitor, rhs) - } - } - } -} - -impl core::ops::Not for u16x16 { - type Output = Self; - - #[inline] - fn not(self) -> Self::Output { - u16x16([ - !self.0[0], - !self.0[1], - !self.0[2], - !self.0[3], - !self.0[4], - !self.0[5], - !self.0[6], - !self.0[7], - !self.0[8], - !self.0[9], - !self.0[10], - !self.0[11], - !self.0[12], - !self.0[13], - !self.0[14], - !self.0[15], - ]) - } -} - -impl core::ops::Shr for u16x16 { - type Output = Self; - - #[inline] - fn shr(self, rhs: Self) -> Self::Output { - impl_u16x16_op!(self, shr, rhs) - } -} diff --git a/src/wide/u32x4_t.rs b/src/wide/u32x4_t.rs deleted file mode 100644 index 27d78cb..0000000 --- a/src/wide/u32x4_t.rs +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - use bytemuck::cast; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4(__m128i); - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - use core::arch::wasm32::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4(v128); - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - use core::arch::aarch64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4(uint32x4_t); - } else { - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4([u32; 4]); - } -} - -unsafe impl bytemuck::Zeroable for u32x4 {} -unsafe impl bytemuck::Pod for u32x4 {} - -impl Default for u32x4 { - fn default() -> Self { - Self::splat(0) - } -} - -impl u32x4 { - pub fn splat(n: u32) -> Self { - bytemuck::cast([n, n, n, n]) - } - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpeq_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_eq(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vceqq_u32(self.0, rhs.0) }) - } else { - Self([ - if self.0[0] == rhs.0[0] { u32::MAX } else { 0 }, - if self.0[1] == rhs.0[1] { u32::MAX } else { 0 }, - if self.0[2] == rhs.0[2] { u32::MAX } else { 0 }, - if self.0[3] == rhs.0[3] { u32::MAX } else { 0 }, - ]) - } - } - } - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - pub fn shl(self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - let shift = cast([RHS as u64, 0]); - Self(unsafe { _mm_sll_epi32(self.0, shift) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_shl(self.0, RHS as _)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vshlq_n_u32::(self.0) }) - } else { - let u = RHS as u64; - Self([ - self.0[0] << u, - self.0[1] << u, - self.0[2] << u, - self.0[3] << u, - ]) - } - } - } - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - pub fn shr(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - let shift: __m128i = cast([RHS as u64, 0]); - Self(unsafe { _mm_srl_epi32(self.0, shift) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_shr(self.0, RHS as _)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vshrq_n_u32::(self.0) }) - } else { - let u = RHS as u64; - Self([ - self.0[0] >> u, - self.0[1] >> u, - self.0[2] >> u, - self.0[3] >> u, - ]) - } - } - } -} - -impl core::ops::Not for u32x4 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - let all_bits = unsafe { _mm_set1_epi32(-1) }; - Self(unsafe { _mm_xor_si128(self.0, all_bits) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_not(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmvnq_u32(self.0) }) - } else { - Self([ - !self.0[0], - !self.0[1], - !self.0[2], - !self.0[3], - ]) - } - } - } -} - -impl core::ops::Add for u32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_add_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_add(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vaddq_u32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].wrapping_add(rhs.0[0]), - self.0[1].wrapping_add(rhs.0[1]), - self.0[2].wrapping_add(rhs.0[2]), - self.0[3].wrapping_add(rhs.0[3]), - ]) - } - } - } -} - -impl core::ops::BitAnd for u32x4 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_and_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_and(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vandq_u32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] & rhs.0[0], - self.0[1] & rhs.0[1], - self.0[2] & rhs.0[2], - self.0[3] & rhs.0[3], - ]) - } - } - } -} diff --git a/src/wide/u32x8_t.rs b/src/wide/u32x8_t.rs deleted file mode 100644 index b3791b5..0000000 --- a/src/wide/u32x8_t.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use super::{f32x8, i32x8}; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - use bytemuck::cast; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct u32x8(__m256i); - } else { - use super::u32x4; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct u32x8(u32x4, u32x4); - } -} - -unsafe impl bytemuck::Zeroable for u32x8 {} -unsafe impl bytemuck::Pod for u32x8 {} - -impl Default for u32x8 { - fn default() -> Self { - Self::splat(0) - } -} - -impl u32x8 { - pub fn splat(n: u32) -> Self { - bytemuck::cast([n, n, n, n, n, n, n, n]) - } - - pub fn to_i32x8_bitcast(self) -> i32x8 { - bytemuck::cast(self) - } - - pub fn to_f32x8_bitcast(self) -> f32x8 { - bytemuck::cast(self) - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_cmpeq_epi32(self.0, rhs.0) }) - } else { - Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1)) - } - } - } - - pub fn shl(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - let shift: __m128i = cast([RHS as u64, 0]); - Self(unsafe { _mm256_sll_epi32(self.0, shift) }) - } else { - Self(self.0.shl::(), self.1.shl::()) - } - } - } - - pub fn shr(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - let shift: __m128i = cast([RHS as u64, 0]); - Self(unsafe { _mm256_srl_epi32(self.0, shift) }) - } else { - Self(self.0.shr::(), self.1.shr::()) - } - } - } -} - -impl core::ops::Not for u32x8 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - let all_bits = unsafe { _mm256_set1_epi16(-1) }; - Self(unsafe { _mm256_xor_si256(self.0, all_bits) }) - } else { - Self(!self.0, !self.1) - } - } - } -} - -impl core::ops::Add for u32x8 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_add_epi32(self.0, rhs.0) }) - } else { - Self(self.0 + rhs.0, self.1 + rhs.1) - } - } - } -} - -impl core::ops::BitAnd for u32x8 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{
-            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
-                Self(unsafe { _mm256_and_si256(self.0, rhs.0) })
-            } else {
-                Self(self.0 & rhs.0, self.1 & rhs.1)
-            }
-        }
-    }
-}
diff --git a/tests/images/canvas/draw-pixmap-opacity.png b/tests/images/canvas/draw-pixmap-opacity.png
index 8e01b5b..81572a3 100644
Binary files a/tests/images/canvas/draw-pixmap-opacity.png and b/tests/images/canvas/draw-pixmap-opacity.png differ
diff --git a/tests/images/gradients/three-stops-evenly-spaced-lq.png b/tests/images/gradients/three-stops-evenly-spaced-lq.png
index 6528379..2a2f546 100644
Binary files a/tests/images/gradients/three-stops-evenly-spaced-lq.png and b/tests/images/gradients/three-stops-evenly-spaced-lq.png differ
diff --git a/tests/images/gradients/two-stops-linear-pad-lq.png b/tests/images/gradients/two-stops-linear-pad-lq.png
index 41e0025..04e0659 100644
Binary files a/tests/images/gradients/two-stops-linear-pad-lq.png and b/tests/images/gradients/two-stops-linear-pad-lq.png differ
diff --git a/tests/images/gradients/two-stops-linear-reflect-lq.png b/tests/images/gradients/two-stops-linear-reflect-lq.png
index 347ebc1..c18e728 100644
Binary files a/tests/images/gradients/two-stops-linear-reflect-lq.png and b/tests/images/gradients/two-stops-linear-reflect-lq.png differ
diff --git a/tests/images/gradients/two-stops-linear-repeat-lq.png b/tests/images/gradients/two-stops-linear-repeat-lq.png
index 0f67e30..033fc59 100644
Binary files a/tests/images/gradients/two-stops-linear-repeat-lq.png and b/tests/images/gradients/two-stops-linear-repeat-lq.png differ
diff --git a/tests/images/gradients/two-stops-unevenly-spaced-lq.png b/tests/images/gradients/two-stops-unevenly-spaced-lq.png
index d1521ed..fc655f8 100644
Binary files a/tests/images/gradients/two-stops-unevenly-spaced-lq.png and b/tests/images/gradients/two-stops-unevenly-spaced-lq.png differ
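
Reviewer note on the removed src/wide module: every deleted type (f32x4, f32x8, i32x4, i32x8, u16x16, u32x4, u32x8) has the same shape. A cfg_if::cfg_if! block picks an intrinsic-backed representation (SSE/AVX, WASM simd128, or AArch64 Neon) when the matching target_feature is enabled, and otherwise falls back to a plain array with per-lane scalar code, so the public API is identical on every target. The sketch below is a minimal illustration of that dispatch pattern written for this note, not code from the crate: F32x4, add and to_array are placeholder names, only the x86-64/SSE2 branch and the scalar fallback are shown, and the crate's additional `feature = "simd"` gate is omitted. It assumes the cfg_if crate as a dependency, just like the deleted files.

// Minimal sketch (not tiny-skia code): one wide type, two backends.
use cfg_if::cfg_if;

cfg_if! {
    if #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] {
        use core::arch::x86_64::*;

        /// Four f32 lanes backed by an SSE2 register.
        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct F32x4(__m128);

        impl F32x4 {
            pub fn splat(n: f32) -> Self {
                Self(unsafe { _mm_set1_ps(n) })
            }

            pub fn add(self, rhs: Self) -> Self {
                Self(unsafe { _mm_add_ps(self.0, rhs.0) })
            }

            pub fn to_array(self) -> [f32; 4] {
                let mut out = [0.0f32; 4];
                unsafe { _mm_storeu_ps(out.as_mut_ptr(), self.0) };
                out
            }
        }
    } else {
        /// Scalar fallback: the same API over a plain array.
        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct F32x4([f32; 4]);

        impl F32x4 {
            pub fn splat(n: f32) -> Self {
                Self([n; 4])
            }

            pub fn add(self, rhs: Self) -> Self {
                Self([
                    self.0[0] + rhs.0[0],
                    self.0[1] + rhs.0[1],
                    self.0[2] + rhs.0[2],
                    self.0[3] + rhs.0[3],
                ])
            }

            pub fn to_array(self) -> [f32; 4] {
                self.0
            }
        }
    }
}

fn main() {
    // Same result on either backend.
    let v = F32x4::splat(1.5).add(F32x4::splat(2.0));
    assert_eq!(v.to_array(), [3.5; 4]);
}

The 8-lane types in the diff apply the same idea one level up: when AVX is unavailable, f32x8 is stored as a pair of f32x4, so the fallback path still benefits from whatever SIMD the narrower type provides.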
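
A second pattern runs through all of the deleted comparison and blend methods: a comparison returns a per-lane mask of all-ones (true) or all-zeros (false), hence the f32::from_bits(u32::MAX) fallbacks, and blend / generic_bit_blend then select between two inputs with pure bit arithmetic instead of branches. The toy below mirrors the generic_bit_blend helper from the removed src/wide/mod.rs on a single u32 lane; it is an illustration added for this note, not code taken from the crate.

// Toy, single-lane illustration of the mask-and-blend convention.
fn generic_bit_blend(mask: u32, t: u32, f: u32) -> u32 {
    // Picks `t` where mask bits are set, `f` elsewhere:
    // branch-free form of (t & mask) | (f & !mask).
    f ^ ((f ^ t) & mask)
}

fn cmp_gt(a: f32, b: f32) -> u32 {
    // One "lane" of a comparison result: all bits set when true, zero when false.
    if a > b { u32::MAX } else { 0 }
}

fn main() {
    assert_eq!(generic_bit_blend(cmp_gt(2.5, 1.0), 0xAAAA_AAAA, 0x5555_5555), 0xAAAA_AAAA);
    assert_eq!(generic_bit_blend(cmp_gt(0.5, 1.0), 0xAAAA_AAAA, 0x5555_5555), 0x5555_5555);
}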