diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b84bc29..3aac993 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,8 +11,7 @@ jobs: strategy: matrix: rust: - - 1.57.0 - - stable + - nightly steps: - name: Checkout uses: actions/checkout@v2 @@ -23,16 +22,13 @@ jobs: toolchain: ${{ matrix.rust }} override: true - - name: Build with minimal features (no_std) - run: cargo build --verbose --no-default-features --features no-std-float + # - name: Build with minimal features (no_std) + # run: cargo build --verbose --no-default-features --features no-std-float - name: Run tests for tiny-skia-path working-directory: path run: cargo test --verbose - - name: Run tests without SIMD - run: cargo test --verbose --no-default-features --features png-format - - name: Run tests with SSE2 env: RUSTFLAGS: -Ctarget-feature=+sse2 @@ -62,7 +58,7 @@ jobs: - name: Install toolchain uses: actions-rs/toolchain@v1 with: - toolchain: stable + toolchain: nightly override: true target: wasm32-wasi @@ -71,11 +67,8 @@ jobs: curl https://wasmtime.dev/install.sh -sSf | bash echo "$HOME/.wasmtime/bin" >> $GITHUB_PATH - - name: Build with minimal features (no_std) - run: cargo build --target wasm32-wasi --verbose --no-default-features --features no-std-float - - - name: Run tests without SIMD - run: cargo test --target wasm32-wasi --verbose --no-default-features --features png-format + # - name: Build with minimal features (no_std) + # run: cargo build --target wasm32-wasi --verbose --no-default-features --features no-std-float - name: Run tests with SIMD128 env: @@ -91,18 +84,15 @@ jobs: - name: Install toolchain uses: actions-rs/toolchain@v1 with: - toolchain: stable + toolchain: nightly override: true target: aarch64-unknown-linux-gnu - name: Install cross run: cargo install cross - - name: Build with minimal features (no_std) - run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float - - - name: Run tests without SIMD - run: cross test --target aarch64-unknown-linux-gnu --verbose --no-default-features --features png-format + # - name: Build with minimal features (no_std) + # run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float - name: Run tests with Neon run: cross test --target aarch64-unknown-linux-gnu diff --git a/Cargo.toml b/Cargo.toml index e80a97c..52de6e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,17 +24,12 @@ png = { version = "0.17", optional = true } tiny-skia-path = { version = "0.10.0", path = "path", default-features = false } [features] -default = ["std", "simd", "png-format"] +default = ["std", "png-format"] # Enables the use of the standard library. Deactivate this and activate the no-std-float # feature to compile for targets that don't have std. std = ["tiny-skia-path/std"] no-std-float = ["tiny-skia-path/no-std-float"] -# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128) -# and AArch64 (Neon). -# Has no effect on other targets. Present mainly for testing. -simd = [] - # Allows loading and saving `Pixmap` as PNG. png-format = ["std", "png"] diff --git a/path/src/f32x2_t.rs b/path/src/f32x2_t.rs deleted file mode 100644 index e471f42..0000000 --- a/path/src/f32x2_t.rs +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#[cfg(all(not(feature = "std"), feature = "no-std-float"))] -use crate::NoStdFloat; - -// Right now, there are no visible benefits of using SIMD for f32x2. So we don't. -/// A pair of f32 numbers. -/// -/// Mainly for internal use. Do not rely on it! -#[allow(non_camel_case_types)] -#[derive(Copy, Clone, Default, PartialEq, Debug)] -pub struct f32x2(pub [f32; 2]); - -impl f32x2 { - /// Creates a new pair. - pub fn new(a: f32, b: f32) -> f32x2 { - f32x2([a, b]) - } - - /// Creates a new pair from a single value. - pub fn splat(x: f32) -> f32x2 { - f32x2([x, x]) - } - - /// Returns an absolute value. - pub fn abs(self) -> f32x2 { - f32x2([self.x().abs(), self.y().abs()]) - } - - /// Returns a minimum value. - pub fn min(self, other: f32x2) -> f32x2 { - f32x2([pmin(self.x(), other.x()), pmin(self.y(), other.y())]) - } - - /// Returns a maximum value. - pub fn max(self, other: f32x2) -> f32x2 { - f32x2([pmax(self.x(), other.x()), pmax(self.y(), other.y())]) - } - - /// Returns a maximum of both values. - pub fn max_component(self) -> f32 { - pmax(self.x(), self.y()) - } - - /// Returns the first value. - pub fn x(&self) -> f32 { - self.0[0] - } - - /// Returns the second value. - pub fn y(&self) -> f32 { - self.0[1] - } -} - -impl core::ops::Add for f32x2 { - type Output = f32x2; - - fn add(self, other: f32x2) -> f32x2 { - f32x2([self.x() + other.x(), self.y() + other.y()]) - } -} - -impl core::ops::Sub for f32x2 { - type Output = f32x2; - - fn sub(self, other: f32x2) -> f32x2 { - f32x2([self.x() - other.x(), self.y() - other.y()]) - } -} - -impl core::ops::Mul for f32x2 { - type Output = f32x2; - - fn mul(self, other: f32x2) -> f32x2 { - f32x2([self.x() * other.x(), self.y() * other.y()]) - } -} - -impl core::ops::Div for f32x2 { - type Output = f32x2; - - fn div(self, other: f32x2) -> f32x2 { - f32x2([self.x() / other.x(), self.y() / other.y()]) - } -} - -// A faster and more forgiving f32 min/max implementation. -// -// Unlike std one, we do not care about NaN. - -fn pmax(a: f32, b: f32) -> f32 { - if a < b { - b - } else { - a - } -} - -fn pmin(a: f32, b: f32) -> f32 { - if b < a { - b - } else { - a - } -} diff --git a/path/src/f32x4_t.rs b/path/src/f32x4_t.rs deleted file mode 100644 index e591b55..0000000 --- a/path/src/f32x4_t.rs +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Right now, there are no visible benefits of using SIMD for f32x4. So we don't. 
-#[derive(Default, Clone, Copy, PartialEq, Debug)] -#[repr(C, align(16))] -pub struct f32x4(pub [f32; 4]); - -impl f32x4 { - pub fn max(self, rhs: Self) -> Self { - Self([ - self.0[0].max(rhs.0[0]), - self.0[1].max(rhs.0[1]), - self.0[2].max(rhs.0[2]), - self.0[3].max(rhs.0[3]), - ]) - } - - pub fn min(self, rhs: Self) -> Self { - Self([ - self.0[0].min(rhs.0[0]), - self.0[1].min(rhs.0[1]), - self.0[2].min(rhs.0[2]), - self.0[3].min(rhs.0[3]), - ]) - } -} - -impl core::ops::Add for f32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self([ - self.0[0] + rhs.0[0], - self.0[1] + rhs.0[1], - self.0[2] + rhs.0[2], - self.0[3] + rhs.0[3], - ]) - } -} - -impl core::ops::AddAssign for f32x4 { - fn add_assign(&mut self, rhs: f32x4) { - *self = *self + rhs; - } -} - -impl core::ops::Sub for f32x4 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self([ - self.0[0] - rhs.0[0], - self.0[1] - rhs.0[1], - self.0[2] - rhs.0[2], - self.0[3] - rhs.0[3], - ]) - } -} - -impl core::ops::Mul for f32x4 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - Self([ - self.0[0] * rhs.0[0], - self.0[1] * rhs.0[1], - self.0[2] * rhs.0[2], - self.0[3] * rhs.0[3], - ]) - } -} - -impl core::ops::MulAssign for f32x4 { - fn mul_assign(&mut self, rhs: f32x4) { - *self = *self * rhs; - } -} diff --git a/path/src/lib.rs b/path/src/lib.rs index e80a70c..23eb068 100644 --- a/path/src/lib.rs +++ b/path/src/lib.rs @@ -12,6 +12,7 @@ //! //! Note that all types use single precision floats (`f32`), just like [Skia](https://skia.org/). +#![feature(portable_simd)] #![no_std] #![warn(missing_docs)] #![warn(missing_copy_implementations)] @@ -36,8 +37,6 @@ extern crate std; extern crate alloc; mod dash; -mod f32x2_t; -mod f32x4_t; mod floating_point; mod path; mod path_builder; @@ -49,7 +48,6 @@ mod stroker; mod transform; pub use dash::StrokeDash; -pub use f32x2_t::f32x2; pub use floating_point::*; pub use path::*; pub use path_builder::*; @@ -86,16 +84,6 @@ impl Point { Point { x, y } } - /// Creates a new `Point` from `f32x2`. - pub fn from_f32x2(r: f32x2) -> Self { - Point::from_xy(r.x(), r.y()) - } - - /// Converts a `Point` into a `f32x2`. - pub fn to_f32x2(&self) -> f32x2 { - f32x2::new(self.x, self.y) - } - /// Creates a point at 0x0 position. pub fn zero() -> Self { Point { x: 0.0, y: 0.0 } diff --git a/path/src/path_geometry.rs b/path/src/path_geometry.rs index d4c3746..ebd101c 100644 --- a/path/src/path_geometry.rs +++ b/path/src/path_geometry.rs @@ -10,9 +10,10 @@ #![allow(missing_docs)] +use core::simd::f32x2; + use crate::{Point, Transform}; -use crate::f32x2_t::f32x2; use crate::floating_point::FLOAT_PI; use crate::scalar::{Scalar, SCALAR_NEARLY_ZERO, SCALAR_ROOT_2_OVER_2}; @@ -22,6 +23,21 @@ use crate::path_builder::PathDirection; #[cfg(all(not(feature = "std"), feature = "no-std-float"))] use crate::NoStdFloat; +trait PointExt { + fn from_f32x2(r: f32x2) -> Self; + fn to_f32x2(&self) -> f32x2; +} + +impl PointExt for Point { + fn from_f32x2(r: f32x2) -> Self { + Point::from_xy(r.as_array()[0], r.as_array()[1]) + } + + fn to_f32x2(&self) -> f32x2 { + f32x2::from_array([self.x, self.y]) + } +} + // use for : eval(t) == A * t^2 + B * t + C #[derive(Clone, Copy, Default, Debug)] pub struct QuadCoeff { diff --git a/path/src/rect.rs b/path/src/rect.rs index d199f9d..06b7f1c 100644 --- a/path/src/rect.rs +++ b/path/src/rect.rs @@ -345,7 +345,7 @@ impl Rect { /// /// Returns None if count is zero or if Point array contains an infinity or NaN. 
pub fn from_points(points: &[Point]) -> Option<Self> { - use crate::f32x4_t::f32x4; + use core::simd::{f32x4, SimdFloat}; if points.is_empty() { return None; } @@ -356,13 +356,13 @@ impl Rect { let mut max; if points.len() & 1 != 0 { let pt = points[0]; - min = f32x4([pt.x, pt.y, pt.x, pt.y]); + min = f32x4::from_array([pt.x, pt.y, pt.x, pt.y]); max = min; offset += 1; } else { let pt0 = points[0]; let pt1 = points[1]; - min = f32x4([pt0.x, pt0.y, pt1.x, pt1.y]); + min = f32x4::from_array([pt0.x, pt0.y, pt1.x, pt1.y]); max = min; offset += 2; } @@ -371,17 +371,17 @@ impl Rect { while offset != points.len() { let pt0 = points[offset + 0]; let pt1 = points[offset + 1]; - let xy = f32x4([pt0.x, pt0.y, pt1.x, pt1.y]); + let xy = f32x4::from_array([pt0.x, pt0.y, pt1.x, pt1.y]); accum *= xy; - min = min.min(xy); - max = max.max(xy); + min = min.simd_min(xy); + max = max.simd_max(xy); offset += 2; } let all_finite = accum * f32x4::default() == f32x4::default(); - let min: [f32; 4] = min.0; - let max: [f32; 4] = max.0; + let min: &[f32; 4] = min.as_array(); + let max: &[f32; 4] = max.as_array(); if all_finite { Rect::from_ltrb( min[0].min(min[2]), diff --git a/src/lib.rs b/src/lib.rs index 38de1cb..f11f8a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ and a user should manage the world transform, clipping mask and style manually. See the `examples/` directory for usage examples. */ +#![feature(portable_simd)] #![no_std] #![warn(missing_docs)] #![warn(missing_copy_implementations)] @@ -52,7 +53,6 @@ mod pipeline; mod pixmap; mod scan; mod shaders; -mod wide; mod painter; // Keep it under `pixmap` for a better order in the docs. diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs index bcb113c..66a3ecb 100644 --- a/src/pipeline/highp.rs +++ b/src/pipeline/highp.rs @@ -15,11 +15,12 @@ For some reason, we are almost 2x slower. Maybe because Skia uses clang's vector and we're using a manual implementation. */ +use std::simd::{f32x8, i32x8, u32x8, StdFloat, SimdFloat, SimdPartialOrd, SimdPartialEq}; + use crate::{PremultipliedColorU8, SpreadMode, PixmapRef}; use crate::geom::ScreenIntRect; use crate::pixmap::SubPixmapMut; -use crate::wide::{f32x8, i32x8, u32x8}; pub const STAGE_WIDTH: usize = 8; @@ -125,6 +126,16 @@ pub fn fn_ptr(f: StageFn) -> *const () { f as *const () } +trait F32x8Ext { + fn normalize(self) -> Self; +} + +impl F32x8Ext for f32x8 { + fn normalize(self) -> Self { + self.simd_max(f32x8::default()).simd_min(f32x8::splat(1.0)) + } +} + #[inline(never)] pub fn start( functions: &[StageFn], @@ -209,19 +220,19 @@ fn move_destination_to_source(p: &mut Pipeline) { } fn clamp_0(p: &mut Pipeline) { - p.r = p.r.max(f32x8::default()); - p.g = p.g.max(f32x8::default()); - p.b = p.b.max(f32x8::default()); - p.a = p.a.max(f32x8::default()); + p.r = p.r.simd_max(f32x8::default()); + p.g = p.g.simd_max(f32x8::default()); + p.b = p.b.simd_max(f32x8::default()); + p.a = p.a.simd_max(f32x8::default()); p.next_stage(); } fn clamp_a(p: &mut Pipeline) { - p.r = p.r.min(f32x8::splat(1.0)); - p.g = p.g.min(f32x8::splat(1.0)); - p.b = p.b.min(f32x8::splat(1.0)); - p.a = p.a.min(f32x8::splat(1.0)); + p.r = p.r.simd_min(f32x8::splat(1.0)); + p.g = p.g.simd_min(f32x8::splat(1.0)); + p.b = p.b.simd_min(f32x8::splat(1.0)); + p.a = p.a.simd_min(f32x8::splat(1.0)); p.next_stage(); } @@ -301,10 +312,10 @@ fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 { // Exclusive -> inclusive.
let w = ulp_sub(pixmap.width() as f32); let h = ulp_sub(pixmap.height() as f32); - x = x.max(f32x8::default()).min(f32x8::splat(w)); - y = y.max(f32x8::default()).min(f32x8::splat(h)); + x = x.simd_max(f32x8::default()).simd_min(f32x8::splat(w)); + y = y.simd_max(f32x8::default()).simd_min(f32x8::splat(h)); - (y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast() + (y.trunc().cast::<i32>() * i32x8::splat(pixmap.width() as i32) + x.trunc().cast::<i32>()).cast() } #[inline(always)] @@ -405,15 +416,15 @@ blend_fn!(source_in, |s, _, _, da| s * da); blend_fn!(destination_in, |_, d, sa, _| d * sa); blend_fn!(source_out, |s, _, _, da| s * inv(da)); blend_fn!(destination_out, |_, d, sa, _| d * inv(sa)); -blend_fn!(source_over, |s, d, sa, _| mad(d, inv(sa), s)); -blend_fn!(destination_over, |s, d, _, da| mad(s, inv(da), d)); +blend_fn!(source_over, |s, d: f32x8, sa, _| d.mul_add(inv(sa), s)); +blend_fn!(destination_over, |s: f32x8, d, _, da| s.mul_add(inv(da), d)); blend_fn!(modulate, |s, d, _, _| s * d); blend_fn!(multiply, |s, d, sa, da| s * inv(da) + d * inv(sa) + s * d); blend_fn!(screen, |s, d, _, _| s + d - s * d); blend_fn!(xor, |s, d, sa, da| s * inv(da) + d * inv(sa)); // Wants a type for some reason. -blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).min(f32x8::splat(1.0))); +blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).simd_min(f32x8::splat(1.0))); macro_rules! blend_fn2 { ($name:ident, $f:expr) => { fn $name(p: &mut Pipeline) { p.r = $f(p.r, p.dr, p.a, p.da); p.g = $f(p.g, p.dg, p.a, p.da); p.b = $f(p.b, p.db, p.a, p.da); - p.a = mad(p.da, inv(p.a), p.a); + p.a = p.da.mul_add(inv(p.a), p.a); p.next_stage(); } }; } -blend_fn2!(darken, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).max(d * sa)); -blend_fn2!(lighten, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).min(d * sa)); -blend_fn2!(difference, |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).min(d * sa))); +blend_fn2!(darken, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).simd_max(d * sa)); +blend_fn2!(lighten, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).simd_min(d * sa)); +blend_fn2!(difference, |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).simd_min(d * sa))); blend_fn2!(exclusion, |s: f32x8, d, _, _| s + d - two(s * d)); blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| - d.cmp_eq(da).blend( + d.simd_eq(da).select( d + s * inv(da), - s.cmp_eq(f32x8::default()).blend( + s.simd_eq(f32x8::default()).select( d * inv(sa), - sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa) + sa * (da - da.simd_min((da - d) * sa * s.recip())) + s * inv(da) + d * inv(sa) ) ) ); blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| - d.cmp_eq(f32x8::default()).blend( + d.simd_eq(f32x8::default()).select( s * inv(da), - s.cmp_eq(sa).blend( + s.simd_eq(sa).select( s + d * inv(sa), - sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa) + sa * da.simd_min((d * sa) * (sa - s).recip()) + s * inv(da) + d * inv(sa) ) ) ); blend_fn2!(hard_light, |s: f32x8, d: f32x8, sa, da| - s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend( + s * inv(da) + d * inv(sa) + two(s).simd_le(sa).select( two(s * d), sa * da - two((da - d) * (sa - s)) ) ); blend_fn2!(overlay, |s: f32x8, d: f32x8, sa, da| - s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend( + s * inv(da) + d * inv(sa) + two(d).simd_le(da).select( two(s * d), sa * da - two((da - d) * (sa - s)) ) ); blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| { - let m =
da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default()); + let m = da.simd_gt(f32x8::default()).select(d / da, f32x8::default()); let s2 = two(s); let m4 = two(two(m)); @@ -481,9 +492,9 @@ blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| { let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m; let lite_dst = m.sqrt() - m; let lite_src = d * sa + da * (s2 - sa) - * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3? + * two(two(d)).simd_le(da).select(dark_dst, lite_dst); // 2 or 3? - s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) // 1 or (2 or 3)? + s * inv(da) + d * inv(sa) + s2.simd_le(sa).select(dark_src, lite_src) // 1 or (2 or 3)? }); // We're basing our implementation of non-separable blend modes on @@ -600,7 +611,7 @@ fn luminosity_k( #[inline(always)] fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 { - r.max(g.max(b)) - r.min(g.min(b)) + r.simd_max(g.simd_max(b)) - r.simd_min(g.simd_min(b)) } #[inline(always)] @@ -610,13 +621,13 @@ fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 { #[inline(always)] fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) { - let mn = r.min(g.min(*b)); - let mx = r.max(g.max(*b)); + let mn = r.simd_min(g.simd_min(*b)); + let mx = r.simd_max(g.simd_max(*b)); let sat = mx - mn; // Map min channel to 0, max channel to s, and scale the middle proportionally. - let scale = |c| sat.cmp_eq(f32x8::default()) - .blend(f32x8::default(), (c - mn) * s / sat); + let scale = |c| sat.simd_eq(f32x8::default()) + .select(f32x8::default(), (c - mn) * s / sat); *r = scale(*r); *g = scale(*g); @@ -633,14 +644,14 @@ fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) { #[inline(always)] fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) { - let mn = r.min(g.min(*b)); - let mx = r.max(g.max(*b)); + let mn = r.simd_min(g.simd_min(*b)); + let mx = r.simd_max(g.simd_max(*b)); let l = lum(*r, *g, *b); let clip = |mut c| { - c = mx.cmp_ge(f32x8::default()).blend(c, l + (c - l) * l / (l - mn)); - c = mx.cmp_gt(a).blend(l + (c - l) * (a - l) / (mx - l), c); - c = c.max(f32x8::default()); // Sometimes without this we may dip just a little negative. + c = mx.simd_ge(f32x8::default()).select(c, l + (c - l) * l / (l - mn)); + c = mx.simd_gt(a).select(l + (c - l) * (a - l) / (mx - l), c); + c = c.simd_max(f32x8::default()); // Sometimes without this we may dip just a little negative. 
c }; @@ -652,10 +663,10 @@ fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) { pub fn source_over_rgba(p: &mut Pipeline) { let pixels = p.pixmap_dst.slice4_at_xy(p.dx, p.dy); load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); - p.r = mad(p.dr, inv(p.a), p.r); - p.g = mad(p.dg, inv(p.a), p.g); - p.b = mad(p.db, inv(p.a), p.b); - p.a = mad(p.da, inv(p.a), p.a); + p.r = p.dr.mul_add(inv(p.a), p.r); + p.g = p.dg.mul_add(inv(p.a), p.g); + p.b = p.db.mul_add(inv(p.a), p.b); + p.a = p.da.mul_add(inv(p.a), p.a); store_8888(&p.r, &p.g, &p.b, &p.a, pixels); p.next_stage(); @@ -664,10 +675,10 @@ pub fn source_over_rgba(p: &mut Pipeline) { pub fn source_over_rgba_tail(p: &mut Pipeline) { let pixels = p.pixmap_dst.slice_at_xy(p.dx, p.dy); load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); - p.r = mad(p.dr, inv(p.a), p.r); - p.g = mad(p.dg, inv(p.a), p.g); - p.b = mad(p.db, inv(p.a), p.b); - p.a = mad(p.da, inv(p.a), p.a); + p.r = p.dr.mul_add(inv(p.a), p.r); + p.g = p.dg.mul_add(inv(p.a), p.g); + p.b = p.db.mul_add(inv(p.a), p.b); + p.a = p.da.mul_add(inv(p.a), p.a); store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels); p.next_stage(); @@ -676,8 +687,8 @@ pub fn source_over_rgba_tail(p: &mut Pipeline) { fn transform(p: &mut Pipeline) { let ts = &p.ctx.transform; - let tr = mad(p.r, f32x8::splat(ts.sx), mad(p.g, f32x8::splat(ts.kx), f32x8::splat(ts.tx))); - let tg = mad(p.r, f32x8::splat(ts.ky), mad(p.g, f32x8::splat(ts.sy), f32x8::splat(ts.ty))); + let tr = p.r.mul_add(f32x8::splat(ts.sx), p.g.mul_add(f32x8::splat(ts.kx), f32x8::splat(ts.tx))); + let tg = p.r.mul_add(f32x8::splat(ts.ky), p.g.mul_add(f32x8::splat(ts.sy), f32x8::splat(ts.ty))); p.r = tr; p.g = tg; @@ -757,11 +768,9 @@ fn bicubic(p: &mut Pipeline) { #[inline(always)] fn bicubic_near(t: f32x8) -> f32x8 { // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 - mad( - t, - mad(t, - mad( - f32x8::splat(-21.0/18.0), + t.mul_add( + t.mul_add( + f32x8::splat(-21.0/18.0).mul_add( t, f32x8::splat(27.0/18.0), ), @@ -774,7 +783,7 @@ fn bicubic_near(t: f32x8) -> f32x8 { #[inline(always)] fn bicubic_far(t: f32x8) -> f32x8 { // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) - (t * t) * mad(f32x8::splat(7.0/18.0), t, f32x8::splat(-6.0/18.0)) + (t * t) * f32x8::splat(7.0/18.0).mul_add(t, f32x8::splat(-6.0/18.0)) } #[inline(always)] @@ -803,10 +812,10 @@ fn sampler_2x2( sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa); let w = wx[i] * wy[j]; - *r = mad(w, rr, *r); - *g = mad(w, gg, *g); - *b = mad(w, bb, *b); - *a = mad(w, aa, *a); + *r = w.mul_add(rr, *r); + *g = w.mul_add(gg, *g); + *b = w.mul_add(bb, *b); + *a = w.mul_add(aa, *a); x += one; } @@ -841,10 +850,10 @@ fn sampler_4x4( sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa); let w = wx[i] * wy[j]; - *r = mad(w, rr, *r); - *g = mad(w, gg, *g); - *b = mad(w, bb, *b); - *a = mad(w, aa, *a); + *r = w.mul_add(rr, *r); + *g = w.mul_add(gg, *g); + *b = w.mul_add(bb, *b); + *a = w.mul_add(aa, *a); x += one; } @@ -904,7 +913,7 @@ fn gradient(p: &mut Pipeline) { let mut idx = u32x8::default(); for i in 1..ctx.len { let tt = ctx.t_values[i].get(); - let n: u32x8 = bytemuck::cast([ + let n = u32x8::from_array([ (t[0] >= tt) as u32, (t[1] >= tt) as u32, (t[2] >= tt) as u32, @@ -925,7 +934,7 @@ fn gradient_lookup( ctx: &super::GradientCtx, idx: &u32x8, t: f32x8, r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8, ) { - let idx: [u32; 8] = bytemuck::cast(*idx); + let idx: &[u32; 8] = 
idx.as_array(); macro_rules! gather { ($d:expr, $c:ident) => { // Surprisingly, but bound checking doesn't affect the performance. // And since `idx` can contain any number, we should leave it in place. @@ -954,20 +963,20 @@ fn gradient_lookup( let bb = gather!(&ctx.biases, b); let ba = gather!(&ctx.biases, a); - *r = mad(t, fr, br); - *g = mad(t, fg, bg); - *b = mad(t, fb, bb); - *a = mad(t, fa, ba); + *r = t.mul_add(fr, br); + *g = t.mul_add(fg, bg); + *b = t.mul_add(fb, bb); + *a = t.mul_add(fa, ba); } fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { let ctx = &p.ctx.evenly_spaced_2_stop_gradient; let t = p.r; - p.r = mad(t, f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r)); - p.g = mad(t, f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g)); - p.b = mad(t, f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b)); - p.a = mad(t, f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a)); + p.r = t.mul_add(f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r)); + p.g = t.mul_add(f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g)); + p.b = t.mul_add(f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b)); + p.a = t.mul_add(f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a)); p.next_stage(); } @@ -1012,12 +1021,12 @@ fn mask_2pt_conical_degenerates(p: &mut Pipeline) { let ctx = &mut p.ctx.two_point_conical_gradient; let t = p.r; - let is_degenerate = t.cmp_le(f32x8::default()) | t.cmp_ne(t); - p.r = is_degenerate.blend(f32x8::default(), t); + let is_degenerate = t.simd_le(f32x8::default()) | t.simd_ne(t); + p.r = is_degenerate.select(f32x8::default(), t); - let is_not_degenerate = !is_degenerate.to_u32x8_bitcast(); - let is_not_degenerate: [u32; 8] = bytemuck::cast(is_not_degenerate); - ctx.mask = bytemuck::cast([ + let is_not_degenerate = !is_degenerate.to_int().cast::<u32>(); + let is_not_degenerate = is_not_degenerate.as_array(); + ctx.mask = u32x8::from_array([ if is_not_degenerate[0] != 0 { !0 } else { 0 }, if is_not_degenerate[1] != 0 { !0 } else { 0 }, if is_not_degenerate[2] != 0 { !0 } else { 0 }, @@ -1034,10 +1043,10 @@ fn apply_vector_mask(p: &mut Pipeline) { let ctx = &p.ctx.two_point_conical_gradient; - p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); - p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); - p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); - p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast(); + p.r = (p.r.cast::<u32>() & ctx.mask).cast::<f32>(); + p.g = (p.g.cast::<u32>() & ctx.mask).cast::<f32>(); + p.b = (p.b.cast::<u32>() & ctx.mask).cast::<f32>(); + p.a = (p.a.cast::<u32>() & ctx.mask).cast::<f32>(); p.next_stage(); } @@ -1145,7 +1154,7 @@ fn store_8888_tail( #[inline(always)] fn unnorm(v: &f32x8) -> i32x8 { - (v.max(f32x8::default()).min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round_int() + (v.simd_max(f32x8::default()).simd_min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round().cast() } #[inline(always)] @@ -1158,12 +1167,7 @@ fn two(v: f32x8) -> f32x8 { v + v } -#[inline(always)] -fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 { - f * m + a -} - #[inline(always)] fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 { - mad(to - from, t, from) + (to - from).mul_add(t, from) } diff --git a/src/pipeline/lowp.rs b/src/pipeline/lowp.rs index df0a1d3..a7f4de0 100644 --- a/src/pipeline/lowp.rs +++ b/src/pipeline/lowp.rs @@ -28,10 +28,11 @@ we are still 40-60% behind Skia built for Haswell. On ARM AArch64 the story is different and explicit SIMD make our code up to 2-3x faster.
*/ +use std::simd::{u16x16, f32x16, StdFloat, SimdFloat, SimdPartialOrd}; + use crate::PremultipliedColorU8; use crate::pixmap::SubPixmapMut; -use crate::wide::{f32x8, u16x16, f32x16}; use crate::geom::ScreenIntRect; pub const STAGE_WIDTH: usize = 16; @@ -142,6 +143,16 @@ pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool { core::ptr::eq(f1 as *const (), f2 as *const ()) } +trait F32x16Ext { + fn normalize(self) -> Self; +} + +impl F32x16Ext for f32x16 { + fn normalize(self) -> Self { + self.simd_max(f32x16::default()).simd_min(f32x16::splat(1.0)) + } +} + #[inline(never)] pub fn start( functions: &[StageFn], @@ -234,10 +245,10 @@ fn uniform_color(p: &mut Pipeline) { } fn seed_shader(p: &mut Pipeline) { - let iota = f32x16( - f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]), - ); + let iota = f32x16::from_array([ + 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, + 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5, + ]); let x = f32x16::splat(p.dx as f32) + iota; let y = f32x16::splat(p.dy as f32 + 0.5); @@ -285,7 +296,7 @@ pub fn load_dst_u8_tail(p: &mut Pipeline) { pub fn store_u8(p: &mut Pipeline) { let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy); - let a = p.a.as_slice(); + let a = p.a.as_array(); data[ 0] = a[ 0] as u8; data[ 1] = a[ 1] as u8; @@ -309,7 +320,7 @@ pub fn store_u8(p: &mut Pipeline) { pub fn store_u8_tail(p: &mut Pipeline) { let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy); - let a = p.a.as_slice(); + let a = p.a.as_array(); // This is better than `for i in 0..tail`, because this way the compiler // knows that we have only 16 steps and slices access is guarantee to be valid. @@ -331,7 +342,7 @@ fn load_mask_u8(p: &mut Pipeline) { let mut c = u16x16::default(); for i in 0..p.tail { - c.0[i] = u16::from(p.mask_ctx.data[offset + i]); + c.as_mut_array()[i] = u16::from(p.mask_ctx.data[offset + i]); } p.r = u16x16::splat(0); @@ -347,7 +358,7 @@ fn mask_u8(p: &mut Pipeline) { let mut c = u16x16::default(); for i in 0..p.tail { - c.0[i] = u16::from(p.mask_ctx.data[offset + i]); + c.as_mut_array()[i] = u16::from(p.mask_ctx.data[offset + i]); } if c == u16x16::default() { @@ -365,7 +376,7 @@ fn mask_u8(p: &mut Pipeline) { fn scale_u8(p: &mut Pipeline) { // Load u8xTail and cast it to u16x16. let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); - let c = u16x16([ + let c = u16x16::from_array([ u16::from(data[0]), u16::from(data[1]), 0, @@ -395,7 +406,7 @@ fn scale_u8(p: &mut Pipeline) { fn lerp_u8(p: &mut Pipeline) { // Load u8xTail and cast it to u16x16. let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); - let c = u16x16([ + let c = u16x16::from_array([ u16::from(data[0]), u16::from(data[1]), 0, @@ -470,7 +481,7 @@ blend_fn!(screen, |s, d, _, _| s + d - div255(s * d)); blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa))); // Wants a type for some reason. -blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255))); +blend_fn!(plus, |s: u16x16, d: u16x16, _, _| (s + d).min(u16x16::splat(255))); macro_rules! blend_fn2 { @@ -487,25 +498,30 @@ macro_rules! 
blend_fn2 { }; } -blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa)))); -blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa)))); +blend_fn2!(darken, |s: u16x16, d: u16x16, sa: u16x16, da: u16x16| s + d - div255((s * da).max(d * sa))); +blend_fn2!(lighten, |s: u16x16, d: u16x16, sa: u16x16, da: u16x16| s + d - div255((s * da).min(d * sa))); blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d)); -blend_fn2!(difference, |s: u16x16, d, sa, da| - s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa)))); +blend_fn2!(difference, |s: u16x16, d, sa, da: u16x16| + s + d - u16x16::splat(2) * div255((s * da).min(d * sa))); blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| { div255(s * inv(da) + d * inv(sa) - + (s+s).cmp_le(&sa).blend( + + blend((s+s).simd_le(sa).to_int().cast(), u16x16::splat(2) * s * d, sa * da - u16x16::splat(2) * (sa-s)*(da-d) ) ) }); +#[inline] +fn blend(a: u16x16, t: u16x16, e: u16x16) -> u16x16 { + (t & a) | (e & !a) +} + blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| { div255(s * inv(da) + d * inv(sa) - + (d+d).cmp_le(&da).blend( + + blend((d+d).simd_le(da).to_int().cast(), u16x16::splat(2) * s * d, sa * da - u16x16::splat(2) * (sa-s)*(da-d) ) @@ -542,8 +558,8 @@ fn transform(p: &mut Pipeline) { let x = join(&p.r, &p.g); let y = join(&p.b, &p.a); - let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx))); - let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty))); + let nx = x.mul_add(f32x16::splat(ts.sx), y.mul_add(f32x16::splat(ts.kx), f32x16::splat(ts.tx))); + let ny = x.mul_add(f32x16::splat(ts.ky), y.mul_add(f32x16::splat(ts.sy), f32x16::splat(ts.ty))); split(&nx, &mut p.r, &mut p.g); split(&ny, &mut p.b, &mut p.a); @@ -588,24 +604,24 @@ fn gradient(p: &mut Pipeline) { let mut idx = u16x16::splat(0); for i in 1..ctx.len { let tt = ctx.t_values[i].get(); - let t0: [f32; 8] = t.0.into(); - let t1: [f32; 8] = t.1.into(); - idx.0[ 0] += (t0[0] >= tt) as u16; - idx.0[ 1] += (t0[1] >= tt) as u16; - idx.0[ 2] += (t0[2] >= tt) as u16; - idx.0[ 3] += (t0[3] >= tt) as u16; - idx.0[ 4] += (t0[4] >= tt) as u16; - idx.0[ 5] += (t0[5] >= tt) as u16; - idx.0[ 6] += (t0[6] >= tt) as u16; - idx.0[ 7] += (t0[7] >= tt) as u16; - idx.0[ 8] += (t1[0] >= tt) as u16; - idx.0[ 9] += (t1[1] >= tt) as u16; - idx.0[10] += (t1[2] >= tt) as u16; - idx.0[11] += (t1[3] >= tt) as u16; - idx.0[12] += (t1[4] >= tt) as u16; - idx.0[13] += (t1[5] >= tt) as u16; - idx.0[14] += (t1[6] >= tt) as u16; - idx.0[15] += (t1[7] >= tt) as u16; + let t = t.as_array(); + let idx = idx.as_mut_array(); + idx[ 0] += (t[ 0] >= tt) as u16; + idx[ 1] += (t[ 1] >= tt) as u16; + idx[ 2] += (t[ 2] >= tt) as u16; + idx[ 3] += (t[ 3] >= tt) as u16; + idx[ 4] += (t[ 4] >= tt) as u16; + idx[ 5] += (t[ 5] >= tt) as u16; + idx[ 6] += (t[ 6] >= tt) as u16; + idx[ 7] += (t[ 7] >= tt) as u16; + idx[ 8] += (t[ 8] >= tt) as u16; + idx[ 9] += (t[ 9] >= tt) as u16; + idx[10] += (t[10] >= tt) as u16; + idx[11] += (t[11] >= tt) as u16; + idx[12] += (t[12] >= tt) as u16; + idx[13] += (t[13] >= tt) as u16; + idx[14] += (t[14] >= tt) as u16; + idx[15] += (t[15] >= tt) as u16; } gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a); @@ -617,10 +633,10 @@ fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { let t = join(&p.r, &p.g); round_f32_to_u16( - mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)), - mad(t, f32x16::splat(ctx.factor.g), 
f32x16::splat(ctx.bias.g)), - mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)), - mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)), + t.mul_add(f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)), + t.mul_add(f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)), + t.mul_add(f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)), + t.mul_add(f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)), &mut p.r, &mut p.g, &mut p.b, &mut p.a, ); @@ -643,32 +659,29 @@ fn gradient_lookup( ctx: &super::GradientCtx, idx: &u16x16, t: f32x16, r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, ) { + let idx = idx.as_array(); macro_rules! gather { ($d:expr, $c:ident) => { // Surprisingly, but bound checking doesn't affect the performance. // And since `idx` can contain any number, we should leave it in place. - f32x16( - f32x8::from([ - $d[idx.0[ 0] as usize].$c, - $d[idx.0[ 1] as usize].$c, - $d[idx.0[ 2] as usize].$c, - $d[idx.0[ 3] as usize].$c, - $d[idx.0[ 4] as usize].$c, - $d[idx.0[ 5] as usize].$c, - $d[idx.0[ 6] as usize].$c, - $d[idx.0[ 7] as usize].$c, - ]), - f32x8::from([ - $d[idx.0[ 8] as usize].$c, - $d[idx.0[ 9] as usize].$c, - $d[idx.0[10] as usize].$c, - $d[idx.0[11] as usize].$c, - $d[idx.0[12] as usize].$c, - $d[idx.0[13] as usize].$c, - $d[idx.0[14] as usize].$c, - $d[idx.0[15] as usize].$c, - ]), - ) + f32x16::from_array([ + $d[idx[ 0] as usize].$c, + $d[idx[ 1] as usize].$c, + $d[idx[ 2] as usize].$c, + $d[idx[ 3] as usize].$c, + $d[idx[ 4] as usize].$c, + $d[idx[ 5] as usize].$c, + $d[idx[ 6] as usize].$c, + $d[idx[ 7] as usize].$c, + $d[idx[ 8] as usize].$c, + $d[idx[ 9] as usize].$c, + $d[idx[10] as usize].$c, + $d[idx[11] as usize].$c, + $d[idx[12] as usize].$c, + $d[idx[13] as usize].$c, + $d[idx[14] as usize].$c, + $d[idx[15] as usize].$c, + ]) }; } @@ -683,10 +696,10 @@ fn gradient_lookup( let ba = gather!(&ctx.biases, a); round_f32_to_u16( - mad(t, fr, br), - mad(t, fg, bg), - mad(t, fb, bb), - mad(t, fa, ba), + t.mul_add(fr, br), + t.mul_add(fg, bg), + t.mul_add(fb, bb), + t.mul_add(fa, ba), r, g, b, a, ); } @@ -704,10 +717,42 @@ fn round_f32_to_u16( let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); let af = af * f32x16::splat(255.0) + f32x16::splat(0.5); - rf.save_to_u16x16(r); - gf.save_to_u16x16(g); - bf.save_to_u16x16(b); - af.save_to_u16x16(a); + save_to_u16x16(rf, r); + save_to_u16x16(gf, g); + save_to_u16x16(bf, b); + save_to_u16x16(af, a); +} + +// TODO: optimize +// This method is too heavy and shouldn't be inlined. +fn save_to_u16x16(src: f32x16, dst: &mut u16x16) { + // Do not use to_i32x8, because it involves rounding, + // and Skia cast's without it. 
+ + // let n0: [f32; 8] = self.0.into(); + // let n1: [f32; 8] = self.1.into(); + let n = src.as_array(); + let dst = dst.as_mut_array(); + + dst[0] = n[0] as u16; + dst[1] = n[1] as u16; + dst[2] = n[2] as u16; + dst[3] = n[3] as u16; + + dst[4] = n[4] as u16; + dst[5] = n[5] as u16; + dst[6] = n[6] as u16; + dst[7] = n[7] as u16; + + dst[8] = n[8] as u16; + dst[9] = n[9] as u16; + dst[10] = n[10] as u16; + dst[11] = n[11] as u16; + + dst[12] = n[12] as u16; + dst[13] = n[13] as u16; + dst[14] = n[14] as u16; + dst[15] = n[15] as u16; } pub fn just_return(_: &mut Pipeline) { @@ -723,28 +768,28 @@ fn load_8888( data: &[PremultipliedColorU8; STAGE_WIDTH], r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, ) { - *r = u16x16([ + *r = u16x16::from_array([ data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, ]); - *g = u16x16([ + *g = u16x16::from_array([ data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, ]); - *b = u16x16([ + *b = u16x16::from_array([ data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, ]); - *a = u16x16([ + *a = u16x16::from_array([ data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, @@ -769,10 +814,10 @@ fn store_8888( r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, data: &mut [PremultipliedColorU8; STAGE_WIDTH], ) { - let r = r.as_slice(); - let g = g.as_slice(); - let b = b.as_slice(); - let a = a.as_slice(); + let r = r.as_array(); + let g = g.as_array(); + let b = b.as_array(); + let a = a.as_array(); data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); @@ -797,10 +842,10 @@ fn store_8888_tail( r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, tail: usize, data: &mut [PremultipliedColorU8], ) { - let r = r.as_slice(); - let g = g.as_slice(); - let b = b.as_slice(); - let a = a.as_slice(); + let r = r.as_array(); + let g = g.as_array(); + let b = b.as_array(); + let a = a.as_array(); // This is better than `for i in 0..tail`, because this way the compiler // knows that we have only 16 steps and slices access is guarantee to be valid. 
@@ -818,7 +863,7 @@ fn store_8888_tail( #[inline(always)] fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { - *a = u16x16([ + *a = u16x16::from_array([ data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, @@ -830,7 +875,7 @@ fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { fn div255(v: u16x16) -> u16x16 { // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available, // but it doesn't affect performance much and breaks reproducible result. Ignore it. - // NOTE: the compiler does not replace the devision with a shift. + // NOTE: the compiler does not replace the division with a shift. (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256) } @@ -852,9 +897,9 @@ fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 { #[inline(always)] fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) { // We're splitting f32x16 (512bit) into two u16x16 (256 bit). - let data: [u8; 64] = bytemuck::cast(*v); - let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0); - let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0); + let data: [u8; 64] = bytemuck::cast(*v.as_array()); + let d0: &mut [u8; 32] = bytemuck::cast_mut(lo.as_mut_array()); + let d1: &mut [u8; 32] = bytemuck::cast_mut(hi.as_mut_array()); d0.copy_from_slice(&data[0..32]); d1.copy_from_slice(&data[32..64]); @@ -864,20 +909,14 @@ fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) { fn join(lo: &u16x16, hi: &u16x16) -> f32x16 { // We're joining two u16x16 (256 bit) into f32x16 (512bit). - let d0: [u8; 32] = bytemuck::cast(lo.0); - let d1: [u8; 32] = bytemuck::cast(hi.0); + let d0: [u8; 32] = bytemuck::cast(*lo.as_array()); + let d1: [u8; 32] = bytemuck::cast(*hi.as_array()); let mut v = f32x16::default(); - let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v); + let data: &mut [u8; 64] = bytemuck::cast_mut(v.as_mut_array()); data[0..32].copy_from_slice(&d0); data[32..64].copy_from_slice(&d1); v } - -#[inline(always)] -fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 { - // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it. - f * m + a -} diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index ee2b252..d7c9318 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -45,6 +45,7 @@ and should be optimized out in the future. */ use alloc::vec::Vec; +use core::simd::u32x8; use arrayvec::ArrayVec; @@ -57,7 +58,6 @@ pub use blitter::RasterPipelineBlitter; use crate::geom::ScreenIntRect; use crate::pixmap::SubPixmapMut; -use crate::wide::u32x8; mod blitter; #[rustfmt::skip] mod highp; @@ -137,7 +137,7 @@ pub const STAGES_COUNT: usize = Stage::ApplyVectorMask as usize + 1; impl<'a> PixmapRef<'a> { #[inline(always)] pub(crate) fn gather(&self, index: u32x8) -> [PremultipliedColorU8; highp::STAGE_WIDTH] { - let index: [u32; 8] = bytemuck::cast(index); + let index: &[u32; 8] = index.as_array(); let pixels = self.pixels(); [ pixels[index[0] as usize], diff --git a/src/scan/hairline.rs b/src/scan/hairline.rs index 150078b..6abd3d2 100644 --- a/src/scan/hairline.rs +++ b/src/scan/hairline.rs @@ -5,8 +5,9 @@ // found in the LICENSE file. 
use core::convert::TryInto; +use core::simd::{f32x2, SimdFloat}; -use tiny_skia_path::{f32x2, PathVerb, SaturateCast, Scalar}; +use tiny_skia_path::{PathVerb, SaturateCast, Scalar}; use crate::{IntRect, LineCap, Path, PathSegment, Point, Rect}; @@ -27,6 +28,48 @@ pub type LineProc = fn(&[Point], Option<&ScreenIntRect>, &mut dyn Blitter); const MAX_CUBIC_SUBDIVIDE_LEVEL: u8 = 9; const MAX_QUAD_SUBDIVIDE_LEVEL: u8 = 5; +trait F32x2Ext { + fn x(self) -> f32; + fn y(self) -> f32; + fn max_component(self) -> f32; +} + +impl F32x2Ext for f32x2 { + fn x(self) -> f32 { + self.as_array()[0] + } + + fn y(self) -> f32 { + self.as_array()[1] + } + + fn max_component(self) -> f32 { + let a = self.x(); + let b = self.y(); + // This is faster than `f32::max`. Unlike std one, we do not care about NaN. + if a < b { + b + } else { + a + } + } +} + +trait PointExt { + fn from_f32x2(r: f32x2) -> Self; + fn to_f32x2(&self) -> f32x2; +} + +impl PointExt for Point { + fn from_f32x2(r: f32x2) -> Self { + Point::from_xy(r.as_array()[0], r.as_array()[1]) + } + + fn to_f32x2(&self) -> f32x2 { + f32x2::from_array([self.x, self.y]) + } +} + pub fn stroke_path( path: &Path, line_cap: LineCap, @@ -429,8 +472,8 @@ fn compute_nocheck_quad_bounds(points: &[Point; 3]) -> Option { let mut max = min; for i in 1..3 { let pair = points[i].to_f32x2(); - min = min.min(pair); - max = max.max(pair); + min = min.simd_min(pair); + max = max.simd_max(pair); } Rect::from_ltrb(min.x(), min.y(), max.x(), max.y()) @@ -564,8 +607,8 @@ fn compute_nocheck_cubic_bounds(points: &[Point; 4]) -> Option { let mut max = min; for i in 1..4 { let pair = points[i].to_f32x2(); - min = min.min(pair); - max = max.max(pair); + min = min.simd_min(pair); + max = max.simd_max(pair); } Rect::from_ltrb(min.x(), min.y(), max.x(), max.y()) @@ -631,7 +674,7 @@ fn compute_cubic_segments(points: &[Point; 4]) -> usize { let p13 = one_third * p3 + two_third * p0; let p23 = one_third * p0 + two_third * p3; - let diff = (p1 - p13).abs().max((p2 - p23).abs()).max_component(); + let diff = (p1 - p13).abs().simd_max((p2 - p23).abs()).max_component(); let mut tol = 1.0 / 8.0; for i in 0..MAX_CUBIC_SUBDIVIDE_LEVEL { diff --git a/src/shaders/linear_gradient.rs b/src/shaders/linear_gradient.rs index 8cf1b41..81b47c3 100644 --- a/src/shaders/linear_gradient.rs +++ b/src/shaders/linear_gradient.rs @@ -110,10 +110,10 @@ fn points_to_unit_ts(start: Point, end: Point) -> Option { } fn average_gradient_color(points: &[GradientStop]) -> Color { - use crate::wide::f32x4; + use core::simd::f32x4; fn load_color(c: Color) -> f32x4 { - f32x4::from([c.red(), c.green(), c.blue(), c.alpha()]) + f32x4::from_array([c.red(), c.green(), c.blue(), c.alpha()]) } fn store_color(c: f32x4) -> Color { diff --git a/src/shaders/radial_gradient.rs b/src/shaders/radial_gradient.rs index 3c7f441..ce1104d 100644 --- a/src/shaders/radial_gradient.rs +++ b/src/shaders/radial_gradient.rs @@ -13,7 +13,6 @@ use crate::{GradientStop, Point, Shader, SpreadMode, Transform}; use super::gradient::{Gradient, DEGENERATE_THRESHOLD}; use crate::pipeline; use crate::pipeline::RasterPipelineBuilder; -use crate::wide::u32x8; #[cfg(all(not(feature = "std"), feature = "no-std-float"))] use tiny_skia_path::NoStdFloat; @@ -142,7 +141,7 @@ impl RadialGradient { }; p.ctx.two_point_conical_gradient = pipeline::TwoPointConicalGradientCtx { - mask: u32x8::default(), + mask: core::simd::u32x8::default(), p0, }; diff --git a/src/wide/f32x16_t.rs b/src/wide/f32x16_t.rs deleted file mode 100644 index 3cd76a1..0000000 --- 
a/src/wide/f32x16_t.rs +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -use super::{f32x8, u16x16}; - -#[derive(Copy, Clone, Debug)] -#[repr(C, align(32))] -pub struct f32x16(pub f32x8, pub f32x8); - -unsafe impl bytemuck::Zeroable for f32x16 {} -unsafe impl bytemuck::Pod for f32x16 {} - -impl Default for f32x16 { - fn default() -> Self { - Self::splat(0.0) - } -} - -impl f32x16 { - pub fn splat(n: f32) -> Self { - Self(f32x8::splat(n), f32x8::splat(n)) - } - - #[inline] - pub fn abs(&self) -> Self { - // Yes, Skia does it in the same way. - let abs = |x| bytemuck::cast::(bytemuck::cast::(x) & 0x7fffffff); - - let n0: [f32; 8] = self.0.into(); - let n1: [f32; 8] = self.1.into(); - Self( - f32x8::from([ - abs(n0[0]), - abs(n0[1]), - abs(n0[2]), - abs(n0[3]), - abs(n0[4]), - abs(n0[5]), - abs(n0[6]), - abs(n0[7]), - ]), - f32x8::from([ - abs(n1[0]), - abs(n1[1]), - abs(n1[2]), - abs(n1[3]), - abs(n1[4]), - abs(n1[5]), - abs(n1[6]), - abs(n1[7]), - ]), - ) - } - - pub fn cmp_gt(self, rhs: &Self) -> Self { - Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1)) - } - - pub fn blend(self, t: Self, f: Self) -> Self { - Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1)) - } - - pub fn normalize(&self) -> Self { - Self(self.0.normalize(), self.1.normalize()) - } - - pub fn floor(&self) -> Self { - // Yes, Skia does it in the same way. - let roundtrip = self.round(); - roundtrip - - roundtrip - .cmp_gt(self) - .blend(f32x16::splat(1.0), f32x16::splat(0.0)) - } - - pub fn sqrt(&self) -> Self { - Self(self.0.sqrt(), self.1.sqrt()) - } - - pub fn round(&self) -> Self { - Self(self.0.round(), self.1.round()) - } - - // This method is too heavy and shouldn't be inlined. - pub fn save_to_u16x16(&self, dst: &mut u16x16) { - // Do not use to_i32x8, because it involves rounding, - // and Skia cast's without it. - - let n0: [f32; 8] = self.0.into(); - let n1: [f32; 8] = self.1.into(); - - dst.0[0] = n0[0] as u16; - dst.0[1] = n0[1] as u16; - dst.0[2] = n0[2] as u16; - dst.0[3] = n0[3] as u16; - - dst.0[4] = n0[4] as u16; - dst.0[5] = n0[5] as u16; - dst.0[6] = n0[6] as u16; - dst.0[7] = n0[7] as u16; - - dst.0[8] = n1[0] as u16; - dst.0[9] = n1[1] as u16; - dst.0[10] = n1[2] as u16; - dst.0[11] = n1[3] as u16; - - dst.0[12] = n1[4] as u16; - dst.0[13] = n1[5] as u16; - dst.0[14] = n1[6] as u16; - dst.0[15] = n1[7] as u16; - } -} - -impl core::ops::Add for f32x16 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self(self.0 + rhs.0, self.1 + rhs.1) - } -} - -impl core::ops::Sub for f32x16 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self(self.0 - rhs.0, self.1 - rhs.1) - } -} - -impl core::ops::Mul for f32x16 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - Self(self.0 * rhs.0, self.1 * rhs.1) - } -} diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs deleted file mode 100644 index 21d5140..0000000 --- a/src/wide/f32x4_t.rs +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -#[cfg(all(not(feature = "std"), feature = "no-std-float"))] -use tiny_skia_path::NoStdFloat; - -use super::i32x4; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct f32x4(__m128); - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - use core::arch::wasm32::*; - - // repr(transparent) allows for directly passing the v128 on the WASM stack. - #[derive(Clone, Copy, Debug)] - #[repr(transparent)] - pub struct f32x4(v128); - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - use core::arch::aarch64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct f32x4(float32x4_t); - } else { - use super::FasterMinMax; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct f32x4([f32; 4]); - } -} - -unsafe impl bytemuck::Zeroable for f32x4 {} -unsafe impl bytemuck::Pod for f32x4 {} - -impl Default for f32x4 { - fn default() -> Self { - Self::splat(0.0) - } -} - -impl f32x4 { - pub fn splat(n: f32) -> Self { - Self::from([n, n, n, n]) - } - - pub fn floor(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_floor(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vrndmq_f32(self.0) }) - } else { - let roundtrip: f32x4 = cast(self.trunc_int().to_f32x4()); - roundtrip - roundtrip.cmp_gt(self).blend(f32x4::splat(1.0), f32x4::default()) - } - } - } - - pub fn abs(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_abs(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vabsq_f32(self.0) }) - } else { - let non_sign_bits = f32x4::splat(f32::from_bits(i32::MAX as u32)); - self & non_sign_bits - } - } - } - - pub fn max(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_max_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_pmax(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmaxq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].faster_max(rhs.0[0]), - self.0[1].faster_max(rhs.0[1]), - self.0[2].faster_max(rhs.0[2]), - self.0[3].faster_max(rhs.0[3]), - ]) - } - } - } - - pub fn min(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_min_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_pmin(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vminq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].faster_min(rhs.0[0]), - self.0[1].faster_min(rhs.0[1]), - self.0[2].faster_min(rhs.0[2]), - self.0[3].faster_min(rhs.0[3]), - ]) - } - } - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpeq_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_eq(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vceqq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] == rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] == rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] == rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] == rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_ne(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpneq_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_ne(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vmvnq_u32(vceqq_f32(self.0, rhs.0)) })) - } else { - Self([ - if self.0[0] != rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] != rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] != rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] != rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_ge(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpge_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_ge(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcgeq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] >= rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] >= rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] >= rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] >= rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpgt_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_gt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcgtq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] > rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] > rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] > rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] > rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_le(self, rhs: Self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmple_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_le(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcleq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] <= rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] <= rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] <= rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] <= rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmplt_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_lt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vcltq_f32(self.0, rhs.0) })) - } else { - Self([ - if self.0[0] < rhs.0[0] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[1] < rhs.0[1] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[2] < rhs.0[2] { f32::from_bits(u32::MAX) } else { 0.0 }, - if self.0[3] < rhs.0[3] { f32::from_bits(u32::MAX) } else { 0.0 }, - ]) - } - } - } - - #[inline] - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self(unsafe { _mm_blendv_ps(f.0, t.0, self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_bitselect(t.0, f.0, self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vbslq_u32( cast(self.0), cast(t.0), cast(f.0))) }) - } else { - super::generic_bit_blend(self, t, f) - } - } - } - - pub fn round(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self( - unsafe { _mm_round_ps(self.0, _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT) }, - ) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_nearest(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vrndnq_f32(self.0) }) - } else { - use super::u32x4; - - let to_int = f32x4::splat(1.0 / f32::EPSILON); - let u: u32x4 = cast(self); - let e: i32x4 = cast(u.shr::<23>() & u32x4::splat(0xff)); - let mut y: f32x4; - - let no_op_magic = i32x4::splat(0x7f + 23); - let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic)); - let no_op_val: f32x4 = self; - - let zero_magic = i32x4::splat(0x7f - 1); - let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic)); - let zero_val: f32x4 = self * f32x4::splat(0.0); - - let neg_bit: f32x4 = cast(cast::(u).cmp_lt(i32x4::default())); - let x: f32x4 = neg_bit.blend(-self, self); - y = x + to_int - to_int - x; - y = y.cmp_gt(f32x4::splat(0.5)).blend( - y + x - f32x4::splat(-1.0), - y.cmp_lt(f32x4::splat(-0.5)).blend(y + x + f32x4::splat(1.0), y + x), - ); - y = neg_bit.blend(-y, y); - - no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y)) - } - } - } - - pub fn round_int(self) -> i32x4 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - i32x4(unsafe { _mm_cvtps_epi32(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - i32x4(i32x4_trunc_sat_f32x4(self.round().0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - i32x4(unsafe { vcvtnq_s32_f32(self.0) } ) - } else { - let rounded: [f32; 4] = cast(self.round()); - cast([ - rounded[0] as i32, - rounded[1] as i32, - rounded[2] as i32, - rounded[3] as i32, - ]) - } - } - } - - pub fn trunc_int(self) -> i32x4 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - i32x4(unsafe { _mm_cvttps_epi32(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - i32x4(i32x4_trunc_sat_f32x4(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - i32x4(unsafe { vcvtq_s32_f32(self.0) }) - } else { - cast([ - self.0[0] as i32, - self.0[1] as i32, - self.0[2] as i32, - self.0[3] as i32, - ]) - } - } - } - - pub fn recip_fast(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_rcp_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_div(f32x4_splat(1.0), self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - unsafe { - let a = vrecpeq_f32(self.0); - let a = vmulq_f32(vrecpsq_f32(self.0, a), a); - Self(a) - } - } else { - Self::from([ - 1.0 / self.0[0], - 1.0 / self.0[1], - 1.0 / self.0[2], - 1.0 / self.0[3], - ]) - } - } - } - - pub fn recip_sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_rsqrt_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.0))) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - unsafe { - let a = vrsqrteq_f32(self.0); - let a = vmulq_f32(vrsqrtsq_f32(self.0, vmulq_f32(a, a)), a); - Self(a) - } - } else { - Self::from([ - 1.0 / self.0[0].sqrt(), - 1.0 / self.0[1].sqrt(), - 1.0 / self.0[2].sqrt(), - 1.0 / self.0[3].sqrt(), - ]) - } - } - } - - pub fn sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_sqrt_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_sqrt(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vsqrtq_f32(self.0) }) - } else { - Self::from([ - self.0[0].sqrt(), - self.0[1].sqrt(), - self.0[2].sqrt(), - self.0[3].sqrt(), - ]) - } - } - } -} - -impl From<[f32; 4]> for f32x4 { - fn from(v: [f32; 4]) -> Self { - cast(v) - } -} - -impl From for [f32; 4] { - fn from(v: f32x4) -> Self { - cast(v) - } -} - -impl core::ops::Add for f32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_add_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_add(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vaddq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] + rhs.0[0], - self.0[1] + rhs.0[1], - self.0[2] + rhs.0[2], - self.0[3] + rhs.0[3], - ]) - } - } - } -} - -impl core::ops::AddAssign for f32x4 { - fn add_assign(&mut self, rhs: f32x4) { - *self = *self + rhs; - } -} - -impl core::ops::Sub for f32x4 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_sub_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_sub(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vsubq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] - rhs.0[0], - self.0[1] - rhs.0[1], - self.0[2] - rhs.0[2], - self.0[3] - rhs.0[3], - ]) - } - } - } -} - -impl core::ops::Mul for f32x4 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_mul_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_mul(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmulq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] * rhs.0[0], - self.0[1] * rhs.0[1], - self.0[2] * rhs.0[2], - self.0[3] * rhs.0[3], - ]) - } - } - } -} - -impl core::ops::MulAssign for f32x4 { - fn mul_assign(&mut self, rhs: f32x4) { - *self = *self * rhs; - } -} - -impl core::ops::Div for f32x4 { - type Output = Self; - - fn div(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_div_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(f32x4_div(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vdivq_f32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] / rhs.0[0], - self.0[1] / rhs.0[1], - self.0[2] / rhs.0[2], - self.0[3] / rhs.0[3], - ]) - } - } - } -} - -impl core::ops::BitAnd for f32x4 { - type Output = Self; - - #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_and_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_and(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vandq_u32(cast(self.0), cast(rhs.0)) })) - } else { - Self([ - f32::from_bits(self.0[0].to_bits() & rhs.0[0].to_bits()), - f32::from_bits(self.0[1].to_bits() & rhs.0[1].to_bits()), - f32::from_bits(self.0[2].to_bits() & rhs.0[2].to_bits()), - f32::from_bits(self.0[3].to_bits() & rhs.0[3].to_bits()), - ]) - } - } - } -} - -impl core::ops::BitOr for f32x4 { - type Output = Self; - - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_or_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_or(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vorrq_u32(cast(self.0), cast(rhs.0)) })) - } else { - Self([ - f32::from_bits(self.0[0].to_bits() | rhs.0[0].to_bits()), - f32::from_bits(self.0[1].to_bits() | rhs.0[1].to_bits()), - f32::from_bits(self.0[2].to_bits() | rhs.0[2].to_bits()), - f32::from_bits(self.0[3].to_bits() | rhs.0[3].to_bits()), - ]) - } - } - } -} - -impl core::ops::BitXor for f32x4 { - type Output = Self; - - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_xor_ps(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_xor(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { veorq_u32(cast(self.0), cast(rhs.0)) })) - } else { - Self([ - f32::from_bits(self.0[0].to_bits() ^ rhs.0[0].to_bits()), - f32::from_bits(self.0[1].to_bits() ^ rhs.0[1].to_bits()), - f32::from_bits(self.0[2].to_bits() ^ rhs.0[2].to_bits()), - f32::from_bits(self.0[3].to_bits() ^ rhs.0[3].to_bits()), - ]) - } - } - } -} - -impl core::ops::Neg for f32x4 { - type Output = Self; - - fn neg(self) -> Self { - Self::default() - self - } -} - -impl core::ops::Not for f32x4 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - unsafe { - let all_bits = _mm_set1_ps(f32::from_bits(u32::MAX)); - Self(_mm_xor_ps(self.0, all_bits)) - } - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_not(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(cast(unsafe { vmvnq_u32(cast(self.0)) })) - } else { - self ^ Self::splat(cast(u32::MAX)) - } - } - } -} - -impl core::cmp::PartialEq for f32x4 { - fn eq(&self, rhs: &Self) -> bool { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - unsafe { _mm_movemask_ps(_mm_cmpeq_ps(self.0, rhs.0)) == 0b1111 } - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - unsafe { vminvq_u32(vceqq_f32(self.0, rhs.0)) != 0 } - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - u32x4_all_true(f32x4_eq(self.0, rhs.0)) - } else { - self.0 == rhs.0 - } - } - } -} diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs deleted file mode 100644 index 6039334..0000000 --- a/src/wide/f32x8_t.rs +++ /dev/null @@ -1,403 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -use super::{i32x8, u32x8}; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct f32x8(__m256); - } else { - use super::f32x4; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct f32x8(pub f32x4, pub f32x4); - } -} - -unsafe impl bytemuck::Zeroable for f32x8 {} -unsafe impl bytemuck::Pod for f32x8 {} - -impl Default for f32x8 { - fn default() -> Self { - Self::splat(0.0) - } -} - -impl f32x8 { - pub fn splat(n: f32) -> Self { - cast([n, n, n, n, n, n, n, n]) - } - - pub fn floor(self) -> Self { - let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8()); - roundtrip - - roundtrip - .cmp_gt(self) - .blend(f32x8::splat(1.0), f32x8::default()) - } - - pub fn fract(self) -> Self { - self - self.floor() - } - - pub fn normalize(self) -> Self { - self.max(f32x8::default()).min(f32x8::splat(1.0)) - } - - pub fn to_i32x8_bitcast(self) -> i32x8 { - bytemuck::cast(self) - } - - pub fn to_u32x8_bitcast(self) -> u32x8 { - bytemuck::cast(self) - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_EQ_OQ) }) - } else { - Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1)) - } - } - } - - pub fn cmp_ne(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_NEQ_OQ) }) - } else { - Self(self.0.cmp_ne(rhs.0), self.1.cmp_ne(rhs.1)) - } - } - } - - pub fn cmp_ge(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_GE_OQ) }) - } else { - Self(self.0.cmp_ge(rhs.0), self.1.cmp_ge(rhs.1)) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_GT_OQ) }) - } else { - Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1)) - } - } - } - - pub fn cmp_le(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_LE_OQ) }) - } else { - Self(self.0.cmp_le(rhs.0), self.1.cmp_le(rhs.1)) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_LT_OQ) }) - } else { - Self(self.0.cmp_lt(rhs.0), self.1.cmp_lt(rhs.1)) - } - } - } - - #[inline] - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_blendv_ps(f.0, t.0, self.0) }) - } else { - Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1)) - } - } - } - - pub fn abs(self) -> Self { - let non_sign_bits = f32x8::splat(f32::from_bits(i32::MAX as u32)); - self & non_sign_bits - } - - pub fn max(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_max_ps(self.0, rhs.0) }) - } else { - Self(self.0.max(rhs.0), self.1.max(rhs.1)) - } - } - } - - pub fn min(self, rhs: Self) -> Self { - // These technically don't have the same semantics for NaN and 0, but it - // doesn't seem to matter as Skia does it the same way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_min_ps(self.0, rhs.0) }) - } else { - Self(self.0.min(rhs.0), self.1.min(rhs.1)) - } - } - } - - pub fn is_finite(self) -> Self { - let shifted_exp_mask = u32x8::splat(0xFF000000); - let u: u32x8 = cast(self); - let shift_u = u.shl::<1>(); - let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask); - cast(out) - } - - pub fn round(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_round_ps(self.0, _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT) }) - } else { - Self(self.0.round(), self.1.round()) - } - } - } - - pub fn round_int(self) -> i32x8 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - cast(unsafe { _mm256_cvtps_epi32(self.0) }) - } else { - i32x8(self.0.round_int(), self.1.round_int()) - } - } - } - - pub fn trunc_int(self) -> i32x8 { - // These technically don't have the same semantics for NaN and out of - // range values, but it doesn't seem to matter as Skia does it the same - // way. - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - cast(unsafe { _mm256_cvttps_epi32(self.0) }) - } else { - i32x8(self.0.trunc_int(), self.1.trunc_int()) - } - } - } - - pub fn recip_fast(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_rcp_ps(self.0) }) - } else { - Self(self.0.recip_fast(), self.1.recip_fast()) - } - } - } - - pub fn recip_sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_rsqrt_ps(self.0) }) - } else { - Self(self.0.recip_sqrt(), self.1.recip_sqrt()) - } - } - } - - pub fn sqrt(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_sqrt_ps(self.0) }) - } else { - Self(self.0.sqrt(), self.1.sqrt()) - } - } - } -} - -impl From<[f32; 8]> for f32x8 { - fn from(v: [f32; 8]) -> Self { - cast(v) - } -} - -impl From for [f32; 8] { - fn from(v: f32x8) -> Self { - cast(v) - } -} - -impl core::ops::Add for f32x8 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_add_ps(self.0, rhs.0) }) - } else { - Self(self.0 + rhs.0, self.1 + rhs.1) - } - } - } -} - -impl core::ops::AddAssign for f32x8 { - fn add_assign(&mut self, rhs: f32x8) { - *self = *self + rhs; - } -} - -impl core::ops::Sub for f32x8 { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_sub_ps(self.0, rhs.0) }) - } else { - Self(self.0 - rhs.0, self.1 - rhs.1) - } - } - } -} - -impl core::ops::Mul for f32x8 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_mul_ps(self.0, rhs.0) }) - } else { - Self(self.0 * rhs.0, self.1 * rhs.1) - } - } - } -} - -impl core::ops::MulAssign for f32x8 { - fn mul_assign(&mut self, rhs: f32x8) { - *self = *self * rhs; - } -} - -impl core::ops::Div for f32x8 { - type Output = Self; - - fn div(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_div_ps(self.0, rhs.0) }) - } else { - Self(self.0 / rhs.0, self.1 / rhs.1) - } - } - } -} - -impl core::ops::BitAnd for f32x8 { - type Output = Self; - - #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_and_ps(self.0, rhs.0) }) - } else { - Self(self.0 & rhs.0, self.1 & rhs.1) - } - } - } -} - -impl core::ops::BitOr for f32x8 { - type Output = Self; - - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_or_ps(self.0, rhs.0) }) - } else { - Self(self.0 | rhs.0, self.1 | rhs.1) - } - } - } -} - -impl core::ops::BitXor for f32x8 { - type Output = Self; - - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - Self(unsafe { _mm256_xor_ps(self.0, rhs.0) }) - } else { - Self(self.0 ^ rhs.0, self.1 ^ rhs.1) - } - } - } -} - -impl core::ops::Neg for f32x8 { - type Output = Self; - - fn neg(self) -> Self { - Self::default() - self - } -} - -impl core::ops::Not for f32x8 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - let all_bits = unsafe { _mm256_set1_ps(f32::from_bits(u32::MAX)) }; - Self(unsafe { _mm256_xor_ps(self.0, all_bits) }) - } else { - Self(!self.0, !self.1) - } - } - } -} - -impl core::cmp::PartialEq for f32x8 { - fn eq(&self, rhs: &Self) -> bool { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx"))] { - let mask = unsafe { _mm256_cmp_ps(self.0, rhs.0, _CMP_EQ_OQ) }; - unsafe { _mm256_movemask_ps(mask) == 0b1111_1111 } - } else { - self.0 == rhs.0 && self.1 == rhs.1 - } - } - } -} diff --git a/src/wide/i32x4_t.rs b/src/wide/i32x4_t.rs deleted file mode 100644 index fb77a0f..0000000 --- a/src/wide/i32x4_t.rs +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -use super::f32x4; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4(pub __m128i); - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - use core::arch::wasm32::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4(pub v128); - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - use core::arch::aarch64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4(pub int32x4_t); - } else { - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct i32x4([i32; 4]); - } -} - -unsafe impl bytemuck::Zeroable for i32x4 {} -unsafe impl bytemuck::Pod for i32x4 {} - -impl Default for i32x4 { - fn default() -> Self { - Self::splat(0) - } -} - -impl i32x4 { - pub fn splat(n: i32) -> Self { - cast([n, n, n, n]) - } - - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self(unsafe { _mm_blendv_epi8(f.0, t.0, self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_bitselect(t.0, f.0, self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vbslq_s32(cast(self.0), t.0, f.0) }) - } else { - super::generic_bit_blend(self, t, f) - } - } - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cmpeq_epi32(self.0, rhs.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_eq(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vceqq_s32(self.0, rhs.0)) }) - } else { - Self([ - if self.0[0] == rhs.0[0] { -1 } else { 0 }, - if self.0[1] == rhs.0[1] { -1 } else { 0 }, - if self.0[2] == rhs.0[2] { -1 } else { 0 }, - if self.0[3] == rhs.0[3] { -1 } else { 0 }, - ]) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cmpgt_epi32(self.0, rhs.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_gt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vcgtq_s32(self.0, rhs.0)) }) - } else { - Self([ - if self.0[0] > rhs.0[0] { -1 } else { 0 }, - if self.0[1] > rhs.0[1] { -1 } else { 0 }, - if self.0[2] > rhs.0[2] { -1 } else { 0 }, - if self.0[3] > rhs.0[3] { -1 } else { 0 }, - ]) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cmplt_epi32(self.0, rhs.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_lt(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { cast(vcltq_s32(self.0, rhs.0)) }) - } else { - Self([ - if self.0[0] < rhs.0[0] { -1 } else { 0 }, - if self.0[1] < rhs.0[1] { -1 } else { 0 }, - if self.0[2] < rhs.0[2] { -1 } else { 0 }, - if self.0[3] < rhs.0[3] { -1 } else { 0 }, - ]) - } - } - } - - pub fn to_f32x4(self) -> f32x4 { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - cast(Self(cast(unsafe { _mm_cvtepi32_ps(self.0) }))) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - cast(Self(f32x4_convert_i32x4(self.0))) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - cast(Self(unsafe { cast(vcvtq_f32_s32(self.0)) })) - } else { - let arr: [i32; 4] = cast(self); - cast([ - arr[0] as f32, - arr[1] as f32, - arr[2] as f32, - arr[3] as f32, - ]) - } - } - } - - pub fn to_f32x4_bitcast(self) -> f32x4 { - bytemuck::cast(self) - } -} - -impl From<[i32; 4]> for i32x4 { - fn from(v: [i32; 4]) -> Self { - cast(v) - } -} - -impl From for [i32; 4] { - fn from(v: i32x4) -> Self { - cast(v) - } -} - -impl core::ops::Add for i32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_add_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_add(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vaddq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].wrapping_add(rhs.0[0]), - self.0[1].wrapping_add(rhs.0[1]), - self.0[2].wrapping_add(rhs.0[2]), - self.0[3].wrapping_add(rhs.0[3]), - ]) - } - } - } -} - -impl core::ops::BitAnd for i32x4 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_and_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_and(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vandq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] & rhs.0[0], - self.0[1] & rhs.0[1], - self.0[2] & rhs.0[2], - self.0[3] & rhs.0[3], - ]) - } - } - } -} - -impl core::ops::Mul for i32x4 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { - Self(unsafe { _mm_mullo_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(i32x4_mul(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmulq_s32(self.0, rhs.0) }) - } else { - // Cast is required, since we have to use scalar multiplication on SSE2. - let a: [i32; 4] = cast(self); - let b: [i32; 4] = cast(rhs); - Self(cast([ - a[0].wrapping_mul(b[0]), - a[1].wrapping_mul(b[1]), - a[2].wrapping_mul(b[2]), - a[3].wrapping_mul(b[3]), - ])) - } - } - } -} - -impl core::ops::BitOr for i32x4 { - type Output = Self; - - #[inline] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_or_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_or(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vorrq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] | rhs.0[0], - self.0[1] | rhs.0[1], - self.0[2] | rhs.0[2], - self.0[3] | rhs.0[3], - ]) - } - } - } -} - -impl core::ops::BitXor for i32x4 { - type Output = Self; - - #[inline] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_xor_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_xor(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { veorq_s32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] ^ rhs.0[0], - self.0[1] ^ rhs.0[1], - self.0[2] ^ rhs.0[2], - self.0[3] ^ rhs.0[3], - ]) - } - } - } -} diff --git a/src/wide/i32x8_t.rs b/src/wide/i32x8_t.rs deleted file mode 100644 index 52f9729..0000000 --- a/src/wide/i32x8_t.rs +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use bytemuck::cast; - -use super::{f32x8, u32x8}; - -cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct i32x8(__m256i); - } else { - use super::i32x4; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct i32x8(pub i32x4, pub i32x4); - } -} - -unsafe impl bytemuck::Zeroable for i32x8 {} -unsafe impl bytemuck::Pod for i32x8 {} - -impl Default for i32x8 { - fn default() -> Self { - Self::splat(0) - } -} - -impl i32x8 { - pub fn splat(n: i32) -> Self { - cast([n, n, n, n, n, n, n, n]) - } - - pub fn blend(self, t: Self, f: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_blendv_epi8(f.0, t.0, self.0) }) - } else { - Self(self.0.blend(t.0, f.0), self.1.blend(t.1, f.1)) - } - } - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_cmpeq_epi32(self.0, rhs.0) }) - } else { - Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1)) - } - } - } - - pub fn cmp_gt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_cmpgt_epi32(self.0, rhs.0) }) - } else { - Self(self.0.cmp_gt(rhs.0), self.1.cmp_gt(rhs.1)) - } - } - } - - pub fn cmp_lt(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - // There is no `_mm256_cmpLT_epi32`, therefore we have to use - // `_mm256_cmpGT_epi32` and then invert the result. - let v = unsafe { _mm256_cmpgt_epi32(self.0, rhs.0) }; - let all_bits = unsafe { _mm256_set1_epi16(-1) }; - Self(unsafe { _mm256_xor_si256(v, all_bits) }) - } else { - Self(self.0.cmp_lt(rhs.0), self.1.cmp_lt(rhs.1)) - } - } - } - - pub fn to_f32x8(self) -> f32x8 { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - cast(unsafe { _mm256_cvtepi32_ps(self.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "avx"))] { - cast([self.0.to_f32x4(), self.1.to_f32x4()]) - } else { - f32x8(self.0.to_f32x4(), self.1.to_f32x4()) - } - } - } - - pub fn to_u32x8_bitcast(self) -> u32x8 { - bytemuck::cast(self) - } - - pub fn to_f32x8_bitcast(self) -> f32x8 { - bytemuck::cast(self) - } -} - -impl From<[i32; 8]> for i32x8 { - fn from(v: [i32; 8]) -> Self { - cast(v) - } -} - -impl From for [i32; 8] { - fn from(v: i32x8) -> Self { - cast(v) - } -} - -impl core::ops::Add for i32x8 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_add_epi32(self.0, rhs.0) }) - } else { - Self(self.0 + rhs.0, self.1 + rhs.1) - } - } - } -} - -impl core::ops::BitAnd for i32x8 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_and_si256(self.0, rhs.0) }) - } else { - Self(self.0 & rhs.0, self.1 & rhs.1) - } - } - } -} - -impl core::ops::Mul for i32x8 { - type Output = Self; - - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_mullo_epi32(self.0, rhs.0) }) - } else { - Self(self.0 * rhs.0, self.1 * rhs.1) - } - } - } -} - -impl core::ops::BitOr for i32x8 { - type Output = Self; - - #[inline] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_or_si256(self.0, rhs.0) }) - } else { - Self(self.0 | rhs.0, self.1 | rhs.1) - } - } - } -} - -impl core::ops::BitXor for i32x8 { - type Output = Self; - - #[inline] - fn bitxor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_xor_si256(self.0, rhs.0) }) - } else { - Self(self.0 ^ rhs.0, self.1 ^ rhs.1) - } - } - } -} diff --git a/src/wide/mod.rs b/src/wide/mod.rs deleted file mode 100644 index c58c239..0000000 --- a/src/wide/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// This module was written from scratch, therefore there is no Google copyright. - -// f32x16, i32x16 and u32x16 are implemented as [Tx8; 2] and not as [T; 16]. -// This way we still can use some SIMD. -// -// We doesn't use #[inline] that much in this module. -// The compiler will inline most of the methods automatically. -// The only exception is U16x16, were we have to force inlining, -// otherwise the performance will be horrible. - -#![allow(non_camel_case_types)] - -mod f32x16_t; -mod f32x4_t; -mod f32x8_t; -mod i32x4_t; -mod i32x8_t; -mod u16x16_t; -mod u32x4_t; -mod u32x8_t; - -pub use f32x16_t::f32x16; -pub use f32x4_t::f32x4; -pub use f32x8_t::f32x8; -pub use i32x4_t::i32x4; -pub use i32x8_t::i32x8; -pub use tiny_skia_path::f32x2; -pub use u16x16_t::u16x16; -pub use u32x4_t::u32x4; -pub use u32x8_t::u32x8; - -#[allow(dead_code)] -#[inline] -pub fn generic_bit_blend(mask: T, y: T, n: T) -> T -where - T: Copy + core::ops::BitXor + core::ops::BitAnd, -{ - n ^ ((n ^ y) & mask) -} - -/// A faster and more forgiving f32 min/max implementation. 
-/// -/// Unlike std one, we do not care about NaN. -#[allow(dead_code)] -pub trait FasterMinMax { - fn faster_min(self, rhs: f32) -> f32; - fn faster_max(self, rhs: f32) -> f32; -} - -#[allow(dead_code)] -impl FasterMinMax for f32 { - fn faster_min(self, rhs: f32) -> f32 { - if rhs < self { - rhs - } else { - self - } - } - - fn faster_max(self, rhs: f32) -> f32 { - if self < rhs { - rhs - } else { - self - } - } -} diff --git a/src/wide/u16x16_t.rs b/src/wide/u16x16_t.rs deleted file mode 100644 index 5e1a464..0000000 --- a/src/wide/u16x16_t.rs +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// No need to use explicit 256bit AVX2 SIMD. -// `-C target-cpu=native` will autovectorize it better than us. -// Not even sure why explicit instructions are so slow... -// -// On ARM AArch64 we can actually get up to 2x performance boost by using SIMD. -// -// We also have to inline all the methods. They are pretty large, -// but without the inlining the performance is plummeting. - -#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] -use bytemuck::cast; -#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] -use core::arch::aarch64::uint16x8_t; - -#[allow(non_camel_case_types)] -#[derive(Copy, Clone, PartialEq, Default, Debug)] -pub struct u16x16(pub [u16; 16]); - -macro_rules! impl_u16x16_op { - ($a:expr, $op:ident, $b:expr) => { - u16x16([ - $a.0[0].$op($b.0[0]), - $a.0[1].$op($b.0[1]), - $a.0[2].$op($b.0[2]), - $a.0[3].$op($b.0[3]), - $a.0[4].$op($b.0[4]), - $a.0[5].$op($b.0[5]), - $a.0[6].$op($b.0[6]), - $a.0[7].$op($b.0[7]), - $a.0[8].$op($b.0[8]), - $a.0[9].$op($b.0[9]), - $a.0[10].$op($b.0[10]), - $a.0[11].$op($b.0[11]), - $a.0[12].$op($b.0[12]), - $a.0[13].$op($b.0[13]), - $a.0[14].$op($b.0[14]), - $a.0[15].$op($b.0[15]), - ]) - }; -} - -#[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] -macro_rules! impl_aarch64_call { - ($f:ident, $a:expr, $b:expr) => { - let a = $a.split(); - let b = $b.split(); - Self(bytemuck::cast([ - unsafe { core::arch::aarch64::$f(a.0, b.0) }, - unsafe { core::arch::aarch64::$f(a.1, b.1) }, - ])) - }; -} - -impl u16x16 { - #[inline] - pub fn splat(n: u16) -> Self { - Self([n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n]) - } - - #[inline] - pub fn as_slice(&self) -> &[u16; 16] { - &self.0 - } - - #[inline] - pub fn min(&self, rhs: &Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vminq_u16, self, rhs) - } else { - impl_u16x16_op!(self, min, rhs) - } - } - } - - #[inline] - pub fn max(&self, rhs: &Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vmaxq_u16, self, rhs) - } else { - impl_u16x16_op!(self, max, rhs) - } - } - } - - #[inline] - pub fn cmp_le(&self, rhs: &Self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vcleq_u16, self, rhs) - } else { - Self([ - if self.0[ 0] <= rhs.0[ 0] { !0 } else { 0 }, - if self.0[ 1] <= rhs.0[ 1] { !0 } else { 0 }, - if self.0[ 2] <= rhs.0[ 2] { !0 } else { 0 }, - if self.0[ 3] <= rhs.0[ 3] { !0 } else { 0 }, - if self.0[ 4] <= rhs.0[ 4] { !0 } else { 0 }, - if self.0[ 5] <= rhs.0[ 5] { !0 } else { 0 }, - if self.0[ 6] <= rhs.0[ 6] { !0 } else { 0 }, - if self.0[ 7] <= rhs.0[ 7] { !0 } else { 0 }, - if self.0[ 8] <= rhs.0[ 8] { !0 } else { 0 }, - if self.0[ 9] <= rhs.0[ 9] { !0 } else { 0 }, - if self.0[10] <= rhs.0[10] { !0 } else { 0 }, - if self.0[11] <= rhs.0[11] { !0 } else { 0 }, - if self.0[12] <= rhs.0[12] { !0 } else { 0 }, - if self.0[13] <= rhs.0[13] { !0 } else { 0 }, - if self.0[14] <= rhs.0[14] { !0 } else { 0 }, - if self.0[15] <= rhs.0[15] { !0 } else { 0 }, - ]) - } - } - } - - #[inline] - pub fn blend(self, t: Self, e: Self) -> Self { - (t & self) | (e & !self) - } - - #[inline] - #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] - pub fn split(self) -> (uint16x8_t, uint16x8_t) { - let pair: [uint16x8_t; 2] = cast(self.0); - (pair[0], pair[1]) - } -} - -impl core::ops::Add for u16x16 { - type Output = Self; - - #[inline] - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vaddq_u16, self, rhs) - } else { - impl_u16x16_op!(self, add, rhs) - } - } - } -} - -impl core::ops::Sub for u16x16 { - type Output = Self; - - #[inline] - fn sub(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vsubq_u16, self, rhs) - } else { - impl_u16x16_op!(self, sub, rhs) - } - } - } -} - -impl core::ops::Mul for u16x16 { - type Output = Self; - - #[inline] - fn mul(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vmulq_u16, self, rhs) - } else { - impl_u16x16_op!(self, mul, rhs) - } - } - } -} - -impl core::ops::Div for u16x16 { - type Output = Self; - - #[inline] - fn div(self, rhs: Self) -> Self::Output { - impl_u16x16_op!(self, div, rhs) - } -} - -impl core::ops::BitAnd for u16x16 { - type Output = Self; - - #[inline] - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vandq_u16, self, rhs) - } else { - impl_u16x16_op!(self, bitand, rhs) - } - } - } -} - -impl core::ops::BitOr for u16x16 { - type Output = Self; - - #[inline] - fn bitor(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - impl_aarch64_call!(vorrq_u16, self, rhs) - } else { - impl_u16x16_op!(self, bitor, rhs) - } - } - } -} - -impl core::ops::Not for u16x16 { - type Output = Self; - - #[inline] - fn not(self) -> Self::Output { - u16x16([ - !self.0[0], - !self.0[1], - !self.0[2], - !self.0[3], - !self.0[4], - !self.0[5], - !self.0[6], - !self.0[7], - !self.0[8], - !self.0[9], - !self.0[10], - !self.0[11], - !self.0[12], - !self.0[13], - !self.0[14], - !self.0[15], - ]) - } -} - -impl core::ops::Shr for u16x16 { - type Output = Self; - - #[inline] - fn shr(self, rhs: Self) -> Self::Output { - impl_u16x16_op!(self, shr, rhs) - } -} diff --git a/src/wide/u32x4_t.rs b/src/wide/u32x4_t.rs deleted file mode 100644 index 27d78cb..0000000 --- a/src/wide/u32x4_t.rs +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - use bytemuck::cast; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4(__m128i); - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - use core::arch::wasm32::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4(v128); - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - use core::arch::aarch64::*; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4(uint32x4_t); - } else { - #[derive(Clone, Copy, Debug)] - #[repr(C, align(16))] - pub struct u32x4([u32; 4]); - } -} - -unsafe impl bytemuck::Zeroable for u32x4 {} -unsafe impl bytemuck::Pod for u32x4 {} - -impl Default for u32x4 { - fn default() -> Self { - Self::splat(0) - } -} - -impl u32x4 { - pub fn splat(n: u32) -> Self { - bytemuck::cast([n, n, n, n]) - } - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_cmpeq_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_eq(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vceqq_u32(self.0, rhs.0) }) - } else { - Self([ - if self.0[0] == rhs.0[0] { u32::MAX } else { 0 }, - if self.0[1] == rhs.0[1] { u32::MAX } else { 0 }, - if self.0[2] == rhs.0[2] { u32::MAX } else { 0 }, - if self.0[3] == rhs.0[3] { u32::MAX } else { 0 }, - ]) - } - } - } - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - pub fn shl(self) -> Self { - cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - let shift = cast([RHS as u64, 0]); - Self(unsafe { _mm_sll_epi32(self.0, shift) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_shl(self.0, RHS as _)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vshlq_n_u32::(self.0) }) - } else { - let u = RHS as u64; - Self([ - self.0[0] << u, - self.0[1] << u, - self.0[2] << u, - self.0[3] << u, - ]) - } - } - } - - // unused when AVX is available - #[cfg(not(target_feature = "avx2"))] - pub fn shr(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - let shift: __m128i = cast([RHS as u64, 0]); - Self(unsafe { _mm_srl_epi32(self.0, shift) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_shr(self.0, RHS as _)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vshrq_n_u32::(self.0) }) - } else { - let u = RHS as u64; - Self([ - self.0[0] >> u, - self.0[1] >> u, - self.0[2] >> u, - self.0[3] >> u, - ]) - } - } - } -} - -impl core::ops::Not for u32x4 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - let all_bits = unsafe { _mm_set1_epi32(-1) }; - Self(unsafe { _mm_xor_si128(self.0, all_bits) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_not(self.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vmvnq_u32(self.0) }) - } else { - Self([ - !self.0[0], - !self.0[1], - !self.0[2], - !self.0[3], - ]) - } - } - } -} - -impl core::ops::Add for u32x4 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_add_epi32(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(u32x4_add(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vaddq_u32(self.0, rhs.0) }) - } else { - Self([ - self.0[0].wrapping_add(rhs.0[0]), - self.0[1].wrapping_add(rhs.0[1]), - self.0[2].wrapping_add(rhs.0[2]), - self.0[3].wrapping_add(rhs.0[3]), - ]) - } - } - } -} - -impl core::ops::BitAnd for u32x4 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "sse2"))] { - Self(unsafe { _mm_and_si128(self.0, rhs.0) }) - } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { - Self(v128_and(self.0, rhs.0)) - } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { - Self(unsafe { vandq_u32(self.0, rhs.0) }) - } else { - Self([ - self.0[0] & rhs.0[0], - self.0[1] & rhs.0[1], - self.0[2] & rhs.0[2], - self.0[3] & rhs.0[3], - ]) - } - } - } -} diff --git a/src/wide/u32x8_t.rs b/src/wide/u32x8_t.rs deleted file mode 100644 index b3791b5..0000000 --- a/src/wide/u32x8_t.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2020 Yevhenii Reizner -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Based on https://github.com/Lokathor/wide (Zlib) - -use super::{f32x8, i32x8}; - -cfg_if::cfg_if! 
{ - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - use bytemuck::cast; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct u32x8(__m256i); - } else { - use super::u32x4; - - #[derive(Clone, Copy, Debug)] - #[repr(C, align(32))] - pub struct u32x8(u32x4, u32x4); - } -} - -unsafe impl bytemuck::Zeroable for u32x8 {} -unsafe impl bytemuck::Pod for u32x8 {} - -impl Default for u32x8 { - fn default() -> Self { - Self::splat(0) - } -} - -impl u32x8 { - pub fn splat(n: u32) -> Self { - bytemuck::cast([n, n, n, n, n, n, n, n]) - } - - pub fn to_i32x8_bitcast(self) -> i32x8 { - bytemuck::cast(self) - } - - pub fn to_f32x8_bitcast(self) -> f32x8 { - bytemuck::cast(self) - } - - pub fn cmp_eq(self, rhs: Self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_cmpeq_epi32(self.0, rhs.0) }) - } else { - Self(self.0.cmp_eq(rhs.0), self.1.cmp_eq(rhs.1)) - } - } - } - - pub fn shl(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - let shift: __m128i = cast([RHS as u64, 0]); - Self(unsafe { _mm256_sll_epi32(self.0, shift) }) - } else { - Self(self.0.shl::(), self.1.shl::()) - } - } - } - - pub fn shr(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - let shift: __m128i = cast([RHS as u64, 0]); - Self(unsafe { _mm256_srl_epi32(self.0, shift) }) - } else { - Self(self.0.shr::(), self.1.shr::()) - } - } - } -} - -impl core::ops::Not for u32x8 { - type Output = Self; - - fn not(self) -> Self { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - let all_bits = unsafe { _mm256_set1_epi16(-1) }; - Self(unsafe { _mm256_xor_si256(self.0, all_bits) }) - } else { - Self(!self.0, !self.1) - } - } - } -} - -impl core::ops::Add for u32x8 { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - Self(unsafe { _mm256_add_epi32(self.0, rhs.0) }) - } else { - Self(self.0 + rhs.0, self.1 + rhs.1) - } - } - } -} - -impl core::ops::BitAnd for u32x8 { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self::Output { - cfg_if::cfg_if! 
{
-            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
-                Self(unsafe { _mm256_and_si256(self.0, rhs.0) })
-            } else {
-                Self(self.0 & rhs.0, self.1 & rhs.1)
-            }
-        }
-    }
-}
diff --git a/tests/images/canvas/draw-pixmap-opacity.png b/tests/images/canvas/draw-pixmap-opacity.png
index 8e01b5b..81572a3 100644
Binary files a/tests/images/canvas/draw-pixmap-opacity.png and b/tests/images/canvas/draw-pixmap-opacity.png differ
diff --git a/tests/images/gradients/three-stops-evenly-spaced-lq.png b/tests/images/gradients/three-stops-evenly-spaced-lq.png
index 6528379..2a2f546 100644
Binary files a/tests/images/gradients/three-stops-evenly-spaced-lq.png and b/tests/images/gradients/three-stops-evenly-spaced-lq.png differ
diff --git a/tests/images/gradients/two-stops-linear-pad-lq.png b/tests/images/gradients/two-stops-linear-pad-lq.png
index 41e0025..04e0659 100644
Binary files a/tests/images/gradients/two-stops-linear-pad-lq.png and b/tests/images/gradients/two-stops-linear-pad-lq.png differ
diff --git a/tests/images/gradients/two-stops-linear-reflect-lq.png b/tests/images/gradients/two-stops-linear-reflect-lq.png
index 347ebc1..c18e728 100644
Binary files a/tests/images/gradients/two-stops-linear-reflect-lq.png and b/tests/images/gradients/two-stops-linear-reflect-lq.png differ
diff --git a/tests/images/gradients/two-stops-linear-repeat-lq.png b/tests/images/gradients/two-stops-linear-repeat-lq.png
index 0f67e30..033fc59 100644
Binary files a/tests/images/gradients/two-stops-linear-repeat-lq.png and b/tests/images/gradients/two-stops-linear-repeat-lq.png differ
diff --git a/tests/images/gradients/two-stops-unevenly-spaced-lq.png b/tests/images/gradients/two-stops-unevenly-spaced-lq.png
index d1521ed..fc655f8 100644
Binary files a/tests/images/gradients/two-stops-unevenly-spaced-lq.png and b/tests/images/gradients/two-stops-unevenly-spaced-lq.png differ
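
Reviewer note on the removed src/wide module: every deleted type (f32x4, f32x8, i32x4, i32x8, u16x16, u32x4, u32x8) has the same shape. A cfg_if::cfg_if! block picks an intrinsic-backed representation (SSE/AVX, WASM simd128, or AArch64 Neon) when the matching target_feature is enabled, and otherwise falls back to a plain array with per-lane scalar code, so the public API is identical on every target. The sketch below is a minimal illustration of that dispatch pattern written for this note, not code from the crate: F32x4, add and to_array are placeholder names, only the x86-64/SSE2 branch and the scalar fallback are shown, and the crate's additional `feature = "simd"` gate is omitted. It assumes the cfg_if crate as a dependency, just like the deleted files.

// Minimal sketch (not tiny-skia code): one wide type, two backends.
use cfg_if::cfg_if;

cfg_if! {
    if #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] {
        use core::arch::x86_64::*;

        /// Four f32 lanes backed by an SSE2 register.
        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct F32x4(__m128);

        impl F32x4 {
            pub fn splat(n: f32) -> Self {
                Self(unsafe { _mm_set1_ps(n) })
            }

            pub fn add(self, rhs: Self) -> Self {
                Self(unsafe { _mm_add_ps(self.0, rhs.0) })
            }

            pub fn to_array(self) -> [f32; 4] {
                let mut out = [0.0f32; 4];
                unsafe { _mm_storeu_ps(out.as_mut_ptr(), self.0) };
                out
            }
        }
    } else {
        /// Scalar fallback: the same API over a plain array.
        #[derive(Clone, Copy, Debug)]
        #[repr(C, align(16))]
        pub struct F32x4([f32; 4]);

        impl F32x4 {
            pub fn splat(n: f32) -> Self {
                Self([n; 4])
            }

            pub fn add(self, rhs: Self) -> Self {
                Self([
                    self.0[0] + rhs.0[0],
                    self.0[1] + rhs.0[1],
                    self.0[2] + rhs.0[2],
                    self.0[3] + rhs.0[3],
                ])
            }

            pub fn to_array(self) -> [f32; 4] {
                self.0
            }
        }
    }
}

fn main() {
    // Same result on either backend.
    let v = F32x4::splat(1.5).add(F32x4::splat(2.0));
    assert_eq!(v.to_array(), [3.5; 4]);
}

The 8-lane types in the diff apply the same idea one level up: when AVX is unavailable, f32x8 is stored as a pair of f32x4, so the fallback path still benefits from whatever SIMD the narrower type provides.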
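
A second pattern runs through all of the deleted comparison and blend methods: a comparison returns a per-lane mask of all-ones (true) or all-zeros (false), hence the f32::from_bits(u32::MAX) fallbacks, and blend / generic_bit_blend then select between two inputs with pure bit arithmetic instead of branches. The toy below mirrors the generic_bit_blend helper from the removed src/wide/mod.rs on a single u32 lane; it is an illustration added for this note, not code taken from the crate.

// Toy, single-lane illustration of the mask-and-blend convention.
fn generic_bit_blend(mask: u32, t: u32, f: u32) -> u32 {
    // Picks `t` where mask bits are set, `f` elsewhere:
    // branch-free form of (t & mask) | (f & !mask).
    f ^ ((f ^ t) & mask)
}

fn cmp_gt(a: f32, b: f32) -> u32 {
    // One "lane" of a comparison result: all bits set when true, zero when false.
    if a > b { u32::MAX } else { 0 }
}

fn main() {
    assert_eq!(generic_bit_blend(cmp_gt(2.5, 1.0), 0xAAAA_AAAA, 0x5555_5555), 0xAAAA_AAAA);
    assert_eq!(generic_bit_blend(cmp_gt(0.5, 1.0), 0xAAAA_AAAA, 0x5555_5555), 0x5555_5555);
}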