Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions src/Math/Interpolate.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/* Copyright Jukka Jylänki

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

/** @file Interpolate.h
@author Jukka Jylänki
@brief Easing functions that remap an interpolation parameter t in [0,1] to [0,1]. */
#pragma once

#include "assume.h"
#include "MathFunc.h"
#include "MathConstants.h"
#include "MathNamespace.h"

MATH_BEGIN_NAMESPACE

/// Quadratic ease-in (a.k.a. EaseIn / EaseInQuad): maps [0,1] -> [0,1], leaving 0 gently
/// and arriving at 1 abruptly.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return t squared.
inline float SmoothStart(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	return squared;
}

/// Cubic ease-in: same idea as SmoothStart(), with an even gentler start and a harsher stop.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return t cubed.
inline float SmoothStart3(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	return squared * t;
}

/// Quartic ease-in: t^4, computed as (t^2)^2.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return t raised to the fourth power.
inline float SmoothStart4(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	return squared * squared;
}

/// Quintic ease-in: t^5, computed as (t^2)^2 * t.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return t raised to the fifth power.
inline float SmoothStart5(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	const float quartic = squared * squared;
	return quartic * t;
}

/// Quadratic ease-out: leaves (0,0) abruptly and settles gently into (1,1).
/// This is SmoothStart() mirrored: 1 - (1-t)^2.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return 1 - (1-t)^2.
inline float SmoothStop(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float remaining = 1.f - t; // Distance left to the end of the interval.
	return 1.f - remaining*remaining;
}

/// Cubic ease-out: 1 - (1-t)^3.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return 1 - (1-t)^3.
inline float SmoothStop3(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float remaining = 1.f - t; // Distance left to the end of the interval.
	return 1.f - remaining*remaining*remaining;
}

/// Quartic ease-out: 1 - (1-t)^4, with (1-t)^4 computed as ((1-t)^2)^2.
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return 1 - (1-t)^4.
inline float SmoothStop4(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float remaining = 1.f - t; // Distance left to the end of the interval.
	const float remainingSq = remaining * remaining;
	return 1.f - remainingSq*remainingSq;
}

/// Quintic ease-out: 1 - (1-t)^5, with (1-t)^5 computed as ((1-t)^2)^2 * (1-t).
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return 1 - (1-t)^5.
inline float SmoothStop5(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float remaining = 1.f - t; // Distance left to the end of the interval.
	const float remainingSq = remaining * remaining;
	return 1.f - remainingSq*remainingSq*remaining;
}

/// Linear blend that begins as SmoothStop() and ends as SmoothStart().
/// Derivation:
///	  t * t^2 + (1-t) * (1 - (1-t)^2)
///	= 2t^3 - 3t^2 + 2t
///	= t * (2t^2 + 2 - 3t)
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return The value of 2t^3 - 3t^2 + 2t.
inline float SharpStep(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	// Factored form of the polynomial; one multiplication fewer than the expanded form.
	return t * (2.f*squared + 2.f - 3.f*t);
}

/// Linear blend that begins as SmoothStart() and ends as SmoothStop().
/// Also known as "cubic Hermite interpolation".
/// Derivation:
///	  (1-t) * SmoothStart(t) + t * SmoothStop(t)
///	= (1-t) * t^2 + t * (1 - (1-t)^2)
///	= 3t^2 - 2t^3
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return The value of 3t^2 - 2t^3.
inline float SmoothStep(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	return 3.f*squared - 2.f*squared*t;
}

// N.b. it is possible to define higher order linear
// blends, like

// (1-t) * SmoothStart4(t) + t * SmoothStop4(t)
// = -2t^5+5t^4-6t^3+4t^2

// (1-t) * SmoothStart5(t) + t * SmoothStop5(t)
// = -4t^5+10t^4-10t^3+5t^2

// Nth order blend: (1-t) * t^n + t * (1-(1-t)^n)

// and so on. As the exponent grows, the function becomes
// "sharper" in the beginning and the end.

/// Fifth-order smooth step: 6t^5 - 15t^4 + 10t^3, evaluated as t^3 * (6t^2 - 15t + 10).
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return The value of 6t^5 - 15t^4 + 10t^3.
inline float SmoothStep5(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	const float cubed = squared * t;
	return cubed * (6.f*squared - 15.f*t + 10.f);
}

/// Seventh-order smooth step: -20t^7 + 70t^6 - 84t^5 + 35t^4,
/// evaluated as t^4 * (-20t^3 + 70t^2 - 84t + 35).
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return The value of -20t^7 + 70t^6 - 84t^5 + 35t^4.
inline float SmoothStep7(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float squared = t * t;
	const float quartic = squared * squared;
	return quartic * (-20.f*squared*t + 70.f*squared - 84.f*t + 35.f);
}

/// Cosine-shaped step: 0.5 - 0.5 * Cos(t * pi).
/// @param t Interpolation parameter. Must already lie in [0,1]; this function does not clamp
///	(pair it with Clamp01() from MathFunc.h if needed).
/// @return 0.5 - Cos(t*pi) * 0.5.
inline float CosineStep01(float t)
{
	// Range is only checked, never corrected, to keep the hot path cheap.
	assume1(t >= 0.f && t <= 1.f, t);
	const float cosine = Cos(t*pi);
	return 0.5f - cosine * 0.5f;
}

MATH_END_NAMESPACE
2 changes: 1 addition & 1 deletion src/Math/MathFunc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ float Step(float y, float x)
return (x >= y) ? 1.f : 0.f;
}

float SmoothStep(float min, float max, float x)
float Ramp(float min, float max, float x)
{
return x <= min ? 0.f : (x >= max ? 1.f : (x - min) / (max - min));
}
Expand Down
2 changes: 1 addition & 1 deletion src/Math/MathFunc.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ float InvLerp(float a, float b, float x);
float Step(float y, float x);
/// See http://msdn.microsoft.com/en-us/library/bb509658(v=vs.85).aspx
/** @see Lerp(), LerpMod(), InvLerp(), Step(), PingPongMod(), Mod(), ModPos(), Frac(). */
float SmoothStep(float min, float max, float x);
float Ramp(float min, float max, float x);
/// Limits x to the range [0, mod], but instead of wrapping around from mod to 0, the result will move back
/// from mod to 0 as x goes from mod to 2*mod.
/** @see Lerp(), LerpMod(), InvLerp(), Step(), SmoothStep(), Mod(), ModPos(), Frac(). */
Expand Down
2 changes: 1 addition & 1 deletion src/Math/Quat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ Quat MUST_USE_RESULT Quat::Slerp(const Quat &q2, float t) const
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
simd4f angle = dot4_ps(q, q2.q); // <q, q2.q>
simd4f neg = cmplt_ps(angle, zero_ps()); // angle < 0?
neg = and_ps(neg, set1_ps_hex(0x80000000)); // Convert 0/0xFFFFFFFF mask to a 0x/0x80000000 mask.
neg = and_ps(neg, set1_ps(-0.0f)); // Convert 0/0xFFFFFFFF mask to a 0x/0x80000000 mask.
// neg = s4i_to_s4f(_mm_slli_epi32(s4f_to_s4i(neg), 31)); // A SSE2-esque way to achieve the above would be this, but this seems to clock slower (12.04 clocks vs 11.97 clocks)
angle = xor_ps(angle, neg); // if angle was negative, make it positive.
simd4f one = set1_ps(1.f);
Expand Down
12 changes: 6 additions & 6 deletions src/Math/quat_simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ MATH_BEGIN_NAMESPACE
inline void quat_to_mat3x4(simd4f q, simd4f t, simd4f *m)
{
simd4f one = set_ps(0, 0, 0, 1);
const simd4f sseX1 = set_ps_hex((int)0x80000000UL, (int)0x80000000UL, 0, (int)0x80000000UL); // [-, -, + -]
const simd4f sseX1 = set_ps(-0.0f, -0.0f, 0, -0.0f); // [-, -, + -]
simd4f q2 = add_ps(q, q); // [2w 2z 2y 2x]
simd4f t2 = _mm_add_ss(xor_ps(mul_ps(zwww_ps(q), zzyx_ps(q2)), sseX1), one); // [-2xw -2yw 2zw 1-2zz]
const simd4f sseX0 = yzwx_ps(sseX1); // [-, -, -, +]
Expand Down Expand Up @@ -119,7 +119,7 @@ FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2)
x*r.y - y*r.x + z*r.w + w*r.z,
-x*r.x - y*r.y - z*r.z + w*r.w); */
#if defined(MATH_SSE)
const simd4f signy = set_ps_hex(0x80000000u, 0x80000000u, 0, 0); // [- - + +]
const simd4f signy = set_ps(-0.0f, -0.0f, 0, 0); // [- - + +]
const simd4f signz = wxxw_ps(signy); // [- + + -]

simd4f X = xxxx_ps(q1);
Expand Down Expand Up @@ -148,9 +148,9 @@ FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2)
quat_mul_quat_asm(&q1, &q2, &ret);
return ret;
#elif defined(MATH_NEON)
static const float32x4_t signx = set_ps_hex_const(0x80000000u, 0, 0x80000000u, 0);
static const float32x4_t signy = set_ps_hex_const(0x80000000u, 0x80000000u, 0, 0);
static const float32x4_t signz = set_ps_hex_const(0x80000000u, 0, 0, 0x80000000u);
static const float32x4_t signx = set_ps(-0.0f, 0, -0.0f, 0);
static const float32x4_t signy = set_ps(-0.0f, -0.0f, 0, 0);
static const float32x4_t signz = set_ps(-0.0f, 0, 0, -0.0f);

const float32_t *q1f = (const float32_t *)&q1;
float32x4_t X = xor_ps(signx, vdupq_n_f32(q1f[0]));
Expand Down Expand Up @@ -178,7 +178,7 @@ FORCE_INLINE simd4f quat_div_quat(simd4f q1, simd4f q2)
-x*r.y + y*r.x + z*r.w - w*r.z,
x*r.x + y*r.y + z*r.z + w*r.w); */

const simd4f signx = set_ps_hex(0x80000000u, 0, 0x80000000u, 0); // [- + - +]
const simd4f signx = set_ps(-0.0f, 0, -0.0f, 0); // [- + - +]
const simd4f signy = xxww_ps(signx); // [- - + +]
const simd4f signz = wxxw_ps(signx); // [- + + -]

Expand Down
50 changes: 31 additions & 19 deletions tests/NEONTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "TestRunner.h"
#include "TestData.h"
#include "../src/Math/SSEMath.h"
#include "../src/Math/simd.h"
#include "../src/Math/float4x4_sse.h"
#include "../src/Math/float4_neon.h"

Expand All @@ -18,28 +19,18 @@ BENCHMARK(float4_op_add, "float4 + float4")
}
BENCHMARK_END;

#ifdef MATH_NEON

BENCHMARK(rsqrtq, "neon")
{
float32x4_t r = v[i];
float32x4_t rcp = vrsqrteq_f32(r);
float32x4_t ret = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rcp, rcp), r), rcp);
v3[i] = ret;
}
BENCHMARK_END

BENCHMARK(rsqrt, "neon")
UNIQUE_TEST(set_ps_neg_zero)
{
float32x4_t r = v[i];
float32x2_t rcp = vrsqrte_f32(vget_low_f32(r));
float32x2_t hi = vget_high_f32(r);
float32x2_t ret = vmul_f32(vrsqrts_f32(vmul_f32(rcp, rcp), vget_low_f32(r)), rcp);
v3[i] = vcombine_f32(ret, hi);
simd4f constant = set1_ps(-0.f);
u32 arr[4];
memcpy(arr, &constant, sizeof(arr));
asserteq(arr[0], 0x80000000u);
asserteq(arr[1], 0x80000000u);
asserteq(arr[2], 0x80000000u);
asserteq(arr[3], 0x80000000u);
}
BENCHMARK_END

UNIQUE_TEST(set_ps_const)
UNIQUE_TEST(set_ps_const_vec)
{
simd4f constant = set_ps_const(4.f, 3.f, 2.f, 1.f);
float arr[4];
Expand All @@ -61,6 +52,27 @@ UNIQUE_TEST(set_ps_const_hex)
asserteq(arr[3], -0.0f);
}

#ifdef MATH_NEON

// Benchmarks a reciprocal-square-root computation on a full 128-bit NEON vector:
// a vrsqrteq_f32 estimate followed by a single vrsqrtsq_f32 step.
// NOTE(review): v, v3 and i are not declared in this block; presumably they are provided by the
// BENCHMARK macro and the shared test data - confirm against TestRunner.h / TestData.h.
BENCHMARK(rsqrtq, "neon")
{
float32x4_t r = v[i];
// Initial estimate for all four lanes of r.
float32x4_t rcp = vrsqrteq_f32(r);
// Multiply the estimate by the step factor that vrsqrtsq_f32 derives from (rcp*rcp, r).
float32x4_t ret = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rcp, rcp), r), rcp);
v3[i] = ret;
}
BENCHMARK_END

// Benchmarks the 64-bit (two-lane) variant of the reciprocal-square-root sequence: only the low
// half of the input vector is processed, and the high half is passed through unchanged.
// NOTE(review): v, v3 and i are not declared in this block; presumably they are provided by the
// BENCHMARK macro and the shared test data - confirm against TestRunner.h / TestData.h.
BENCHMARK(rsqrt, "neon")
{
float32x4_t r = v[i];
// Initial estimate for the two low lanes of r.
float32x2_t rcp = vrsqrte_f32(vget_low_f32(r));
// Keep the two high lanes as they are.
float32x2_t hi = vget_high_f32(r);
// Multiply the estimate by the step factor that vrsqrts_f32 derives from (rcp*rcp, low half of r).
float32x2_t ret = vmul_f32(vrsqrts_f32(vmul_f32(rcp, rcp), vget_low_f32(r)), rcp);
// Reassemble a four-lane vector: processed low half, untouched high half.
v3[i] = vcombine_f32(ret, hi);
}
BENCHMARK_END

#ifdef ANDROID

FORCE_INLINE void inline_asm_add(void *v1, void *v2, void *out)
Expand Down