juj · juj · May 15, 2020 · May 16, 2020 · May 16, 2020
diff --git a/src/Math/Interpolate.h b/src/Math/Interpolate.h
@@ -0,0 +1,148 @@
+/* Copyright Jukka Jylänki
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/** @file Interpolate.h
+	@author Jukka Jylänki
+	@brief */
+#pragma once
+
+#include "assume.h"
+#include "MathFunc.h"
+#include "MathConstants.h"
+#include "MathNamespace.h"
+
+MATH_BEGIN_NAMESPACE
+
+// Interpolates [0,1]->[0,1] in a way that starts smoothly, but stops sharply.
+// Sometimes also referred to as EaseIn or EaseInQuad.
+inline float SmoothStart(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  return t*t;
+}
+
+// Like SmoothStart, but even smoother start (and sharper stop)
+inline float SmoothStart3(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  return t*t*t;
+}
+
+inline float SmoothStart4(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  float tt = t*t;
+  return tt*tt;
+}
+
+inline float SmoothStart5(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  float tt = t*t;
+  return tt*tt*t;
+}
+
+// Starts sharply at (0,0), but stops smoothly to (1,1). I.e. reverse of SmoothStart2.
+inline float SmoothStop(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  float oneT = 1.f - t;
+  return 1.f - oneT*oneT;
+}
+
+inline float SmoothStop3(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  float oneT = 1.f - t;
+  return 1.f - oneT*oneT*oneT;
+}
+
+inline float SmoothStop4(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  float oneT = 1.f - t;
+  oneT *= oneT;
+  return 1.f - oneT*oneT;
+}
+
+inline float SmoothStop5(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  float oneT = 1.f - t;
+  float oneT2 = oneT * oneT;
+  return 1.f - oneT2*oneT2*oneT;
+}
+
+// Starts out as SmoothStop, and linearly blends to SmoothStart
+inline float SharpStep(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  //   t * t^2 + (1-t)*(1 - (1-t)^2)
+  // = 2t^3 - 3t^2 + 2t
+  // = t(2t^2 + 2 - 3t)
+  float tt = t*t;
+  return t*(2.f * tt + 2.f - 3.f * t);
+}
+
+// Starts out as SmoothStart, and linearly blends to SmoothStop.
+// Also called "cubic Hermite interpolation"
+inline float SmoothStep(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  //   (1-t) * SmoothStart(t) + t * SmoothStop(t)
+  // = (1-t) * t^2 + t*(1 - (1-t)^2)
+  // = 3t^2 - 2t^3
+
+  float tt = t*t;
+  return 3.f*tt - 2.f*tt*t;
+}
+
+// N.b. it is possible to define higher order linear
+// blends, like 
+
+//   (1-t) * SmoothStart4(t) + t * SmoothStop4(t)
+//  = -2t^5+5t^4-6t^3+4t^2
+
+//   (1-t) * SmoothStart5(t) + t * SmoothStop5(t)
+//  = -4t^5+10t^4-10t^3+5t^2
+
+//  Nth order blend: (1-t) * t^n + t * (1-(1-t)^n)
+
+// and so on. As the exponent grows, the function becomes
+// "sharper" in the beginning and the end.
+
+inline float SmoothStep5(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  // 6t^5 -15t^4 + 10t^3
+  float tt = t*t;
+  return tt*t*(6.f*tt - 15.f*t + 10.f);
+}
+
+inline float SmoothStep7(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  // -20t^7 + 70t^6 - 84t^5 + 35t^4
+  float tt = t*t;
+  float tttt = tt*tt;
+  return tttt*(-20.f*tt*t + 70.f*tt - 84.f*t + 35.f);
+}
+
+inline float CosineStep01(float t)
+{
+  assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h)
+  return 0.5f - Cos(t*pi) * 0.5f;
+}
+
+MATH_END_NAMESPACE
diff --git a/src/Math/MathFunc.cpp b/src/Math/MathFunc.cpp
@@ -499,7 +499,7 @@ float Step(float y, float x)
 	return (x >= y) ? 1.f : 0.f;
 }
 
-float SmoothStep(float min, float max, float x)
+float Ramp(float min, float max, float x)
 {
 	return x <= min ? 0.f : (x >= max ? 1.f : (x - min) / (max - min));
 }

diff --git a/src/Math/MathFunc.h b/src/Math/MathFunc.h
@@ -189,7 +189,7 @@ float InvLerp(float a, float b, float x);
 float Step(float y, float x);
 /// See http://msdn.microsoft.com/en-us/library/bb509658(v=vs.85).aspx
 /** @see Lerp(), LerpMod(), InvLerp(), Step(), PingPongMod(), Mod(), ModPos(), Frac(). */
-float SmoothStep(float min, float max, float x);
+float Ramp(float min, float max, float x);
 /// Limits x to the range [0, mod], but instead of wrapping around from mod to 0, the result will move back
 /// from mod to 0 as x goes from mod to 2*mod.
 /** @see Lerp(), LerpMod(), InvLerp(), Step(), SmoothStep(), Mod(), ModPos(), Frac(). */

diff --git a/src/Math/Quat.cpp b/src/Math/Quat.cpp
@@ -303,7 +303,7 @@ Quat MUST_USE_RESULT Quat::Slerp(const Quat &q2, float t) const
 #if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
 	simd4f angle = dot4_ps(q, q2.q); // <q, q2.q>
 	simd4f neg = cmplt_ps(angle, zero_ps()); // angle < 0?
-	neg = and_ps(neg, set1_ps_hex(0x80000000)); // Convert 0/0xFFFFFFFF mask to a 0x/0x80000000 mask.
+	neg = and_ps(neg, set1_ps(-0.0f)); // Convert 0/0xFFFFFFFF mask to a 0x/0x80000000 mask.
 //	neg = s4i_to_s4f(_mm_slli_epi32(s4f_to_s4i(neg), 31)); // A SSE2-esque way to achieve the above would be this, but this seems to clock slower (12.04 clocks vs 11.97 clocks)
 	angle = xor_ps(angle, neg); // if angle was negative, make it positive.
 	simd4f one = set1_ps(1.f);

diff --git a/src/Math/quat_simd.h b/src/Math/quat_simd.h
@@ -15,7 +15,7 @@ MATH_BEGIN_NAMESPACE
 inline void quat_to_mat3x4(simd4f q, simd4f t, simd4f *m)
 {
 	simd4f one = set_ps(0, 0, 0, 1);
-	const simd4f sseX1 = set_ps_hex((int)0x80000000UL, (int)0x80000000UL, 0, (int)0x80000000UL); // [-, -, + -]
+	const simd4f sseX1 = set_ps(-0.0f, -0.0f, 0, -0.0f); // [-, -, + -]
 	simd4f q2 = add_ps(q, q);                                                     // [2w 2z 2y 2x]
 	simd4f t2 = _mm_add_ss(xor_ps(mul_ps(zwww_ps(q), zzyx_ps(q2)), sseX1), one);  // [-2xw -2yw  2zw 1-2zz]
 	const simd4f sseX0 = yzwx_ps(sseX1);                                          // [-, -, -, +]
@@ -119,7 +119,7 @@ FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2)
 	            x*r.y - y*r.x + z*r.w + w*r.z,
 	           -x*r.x - y*r.y - z*r.z + w*r.w); */
 #if defined(MATH_SSE)
-	const simd4f signy = set_ps_hex(0x80000000u, 0x80000000u, 0, 0); // [- - + +]
+	const simd4f signy = set_ps(-0.0f, -0.0f, 0, 0); // [- - + +]
 	const simd4f signz = wxxw_ps(signy);   // [- + + -]
 
 	simd4f X = xxxx_ps(q1);
@@ -148,9 +148,9 @@ FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2)
 	quat_mul_quat_asm(&q1, &q2, &ret);
 	return ret;
 #elif defined(MATH_NEON)
-	static const float32x4_t signx = set_ps_hex_const(0x80000000u, 0, 0x80000000u, 0);
-	static const float32x4_t signy = set_ps_hex_const(0x80000000u, 0x80000000u, 0, 0);
-	static const float32x4_t signz = set_ps_hex_const(0x80000000u, 0, 0, 0x80000000u);
+	static const float32x4_t signx = set_ps(-0.0f, 0, -0.0f, 0);
+	static const float32x4_t signy = set_ps(-0.0f, -0.0f, 0, 0);
+	static const float32x4_t signz = set_ps(-0.0f, 0, 0, -0.0f);
 
 	const float32_t *q1f = (const float32_t *)&q1;
 	float32x4_t X = xor_ps(signx, vdupq_n_f32(q1f[0]));
@@ -178,7 +178,7 @@ FORCE_INLINE simd4f quat_div_quat(simd4f q1, simd4f q2)
 	           -x*r.y + y*r.x + z*r.w - w*r.z,
 	            x*r.x + y*r.y + z*r.z + w*r.w); */
 
-	const simd4f signx = set_ps_hex(0x80000000u, 0, 0x80000000u, 0); // [- + - +]
+	const simd4f signx = set_ps(-0.0f, 0, -0.0f, 0); // [- + - +]
 	const simd4f signy = xxww_ps(signx);   // [- - + +]
 	const simd4f signz = wxxw_ps(signx);   // [- + + -]
 

diff --git a/tests/NEONTests.cpp b/tests/NEONTests.cpp
@@ -7,6 +7,7 @@
 #include "TestRunner.h"
 #include "TestData.h"
 #include "../src/Math/SSEMath.h"
+#include "../src/Math/simd.h"
 #include "../src/Math/float4x4_sse.h"
 #include "../src/Math/float4_neon.h"
 
@@ -18,28 +19,18 @@ BENCHMARK(float4_op_add, "float4 + float4")
 }
 BENCHMARK_END;
 
-#ifdef MATH_NEON
-
-BENCHMARK(rsqrtq, "neon")
-{
-	float32x4_t r = v[i];
-	float32x4_t rcp = vrsqrteq_f32(r);
-	float32x4_t ret = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rcp, rcp), r), rcp);
-	v3[i] = ret;
-}
-BENCHMARK_END
-
-BENCHMARK(rsqrt, "neon")
+UNIQUE_TEST(set_ps_neg_zero)
 {
-	float32x4_t r = v[i];
-	float32x2_t rcp = vrsqrte_f32(vget_low_f32(r));
-	float32x2_t hi = vget_high_f32(r);
-	float32x2_t ret = vmul_f32(vrsqrts_f32(vmul_f32(rcp, rcp), vget_low_f32(r)), rcp);
-	v3[i] = vcombine_f32(ret, hi);
+	simd4f constant = set1_ps(-0.f);
+	u32 arr[4];
+	memcpy(arr, &constant, sizeof(arr));
+	asserteq(arr[0], 0x80000000u);
+	asserteq(arr[1], 0x80000000u);
+	asserteq(arr[2], 0x80000000u);
+	asserteq(arr[3], 0x80000000u);
 }
-BENCHMARK_END
 
-UNIQUE_TEST(set_ps_const)
+UNIQUE_TEST(set_ps_const_vec)
 {
 	simd4f constant = set_ps_const(4.f, 3.f, 2.f, 1.f);
 	float arr[4];
@@ -61,6 +52,27 @@ UNIQUE_TEST(set_ps_const_hex)
 	asserteq(arr[3], -0.0f);
 }
 
+#ifdef MATH_NEON
+
+BENCHMARK(rsqrtq, "neon")
+{
+	float32x4_t r = v[i];
+	float32x4_t rcp = vrsqrteq_f32(r);
+	float32x4_t ret = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rcp, rcp), r), rcp);
+	v3[i] = ret;
+}
+BENCHMARK_END
+
+BENCHMARK(rsqrt, "neon")
+{
+	float32x4_t r = v[i];
+	float32x2_t rcp = vrsqrte_f32(vget_low_f32(r));
+	float32x2_t hi = vget_high_f32(r);
+	float32x2_t ret = vmul_f32(vrsqrts_f32(vmul_f32(rcp, rcp), vget_low_f32(r)), rcp);
+	v3[i] = vcombine_f32(ret, hi);
+}
+BENCHMARK_END
+
 #ifdef ANDROID
 
 FORCE_INLINE void inline_asm_add(void *v1, void *v2, void *out)