flesh out spherical isometries

MikaelSlevinsky · Aug 20, 2020 · 278dcd8 · 278dcd8
1 parent 6373913
commit 278dcd8
Show file tree

Hide file tree

Showing 18 changed files with 962 additions and 55 deletions.
diff --git a/Make.inc b/Make.inc
@@ -84,7 +84,7 @@ ASM = src/recurrence/recurrence_default.s \
       src/rotations/rotations_AVX512F.s \
       src/rotations/rotations_NEON.s
 
-SRC = src/recurrence.c src/transforms.c src/rotations.c src/permute.c src/tdc.c src/drivers.c src/fftw.c
+SRC = src/recurrence.c src/transforms.c src/rotations.c src/permute.c src/tdc.c src/drivers.c src/fftw.c src/isometries.c
 
 machine := $(shell $(CC) -dumpmachine | cut -d'-' -f1)
 

diff --git a/Makefile b/Makefile
@@ -44,6 +44,7 @@ tests:
 	$(CC) src/ftutilities.c test/test_tdc.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_tdc
 	$(CC) src/ftutilities.c test/test_drivers.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_drivers
 	$(CC) src/ftutilities.c test/test_fftw.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_fftw
+	$(CC) src/ftutilities.c test/test_isometries.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_isometries
 
 examples:
 	$(CC) src/ftutilities.c examples/additiontheorem.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o additiontheorem
@@ -64,6 +65,7 @@ runtests:
 	./test_tdc
 	OMP_NUM_THREADS=$(FT_NUM_THREADS) ./test_drivers 1 3 0
 	OMP_NUM_THREADS=$(FT_NUM_THREADS) ./test_fftw 1 3 0
+	OMP_NUM_THREADS=$(FT_NUM_THREADS) ./test_isometries 1 3
 
 runexamples:
 	./additiontheorem

diff --git a/src/fasttransforms.h b/src/fasttransforms.h
@@ -34,11 +34,13 @@ void ft_clenshawf(const int n, const float * c, const int incc, const int m, flo
 void ft_orthogonal_polynomial_clenshaw(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, double * phi0, double * f);
 void ft_orthogonal_polynomial_clenshawf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, float * phi0, float * f);
 
-void ft_orthogonal_polynomial_clenshawl(const int n, const long double * c, const int incc, const long double * A, const long double * B, const long double * C, const int m, long double * x, long double * phi0, long double * f);
+void ft_eigen_eval(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, const int sign, double * f);
+void ft_eigen_evalf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, const int sign, float * f);
+void ft_eigen_evall(const int n, const long double * c, const int incc, const long double * A, const long double * B, const long double * C, const int m, long double * x, const int sign, long double * f);
 #if defined(FT_QUADMATH)
     #include <quadmath.h>
     typedef __float128 quadruple;
-    void ft_orthogonal_polynomial_clenshawq(const int n, const quadruple * c, const int incc, const quadruple * A, const quadruple * B, const quadruple * C, const int m, quadruple * x, quadruple * phi0, quadruple * f);
+    void ft_eigen_evalq(const int n, const quadruple * c, const int incc, const quadruple * A, const quadruple * B, const quadruple * C, const int m, quadruple * x, const int sign, quadruple * f);
 #endif
 
 #include "tdc.h"
@@ -498,4 +500,85 @@ void ft_execute_spinsph_synthesis(const ft_spinsphere_fftw_plan * P, ft_complex
 /// Execute FFTW analysis on the sphere with spin.
 void ft_execute_spinsph_analysis(const ft_spinsphere_fftw_plan * P, ft_complex * X, const int N, const int M);
 
+/*!
+  \brief A static struct to store an orthogonal matrix \f$Q \in \mathbb{R}^{3\times3}\f$, such that \f$Q^\top Q = I\f$.
+  \f$Q\f$ has column-major storage.
+*/
+typedef struct {
+    double Q[9];
+} ft_orthogonal_transformation;
+
+/*!
+  \brief Every orthogonal matrix \f$Q \in \mathbb{R}^{3\times3}\f$ can be decomposed as a product of \f$ZYZ\f$ Euler angles and, if necessary, a reflection \f$R\f$ about the \f$xy\f$-plane.
+  \f[
+  Q = ZYZR,
+  \f]
+  where the \f$z\f$-axis rotations are:
+  \f[
+  Z = \begin{pmatrix} c & -s & 0\\ s & c & 0\\ 0 & 0 & 1\end{pmatrix},
+  \f]
+  the \f$y\f$-axis rotation is:
+  \f[
+  Y = \begin{pmatrix} c & 0 & -s\\ 0 & 1 & 0\\ s & 0 & c\end{pmatrix},
+  \f]
+  and the potential reflection is:
+  \f[
+  R = \begin{pmatrix} 1 & 0 & 0\\ 0 & 1 & 0\\ 0 & 0 & \pm 1\end{pmatrix}.
+  \f]
+  The reflection is stored as an integer `sign` corresponding to the bottom right entry.
+*/
+typedef struct {
+    double s[3];
+    double c[3];
+    int sign;
+} ft_ZYZR;
+
+/*!
+  \brief A static struct to store a reflection about the plane \f$w\cdot x = 0\f$ in \f$\mathbb{R}^3\f$.
+*/
+typedef struct {
+    double w[3];
+} ft_reflection;
+
+ft_ZYZR ft_create_ZYZR(ft_orthogonal_transformation Q);
+
+void ft_apply_ZYZR(ft_ZYZR Q, ft_orthogonal_transformation * U);
+
+void ft_apply_reflection(ft_reflection Q, ft_orthogonal_transformation * U);
+
+void ft_execute_sph_polar_rotation(double * A, const int N, const int M, double s, double c);
+
+void ft_execute_sph_polar_reflection(double * A, const int N, const int M);
+
+typedef struct {
+    ft_symmetric_tridiagonal_symmetric_eigen * F11;
+    ft_symmetric_tridiagonal_symmetric_eigen * F21;
+    ft_symmetric_tridiagonal_symmetric_eigen * F12;
+    ft_symmetric_tridiagonal_symmetric_eigen * F22;
+    int l;
+} ft_partial_ZY_axis_exchange_factorization;
+
+typedef struct {
+    ft_partial_ZY_axis_exchange_factorization ** F;
+    int n;
+} ft_ZY_axis_exchange_factorization;
+
+void ft_destroy_partial_ZY_axis_exchange_factorization(ft_partial_ZY_axis_exchange_factorization * F);
+
+void ft_destroy_ZY_axis_exchange_factorization(ft_ZY_axis_exchange_factorization * F);
+
+ft_partial_ZY_axis_exchange_factorization * ft_create_partial_ZY_axis_exchange_factorization(const int l);
+
+ft_ZY_axis_exchange_factorization * ft_create_ZY_axis_exchange_factorization(const int n);
+
+void ft_execute_sph_ZY_axis_exchange(ft_ZY_axis_exchange_factorization * J, double * A, const int N, const int M);
+
+void ft_execute_sph_isometry(ft_ZY_axis_exchange_factorization * J, ft_ZYZR Q, double * A, const int N, const int M);
+
+void ft_execute_sph_rotation(ft_ZY_axis_exchange_factorization * J, const double alpha, const double beta, const double gamma, double * A, const int N, const int M);
+
+void ft_execute_sph_reflection(ft_ZY_axis_exchange_factorization * J, ft_reflection W, double * A, const int N, const int M);
+
+void ft_execute_sph_orthogonal_transformation(ft_ZY_axis_exchange_factorization * J, ft_orthogonal_transformation Q, double * A, const int N, const int M);
+
 #endif // FASTTRANSFORMS_H
diff --git a/src/ftinternal.h b/src/ftinternal.h
@@ -179,6 +179,8 @@ typedef struct {
     #define vloadu8(v) ((double8) _mm512_loadu_pd(v))
     #define vstore8(u, v) (_mm512_store_pd(u, v))
     #define vstoreu8(u, v) (_mm512_storeu_pd(u, v))
+    #define vsqrt8(x) ((double8) _mm512_sqrt_pd(x))
+    #define vmovemask8(x) (((int) _mm256_movemask_pd((double4) _mm512_extractf64x4_pd(x, 0)))*((int) _mm256_movemask_pd((double4) _mm512_extractf64x4_pd(x, 1))))
     #define vfma8(a, b, c) ((double8) _mm512_fmadd_pd(a, b, c))
     #define vfms8(a, b, c) ((double8) _mm512_fmsub_pd(a, b, c))
     #define vfmas8(a, b, c) ((double8) _mm512_fmaddsub_pd(a, b, c))
@@ -188,6 +190,8 @@ typedef struct {
     #define vloadu16f(v) ((float16) _mm512_loadu_ps(v))
     #define vstore16f(u, v) (_mm512_store_ps(u, v))
     #define vstoreu16f(u, v) (_mm512_storeu_ps(u, v))
+    #define vsqrt16f(x) ((float16) _mm512_sqrt_ps(x))
+    #define vmovemask16f(x) (((int) _mm256_movemask_ps((float8) _mm512_extractf32x8_ps(x, 0)))*((int) _mm256_movemask_ps((float8) _mm512_extractf32x8_ps(x, 1))))
     #define vfma16f(a, b, c) ((float16) _mm512_fmadd_ps(a, b, c))
     #define vfms16f(a, b, c) ((float16) _mm512_fmsub_ps(a, b, c))
     #define vfmas16f(a, b, c) ((float16) _mm512_fmaddsub_ps(a, b, c))
@@ -203,13 +207,17 @@ typedef struct {
     #define vloadu4(v) ((double4) _mm256_loadu_pd(v))
     #define vstore4(u, v) (_mm256_store_pd(u, v))
     #define vstoreu4(u, v) (_mm256_storeu_pd(u, v))
+    #define vas4(a, b) ((double4) _mm256_addsub_pd(a, b))
+    #define vsqrt4(x) ((double4) _mm256_sqrt_pd(x))
+    #define vmovemask4(x) ((int) _mm256_movemask_pd(x))
     typedef float float8 __attribute__ ((vector_size (VECTOR_SIZE_4*8)));
     #define vall8f(x) ((float8) _mm256_set1_ps(x))
     #define vload8f(v) ((float8) _mm256_load_ps(v))
     #define vloadu8f(v) ((float8) _mm256_loadu_ps(v))
     #define vstore8f(u, v) (_mm256_store_ps(u, v))
     #define vstoreu8f(u, v) (_mm256_storeu_ps(u, v))
-    #define vas4(a, b) ((double4) _mm256_addsub_pd(a, b))
+    #define vsqrt8f(x) ((float8) _mm256_sqrt_ps(x))
+    #define vmovemask8f(x) ((int) _mm256_movemask_ps(x))
     #define vas8f(a, b) ((float8) _mm256_addsub_ps(a, b))
     #ifdef __FMA__
         #define vfma4(a, b, c) ((double4) _mm256_fmadd_pd(a, b, c))
@@ -231,6 +239,8 @@ typedef struct {
     #define vloadu2(v) ((double2) _mm_loadu_pd(v))
     #define vstore2(u, v) (_mm_store_pd(u, v))
     #define vstoreu2(u, v) (_mm_storeu_pd(u, v))
+    #define vsqrt2(x) ((double2) _mm_sqrt_pd(x))
+    #define vmovemask2(x) ((int) _mm_movemask_pd(x))
     #ifdef __FMA__
         #define vfma2(a, b, c) ((double2) _mm_fmadd_pd(a, b, c))
         #define vfms2(a, b, c) ((double2) _mm_fmsub_pd(a, b, c))
@@ -248,6 +258,8 @@ typedef struct {
     #define vloadu4f(v) ((float4) _mm_loadu_ps(v))
     #define vstore4f(u, v) (_mm_store_ps(u, v))
     #define vstoreu4f(u, v) (_mm_storeu_ps(u, v))
+    #define vsqrt4f(x) ((float4) _mm_sqrt_ps(x))
+    #define vmovemask4f(x) ((int) _mm_movemask_ps(x))
     #ifdef __FMA__
         #define vfma4f(a, b, c) ((float4) _mm_fmadd_ps(a, b, c))
         #define vfms4f(a, b, c) ((float4) _mm_fmsub_ps(a, b, c))
@@ -308,9 +320,21 @@ void orthogonal_polynomial_clenshaw_AVX_FMAf(const int n, const float * c, const
 void orthogonal_polynomial_clenshaw_AVX512Ff(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, float * phi0, float * f);
 void orthogonal_polynomial_clenshaw_NEONf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, float * phi0, float * f);
 
-void orthogonal_polynomial_clenshaw_defaultl(const int n, const long double * c, const int incc, const long double * A, const long double * B, const long double * C, const int m, long double * x, long double * phi0, long double * f);
+void eigen_eval_default(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, const int sign, double * f);
+void eigen_eval_SSE2(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, const int sign, double * f);
+void eigen_eval_AVX(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, const int sign, double * f);
+void eigen_eval_AVX_FMA(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, const int sign, double * f);
+void eigen_eval_AVX512F(const int n, const double * c, const int incc, const double * A, const double * B, const double * C, const int m, double * x, const int sign, double * f);
+
+void eigen_eval_defaultf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, const int sign, float * f);
+void eigen_eval_SSEf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, const int sign, float * f);
+void eigen_eval_AVXf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, const int sign, float * f);
+void eigen_eval_AVX_FMAf(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, const int sign, float * f);
+void eigen_eval_AVX512Ff(const int n, const float * c, const int incc, const float * A, const float * B, const float * C, const int m, float * x, const int sign, float * f);
+
+void eigen_eval_defaultl(const int n, const long double * c, const int incc, const long double * A, const long double * B, const long double * C, const int m, long double * x, const int sign, long double * f);
 #if defined(FT_QUADMATH)
-    void orthogonal_polynomial_clenshaw_defaultq(const int n, const quadruple * c, const int incc, const quadruple * A, const quadruple * B, const quadruple * C, const int m, quadruple * x, quadruple * phi0, quadruple * f);
+    void eigen_eval_defaultq(const int n, const quadruple * c, const int incc, const quadruple * A, const quadruple * B, const quadruple * C, const int m, quadruple * x, const int sign, quadruple * f);
 #endif
 
 double * plan_legendre_to_chebyshev(const int normleg, const int normcheb, const int n);