unify dot-product, style fixes

slmdev · slmdev · commit 0089bef19fc6 · 2025-07-15T09:34:59.000+02:00
diff --git a/src/cmdline.cpp b/src/cmdline.cpp
@@ -1,8 +1,10 @@
+#include <format>
+#include <cstring>
 #include "cmdline.h"
 #include "common/utils.h"
 #include "common/timer.h"
 #include "file/sac.h"
-#include <cstring>
+
 
 CmdLine::CmdLine()
 :mode(ENCODE)
diff --git a/src/common/histbuf.h b/src/common/histbuf.h
@@ -76,7 +76,7 @@ class RollBuffer2 {
       return buf[pos + index];
     }
 
-    const std::span<T> get_span() const {
+    const std::span<const T> get_span() const { // read-only view of n elements starting at buf.data()+pos
       return std::span<const T>{buf.data() + pos,n}; // non-owning: the view refers to buf's storage, nothing is copied
     }
     const T* data() const {
diff --git a/src/common/math.h b/src/common/math.h
@@ -4,6 +4,7 @@
 #include "../global.h"
 #include <cassert>
 #include <cmath>
+#include <immintrin.h>
 
 namespace slmath
 {
@@ -58,21 +59,60 @@ namespace slmath
       vec2D G;
   };
 
-  inline double dot_scalar(const vec1D &v1,const vec1D &v2)
+  // Dot product of x and y; returns sum of x[i]*y[i]. Sizes must match, which is checked by assert only.
+  inline double dot(span_cf64 x,span_cf64 y)
   {
-    assert(v1.size()==v2.size());
-    double sum=0.0;
-    for (std::size_t i=0;i<v1.size();++i)
-      sum+=v1[i]*v2[i];
-    return sum;
+    assert(x.size()==y.size());
+    const std::size_t n=x.size();
+    double total=0.0;
+    std::size_t i=0; // shared index: the scalar tail loop resumes where the SIMD loop stopped
+    // SIMD part, selected at compile time: consumes whole groups of 8 (unrolled) or 4 doubles
+    if constexpr(SACGlobalCfg::USE_AVX2) { // NOTE(review): dot is not a template, so a discarded branch is still compiled; the intrinsics must exist even when this is false
+      if constexpr(SACGlobalCfg::UNROLL_AVX2) {
+        if (n>=8)
+        {
+          __m256d sum1 = _mm256_setzero_pd(); // two independent accumulators, one per 4-double half
+          __m256d sum2 = _mm256_setzero_pd();
+          for (;i + 8 <= n;i += 8)
+          {
+            __m256d vx1 = _mm256_loadu_pd(&x[i]); // unaligned loads: no alignment requirement on x or y
+            __m256d vy1 = _mm256_loadu_pd(&y[i]);
+            sum1 = _mm256_fmadd_pd(vx1, vy1, sum1); // sum1 += vx1*vy1 (fused multiply-add)
+            __m256d vx2 = _mm256_loadu_pd(&x[i + 4]);
+            __m256d vy2 = _mm256_loadu_pd(&y[i + 4]);
+            sum2 = _mm256_fmadd_pd(vx2, vy2, sum2);
+          }
+          sum1 = _mm256_add_pd(sum1, sum2); // merge the two accumulators lane by lane
+          alignas(32) double buffer[4]; // 32-byte aligned because _mm256_store_pd is an aligned store
+          _mm256_store_pd(buffer, sum1);
+          total = buffer[0] + buffer[1] + buffer[2] + buffer[3]; // horizontal sum of the four lanes
+        }
+      } else if (n>=4) // non-unrolled variant: one accumulator, 4 doubles per iteration
+      {
+        __m256d sum = _mm256_setzero_pd();
+        for (;i + 4 <= n;i += 4)
+        {
+          __m256d vx = _mm256_loadu_pd(&x[i]);
+          __m256d vy = _mm256_loadu_pd(&y[i]);
+          sum = _mm256_fmadd_pd(vx, vy, sum);
+        }
+        alignas(32) double buffer[4]; // 32-byte aligned because _mm256_store_pd is an aligned store
+        _mm256_store_pd(buffer, sum);
+        total = buffer[0] + buffer[1] + buffer[2] + buffer[3]; // horizontal sum of the four lanes
+      }
+    }
+    // scalar tail: remaining elements, or all of them when no SIMD loop ran
+    for (;i<n;i++)
+      total+=x[i]*y[i];
+    return total;
   }
 
   // vector = matrix * vector: element i of the result is the dot product of row m[i] with v
   inline vec1D mul(const vec2D &m,const vec1D &v)
   {
     vec1D v_out(m.size()); // one output element per matrix row
     for (std::size_t i=0;i<m.size();i++)
-      v_out[i]=slmath::dot_scalar(m[i],v);
+      v_out[i]=slmath::dot(m[i],v); // dot() asserts that the row and v have the same size
     return v_out;
   }
 
diff --git a/src/common/utils.h b/src/common/utils.h
@@ -150,7 +150,8 @@ namespace StrUtils {
 
 namespace MathUtils {
 
-#if defined(USE_AVX512)
+
+/*#if defined(USE_AVX512)
 inline double dot(const double* x,const double* y, std::size_t n)
 {
   __m512d sum = _mm512_setzero_pd();
@@ -222,7 +223,7 @@ inline double dot(const double* x,const double* y, std::size_t n)
     sum+=x[i]*y[i];
   return sum;
 }
-#endif
+#endif*/
 
   inline double calc_loglik_L1(double abs_e,double b)
   {
diff --git a/src/file/wav.cpp b/src/file/wav.cpp
@@ -1,6 +1,7 @@
-#include "wav.h"
-#include "../common/utils.h"
 #include <iostream>
+#include <format>
+#include "../common/utils.h"
+#include "wav.h"
 
 int word_align(int numbytes)
 {
diff --git a/src/global.h b/src/global.h
@@ -7,7 +7,6 @@
 #include <fstream>
 #include <sstream>
 #include <iomanip>
-#include <chrono>
 #include <vector>
 #include <span>
 
@@ -23,8 +22,14 @@ using span_i32=std::span<int32_t>;
 using span_ci32=std::span<const int32_t>;
 using span_cf64=std::span<const double>;
 
-#define USE_AVX256
-//#define UNROLL_AVX256
-//#define USE_AVX512
+struct SACGlobalCfg { // compile-time switches and numeric constants, replacing the removed USE_AVX256/UNROLL_AVX256 macros and per-file constants
+  static constexpr bool USE_AVX2=true; // enables the 256-bit intrinsic paths in slmath::dot and NLMS_Stream::calc_spow
+  static constexpr bool UNROLL_AVX2=true; // slmath::dot: 8 doubles per iteration with two accumulators; only read when USE_AVX2 is true
+  static constexpr double NLMS_POW_EPS=1.0; // added to spow in the denominator of the NLMS weight-gradient scale
+  static constexpr double LMS_ADA_EPS=1E-5; // added to the sqrt term in the denominators of the *_ADA / ADAM weight updates
+  static constexpr bool LMS_MIX_INIT=true;// increase stability: Blend2LMS_L1 presets mixer weights to 1.0/(i+1)
+  static constexpr bool LMS_MIX_CLAMPW=true; // increase stability: Blend2LMS_L1 clamps mixer weights to >= 0.0 after each update
+  static constexpr bool RLS_ALC=true; // adaptive lambda control: enables the extra branch in RLS::Update
+};
 
 #endif
diff --git a/src/opt/cma.cpp b/src/opt/cma.cpp
@@ -1,6 +1,7 @@
+#include <format>
+#include "../common/math.h"
 #include "cma.h"
 #include "ssc.h"
-#include "../common/math.h"
 
 OptCMA::OptCMA(const CMACfg &cfg,const box_const &parambox,bool verbose)
 :Opt(parambox),cfg(cfg),p(ndim),
diff --git a/src/opt/dds.cpp b/src/opt/dds.cpp
@@ -1,6 +1,8 @@
+#include <format>
 #include "dds.h"
 #include "ssc.h"
 
+
 OptDDS::OptDDS(const DDSCfg &cfg,const box_const &parambox,bool verbose)
 :Opt(parambox),cfg(cfg),
 verbose(verbose)
diff --git a/src/opt/de.cpp b/src/opt/de.cpp
@@ -1,4 +1,5 @@
 #include <cassert>
+#include <format>
 #include "de.h"
 #include "../common/utils.h"
 
diff --git a/src/pred/lms.h b/src/pred/lms.h
@@ -15,7 +15,7 @@ class LS_Stream {
     }
     double Predict() // computes the dot product of x and w, stores it in pred and returns it
     {
-      pred=MathUtils::dot(x.data(),w.data(),n);
+      pred=slmath::dot(x.get_span(),w); // NOTE(review): length now comes from the spans and dot() asserts equal sizes; confirm x.get_span() and w both have n elements
       return pred;
     }
     virtual void Update(double val)=0;
@@ -58,7 +58,6 @@ void update_w_avx(double* w, const double* mutab, const double* x, double wgrad,
 
 class NLMS_Stream : public LS_Stream
 {
-  const double eps_pow=1.0;
   public:
     NLMS_Stream(int n,double mu,double mu_decay=1.0,double pow_decay=0.8)
     :LS_Stream(n),mutab(n),powtab(n),mu(mu)
@@ -71,44 +70,37 @@ class NLMS_Stream : public LS_Stream
       }
     }
 
-  #if defined(USE_AVX256)
     double calc_spow(const double *x,const double *powtab,std::size_t n) // returns the sum over i<n of powtab[i]*x[i]*x[i]
     {
       double spow=0.0;
+      // i is shared: the scalar loop at the end resumes where the SIMD loop stopped
       std::size_t i=0;
-      if (n>=4) {
-        __m256d sum_vec = _mm256_setzero_pd();
-        for (; i + 4 <= n; i += 4) {
-          __m256d x_vec = _mm256_loadu_pd(&x[i]);
-          __m256d pow_vec = _mm256_load_pd(&powtab[i]);
-          __m256d x_squared = _mm256_mul_pd(x_vec, x_vec);
-          sum_vec = _mm256_fmadd_pd(pow_vec, x_squared, sum_vec);
-        }
 
-        alignas(32) double buffer[4];
-        _mm256_store_pd(buffer, sum_vec);
-        spow = buffer[0] + buffer[1] + buffer[2] + buffer[3];
+      if constexpr(SACGlobalCfg::USE_AVX2) { // compile-time switch replacing the removed #if/#else pair
+        if (n>=4) {
+          __m256d sum_vec = _mm256_setzero_pd();
+          for (; i + 4 <= n; i += 4) {
+            __m256d x_vec = _mm256_loadu_pd(&x[i]); // unaligned load: no alignment requirement on x
+            __m256d pow_vec = _mm256_load_pd(&powtab[i]); // NOTE(review): aligned load; requires powtab to be 32-byte aligned, confirm the caller's storage guarantees this
+            __m256d x_squared = _mm256_mul_pd(x_vec, x_vec);
+            sum_vec = _mm256_fmadd_pd(pow_vec, x_squared, sum_vec); // sum_vec += pow_vec*x_squared
+          }
+          // horizontal sum of the four lanes via an aligned temporary
+          alignas(32) double buffer[4];
+          _mm256_store_pd(buffer, sum_vec);
+          spow = buffer[0] + buffer[1] + buffer[2] + buffer[3];
+        }
       }
 
       for (;i<n;i++) // scalar tail: remaining elements, or all of them when no SIMD loop ran
         spow += powtab[i] * (x[i] * x[i]);
       return spow;
     }
-  #else
-    double calc_spow(const double *x,const double *powtab,std::size_t n)
-    {
-      double spow=0.0;
-      for (std::size_t i=0;i<n;i++) {
-        spow+=powtab[i]*(x[i]*x[i]);
-      }
-      return spow;
-    }
-  #endif
 
     void Update(double val) override
     {
       const double spow=calc_spow(x.data(),powtab.data(),n);
-      const double wgrad=mu*(val-pred)*sum_powtab/(eps_pow+spow);
+      const double wgrad=mu*(val-pred)*sum_powtab/(spow+SACGlobalCfg::NLMS_POW_EPS);
       for (int i=0;i<n;i++) {
         w[i]+=mutab[i]*(wgrad*x[i]);
       }
@@ -135,7 +127,7 @@ class LADADA_Stream : public LS_Stream
       for (int i=0;i<n;i++) {
         double const grad=serr*x[i];
         eg[i]=beta*eg[i]+(1.0-beta)*grad*grad; //accumulate gradients
-        double g=grad*1.0/(sqrt(eg[i])+1E-5);// update weights
+        double g=grad*1.0/(sqrt(eg[i])+SACGlobalCfg::LMS_ADA_EPS);// update weights
         w[i]+=mu*g;
       }
       x.push(val);
@@ -159,7 +151,7 @@ class LMSADA_Stream : public LS_Stream
       for (int i=0;i<n;i++) {
         double const grad=err*x[i]-nu*MathUtils::sgn(w[i]);
         eg[i]=beta*eg[i]+(1.0-beta)*grad*grad; //accumulate gradients
-        double g=grad*1.0/(sqrt(eg[i])+1E-5);// update weights
+        double g=grad*1.0/(sqrt(eg[i])+SACGlobalCfg::LMS_ADA_EPS);// update weights
         w[i]+=mu*g;
       }
       x.push(val);
@@ -180,7 +172,7 @@ class LMS {
     double Predict(const vec1D &inp) // stores inp in x, then returns the dot product of x and w (also kept in pred)
     {
       x=inp; // keeps a copy of the input in the member x
-      pred=slmath::dot_scalar(x,w);
+      pred=slmath::dot(x,w); // dot() asserts x.size()==w.size()
       return pred;
     }
     virtual void Update(double)=0;
@@ -204,7 +196,7 @@ class LMS_ADA : public LMS
         double const grad=err*x[i] - nu*MathUtils::sgn(w[i]); // gradient + l1-regularization
 
         eg[i]=beta*eg[i]+(1.0-beta)*grad*grad; //accumulate gradients
-        double g=grad*1.0/(sqrt(eg[i])+1E-5);// update weights
+        double g=grad*1.0/(sqrt(eg[i])+SACGlobalCfg::LMS_ADA_EPS);// update weights
         w[i]+=mu*g;
       }
     }
@@ -226,7 +218,7 @@ class LAD_ADA : public LMS
       for (int i=0;i<n;i++) {
         double const grad=serr*x[i];
         eg[i]=beta*eg[i]+(1.0-beta)*grad*grad; //accumulate gradients
-        double scaled_grad=grad*1.0/(sqrt(eg[i])+1E-5);// update weights
+        double scaled_grad=grad*1.0/(sqrt(eg[i])+SACGlobalCfg::LMS_ADA_EPS);// update weights
         w[i]+=mu*scaled_grad;
       }
     }
@@ -264,7 +256,7 @@ class HBR_ADA : public LMS
       for (int i=0;i<n;i++) {
         double const grad=grad_loss*x[i];
         eg[i]=beta*eg[i]+(1.0-beta)*grad*grad; //accumulate gradients
-        const double g=grad*1.0/(sqrt(eg[i])+1E-5);// update weights
+        const double g=grad*1.0/(sqrt(eg[i])+SACGlobalCfg::LMS_ADA_EPS);// update weights
         w[i]+=mu*g;
       }
 
@@ -304,7 +296,7 @@ class LMS_ADAM : public LMS
         double n_hat=beta2*S[i]/(1.0-power_beta2);*/
         double m_hat=M[i]/(1.0-power_beta1);
         double n_hat=S[i]/(1.0-power_beta2);
-        w[i]+=mu*m_hat/(sqrt(n_hat)+1E-5);
+        w[i]+=mu*m_hat/(sqrt(n_hat)+SACGlobalCfg::LMS_ADA_EPS);
       }
     }
   private:
diff --git a/src/pred/lms_cascade.h b/src/pred/lms_cascade.h
@@ -21,9 +21,7 @@
   cv2.Update(nbits0,nbits1);
 */
 
-// increase stability
-constexpr int LMS_MIX_INIT=1;
-constexpr int LMS_MIX_CLAMPW=1;
+
 
 // Blend 2xLMS-ADA using L1 + L2 loss
 // using absolute error as scoring function
@@ -35,7 +33,7 @@ class Blend2LMS_L1 {
      mix1(n,lms_mu,lms_beta),
      cw2(blend_beta)
     {
-      if constexpr(LMS_MIX_INIT)
+      if constexpr(SACGlobalCfg::LMS_MIX_INIT)
         for (int i=0;i<n-1;i++)
           mix0.w[i] = mix1.w[i] = 1.0/(i+1);
     }
@@ -53,7 +51,7 @@ class Blend2LMS_L1 {
     {
       mix0.Update(target);
       mix1.Update(target);
-      if constexpr(LMS_MIX_CLAMPW)
+      if constexpr(SACGlobalCfg::LMS_MIX_CLAMPW)
         for (int i=0;i<n;i++) {
           mix0.w[i]=std::max(mix0.w[i],0.0);
           mix1.w[i]=std::max(mix1.w[i],0.0);
diff --git a/src/pred/lpc.h b/src/pred/lpc.h
@@ -23,7 +23,7 @@ class OLS {
     }
     double Predict() // computes the dot product of x and w, stores it in pred and returns it
     {
-      pred=MathUtils::dot(x.data(),w.data(),n);
+      pred=slmath::dot(x,w); // NOTE(review): the removed call passed the explicit length n; dot() uses x.size() and asserts it equals w.size(), so confirm both are n
       return pred;
     }
 
diff --git a/src/pred/rls.cpp b/src/pred/rls.cpp
@@ -2,8 +2,6 @@
 #include "../common/math.h"
 #include "../common/utils.h"
 
-constexpr int RLS_ALC=1;
-
 RLS::RLS(int n,double gamma,double nu)
 :n(n),
 px(0.),gamma(gamma),
@@ -17,7 +15,7 @@ alc(gamma)
 
 double RLS::Predict() // computes the dot product of hist and w, stores it in px and returns it
 {
-  px=slmath::dot_scalar(hist,w);
+  px=slmath::dot(hist,w); // dot() asserts hist.size()==w.size()
   return px;
 }
 
@@ -33,10 +31,10 @@ void RLS::Update(double val)
 
   vec1D ph=slmath::mul(P,hist); //phi=hist P hist
   // a priori variance of prediction
-  double phi=slmath::dot_scalar(hist,ph);
+  double phi=slmath::dot(hist,ph);
 
   double alpha=gamma;
-  if constexpr(RLS_ALC) {
+  if constexpr(SACGlobalCfg::RLS_ALC) {
     // Normalized Innovation Squared
     // quantifies how "unexpected" the observation is
     // relative to the models uncertainty phi

Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ class RollBuffer2 {`
`76`	`76`	`return buf[pos + index];`
`77`	`77`	`}`
`78`	`78`
`79`		`- const std::span<T> get_span() const {`
	`79`	`+ const std::span<const T> get_span() const {`
`80`	`80`	`return std::span<const T>{buf.data() + pos,n};`
`81`	`81`	`}`
`82`	`82`	`const T* data() const {`
Original file line number	Diff line number	Diff line change
`@@ -150,7 +150,8 @@ namespace StrUtils {`
`150`	`150`
`151`	`151`	`namespace MathUtils {`
`152`	`152`
`153`		`-#if defined(USE_AVX512)`
	`153`	`+`
	`154`	`+/*#if defined(USE_AVX512)`
`154`	`155`	`inline double dot(const double* x,const double* y, std::size_t n)`
`155`	`156`	`{`
`156`	`157`	`__m512d sum = _mm512_setzero_pd();`
`@@ -222,7 +223,7 @@ inline double dot(const double* x,const double* y, std::size_t n)`
`222`	`223`	`sum+=x[i]*y[i];`
`223`	`224`	`return sum;`
`224`	`225`	`}`
`225`		`-#endif`
	`226`	`+#endif*/`
`226`	`227`
`227`	`228`	`inline double calc_loglik_L1(double abs_e,double b)`
`228`	`229`	`{`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`		`-#include "wav.h"`
`2`		`-#include "../common/utils.h"`
`3`	`1`	`#include <iostream>`
	`2`	`+#include <format>`
	`3`	`+#include "../common/utils.h"`
	`4`	`+#include "wav.h"`
`4`	`5`
`5`	`6`	`int word_align(int numbytes)`
`6`	`7`	`{`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`#include <cassert>`
	`2`	`+#include <format>`
`2`	`3`	`#include "de.h"`
`3`	`4`	`#include "../common/utils.h"`
`4`	`5`