From 711fe778134c49c2f19e3862b628fc8fe6d3ef1d Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Wed, 26 Nov 2025 18:48:48 +0100
Subject: [PATCH 01/24] SIMD homogeneous matrix multiplicaiton

---
 .../include/visp3/core/vpHomogeneousMatrix.h  |   2 +
 .../transformation/vpHomogeneousMatrix.cpp    | 153 +++++++++++++++++-
 .../core/test/math/catchHomogeneousMatrix.cpp |  63 ++++++++
 3 files changed, 215 insertions(+), 3 deletions(-)
diff --git a/modules/core/include/visp3/core/vpHomogeneousMatrix.h b/modules/core/include/visp3/core/vpHomogeneousMatrix.h
index 76950d9a82..d5a7b696ee 100644
--- a/modules/core/include/visp3/core/vpHomogeneousMatrix.h
+++ b/modules/core/include/visp3/core/vpHomogeneousMatrix.h
@@ -381,6 +381,8 @@ class VISP_EXPORT vpHomogeneousMatrix : public vpArray2D<double>
   vpHomogeneousMatrix &operator<<(double val);
   vpHomogeneousMatrix &operator,(double val);
 
+  void project(const vpMatrix &inputPoints, vpMatrix &outputPoints) const;
+
   void orthogonalizeRotation();
 
   void print() const;
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 8e17bea6a5..98222aeb0d 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -43,8 +43,130 @@
 #include <visp3/core/vpPoint.h>
 #include <visp3/core/vpQuaternionVector.h>
 
+
+
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#define VISP_HAVE_SSE2 1
+#endif
+
+// https://stackoverflow.com/a/40765925
+#if !defined(__FMA__) && defined(__AVX2__)
+#define __FMA__ 1
+#endif
+
+#if defined _WIN32 && defined(_M_ARM64)
+#define _ARM64_DISTINCT_NEON_TYPES
+#include <Intrin.h>
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#else
+#define VISP_HAVE_NEON 0
+#endif
+
+#define USE_SIMD_CODE 1
+
+#if VISP_HAVE_SSE2 && USE_SIMD_CODE
+#define USE_SSE 1
+#else
+#define USE_SSE 0
+#endif
+
+#if VISP_HAVE_NEON && USE_SIMD_CODE
+#define USE_NEON 1
+#else
+#define USE_NEON 0
+#endif
+
+
+
+
 BEGIN_VISP_NAMESPACE
 const unsigned int vpHomogeneousMatrix::constr_value_4 = 4;
+
+#if defined(__AVX__)
+inline
+double hsum_double_avx(__m256d v)
+{
+  __m128d vlow = _mm256_castpd256_pd128(v);
+  __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
+  vlow = _mm_add_pd(vlow, vhigh);     // reduce down to 128
+  __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
+  return  _mm_cvtsd_f64(_mm_add_sd(vlow, high64));  // reduce to scalar
+}
+#endif
+
+void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output) const
+{
+  output.resize(input.getRows(), input.getCols(), false);
+  if (input.getCols() != 3) {
+    throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
+  }
+  double *inputData = input.data;
+  double *outputData = output.data;
+#if (USE_SSE)
+  __m128d rows[] = {
+    _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
+    _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
+    _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2)
+  };
+  for (unsigned int i = 0; i < input.getRows(); ++i) {
+
+    __m128d r1 = _mm_set_pd(inputData[1], inputData[0]);
+    __m128d r2 = _mm_set_pd(1.0, inputData[2]);
+
+    for (unsigned int j = 0; j < 3; ++j) {
+      __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
+#if !defined(__FMA__)
+      __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
+      __m128d add = _mm_add_pd(m1, m2);
+#else
+      __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
+#endif
+      __m128d sum = _mm_hadd_pd(add, add);
+      outputData[j] = _mm_cvtsd_f64(sum);
+    }
+    inputData += 3;
+    outputData += 3;
+  }
+
+#elif defined(__AVX__)
+  __m256d rows[] = {
+    _mm256_loadu_pd(rowPtrs[0]),
+    _mm256_loadu_pd(rowPtrs[1]),
+    _mm256_loadu_pd(rowPtrs[2])
+  };
+
+  for (unsigned int i = 0; i < input.getRows(); ++i) {
+    __m256d r = _mm256_set_pd(1.0, inputData[2], inputData[1], inputData[0]);
+    for (unsigned int j = 0; j < 3; ++j) {
+      __m256d mulres = _mm256_mul_pd(rows[j], r);
+      output[i][j] = hsum_double_avx(mulres);
+    }
+    inputData += 3;
+  }
+
+#else
+  double *r0 = rowPtrs[0];
+  double *r1 = rowPtrs[1];
+  double *r2 = rowPtrs[2];
+
+  for (unsigned int i = 0; i < input.getRows(); ++i) {
+    output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3];
+    output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3];
+    output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3];
+    inputData += 3;
+  }
+#endif
+}
+
+
 /*!
   Construct an homogeneous matrix from a translation vector and quaternion
   rotation vector.
@@ -651,16 +773,41 @@ vpColVector vpHomogeneousMatrix::operator*(const vpColVector &v) const
                       "(%dx1) column vector",
                       v.getRows()));
   }
-  vpColVector p(rowNum);
+  vpColVector p(rowNum, 0.0);
+
+#if USE_SSE
+  __m128d rows[] = {
+    _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
+    _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
+    _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2),
+    _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2)
+
+  };
+
+  __m128d r1 = _mm_loadu_pd(v.data);
+  __m128d r2 = _mm_loadu_pd(v.data + 2);
+
+  for (unsigned int j = 0; j < 4; ++j) {
+    __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
+#if !defined(__FMA__)
+    __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
+    __m128d add = _mm_add_pd(m1, m2);
+#else
+    __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
+#endif
+    __m128d sum = _mm_hadd_pd(add, add);
+    p[j] = _mm_cvtsd_f64(sum);
+  }
+
 
-  p = 0.0;
+#else
 
   for (unsigned int j = 0; j < val_4; ++j) {
     for (unsigned int i = 0; i < val_4; ++i) {
       p[i] += rowPtrs[i][j] * v[j];
     }
   }
-
+#endif
   return p;
 }
 
diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index af819ab093..9a41b7c0f1 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -38,6 +38,8 @@
  */
 #include <visp3/core/vpConfig.h>
 
+#include <visp3/core/vpUniRand.h>
+
 #if defined(VISP_HAVE_CATCH2)
 #include <visp3/core/vpHomogeneousMatrix.h>
 
@@ -224,6 +226,67 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]")
   CHECK(success);
 }
 
+TEST_CASE("Point projection", "project")
+{
+  std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
+  for (unsigned int N : NS) {
+    std::cout << "Running for N = " << N << std::endl;
+    std::vector<double> timeProject, timeMult, timeNaive;
+    vpUniRand r(42);
+    for (unsigned int trial = 0; trial < 100000; ++trial) {
+      vpHomogeneousMatrix M;
+      vpMatrix input(N, 3);
+      for (unsigned int i = 0; i< N; ++i) {
+        input[i][0] = r.uniform(0.0, 1.0);
+        input[i][1] = r.uniform(0.0, 1.0);
+        input[i][2] = r.uniform(0.0, 1.0);
+      }
+      vpMatrix output(N, 3);
+
+      double t1 = vpTime::measureTimeMs();
+      M.project(input, output);
+      double t2 = vpTime::measureTimeMs();
+      timeProject.push_back(t2 - t1);
+
+
+      vpColVector x(4, 1);
+
+      vpMatrix outputR(N, 3);
+      double t1r = vpTime::measureTimeMs();
+
+      for (unsigned int i = 0; i < input.getRows(); ++i) {
+        x[0] = input[i][0];
+        x[1] = input[i][1];
+        x[2] = input[i][2];
+
+        vpColVector res = M * x;
+        outputR[i][0] = res[0];
+        outputR[i][1] = res[1];
+        outputR[i][2] = res[2];
+      }
+      double t2r = vpTime::measureTimeMs();
+      timeNaive.push_back(t2r - t1r);
+
+
+      vpMatrix input4(4, N);
+      vpMatrix output4(4, N);
+      double t14 = vpTime::measureTimeMs();
+
+      vpMatrix::mult2Matrices((vpMatrix)M, input4, output4);
+      double t24 = vpTime::measureTimeMs();
+      timeMult.push_back(t24 - t14);
+
+      CHECK((outputR == output));
+    }
+    std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
+    std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl;
+    std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
+    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl;
+
+  }
+
+}
+
 int main(int argc, char *argv[])
 {
   Catch::Session session;

From 196660c49a471fe05c9a198b6fa10db2055a6d9b Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Thu, 27 Nov 2025 17:07:36 +0100
Subject: [PATCH 02/24] implement first version of simd for rotaiton matrix

---
 .../include/visp3/core/vpRotationMatrix.h     |   2 +
 .../transformation/vpHomogeneousMatrix.cpp    |   2 +-
 .../math/transformation/vpRotationMatrix.cpp  | 109 ++++++++++++++++++
 .../core/test/math/catchHomogeneousMatrix.cpp |   1 +
 modules/core/test/math/catchRotation.cpp      |  68 +++++++++++
 5 files changed, 181 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/visp3/core/vpRotationMatrix.h b/modules/core/include/visp3/core/vpRotationMatrix.h
index fbbf0b6f0e..96dc62969d 100644
--- a/modules/core/include/visp3/core/vpRotationMatrix.h
+++ b/modules/core/include/visp3/core/vpRotationMatrix.h
@@ -190,6 +190,8 @@ class VISP_EXPORT vpRotationMatrix : public vpArray2D<double>
 
   void printVector();
 
+  void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs) const;
+
   /*!
     This function is not applicable to a rotation matrix that is always a
     3-by-3 matrix.
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 98222aeb0d..354010c19c 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -104,7 +104,7 @@ double hsum_double_avx(__m256d v)
 
 void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output) const
 {
-  output.resize(input.getRows(), input.getCols(), false);
+  output.resize(input.getRows(), input.getCols(), false, false);
   if (input.getCols() != 3) {
     throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
   }
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index 067db852cf..dd66958167 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -49,7 +49,116 @@
 // Debug trace
 #include <math.h>
 
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+
+#define VISP_HAVE_SSE2 1
+#endif
+
+// https://stackoverflow.com/a/40765925
+#if !defined(__FMA__) && defined(__AVX2__)
+#define __FMA__ 1
+#endif
+
+#if defined _WIN32 && defined(_M_ARM64)
+#define _ARM64_DISTINCT_NEON_TYPES
+#include <Intrin.h>
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#else
+#define VISP_HAVE_NEON 0
+#endif
+
+#define USE_SIMD_CODE 1
+
+#if VISP_HAVE_SSE2 && USE_SIMD_CODE
+#define USE_SSE 1
+#else
+#define USE_SSE 0
+#endif
+
+#if VISP_HAVE_NEON && USE_SIMD_CODE
+#define USE_NEON 1
+#else
+#define USE_NEON 0
+#endif
+
+
 BEGIN_VISP_NAMESPACE
+
+
+void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) const
+{
+
+
+
+  output.resize(input.getRows(), input.getCols(), false, false);
+  if (input.getCols() != 3) {
+    throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
+  }
+  double *inputData = input.data;
+  double *outputData = output.data;
+
+#if USE_SSE
+  __m128d rowStarts[] = {
+    _mm_loadu_pd(rowPtrs[0]),
+    _mm_loadu_pd(rowPtrs[1]),
+    _mm_loadu_pd(rowPtrs[2])
+  };
+
+  __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]);
+  __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]);
+
+  const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1
+  const int dotProdFirstElemMask = 0x11;
+
+  for (unsigned int i = 0; i < input.getRows(); ++i) {
+    __m128d xy = _mm_loadu_pd(inputData);
+    __m128d zz = _mm_set_pd(inputData[2], inputData[2]);
+    // Perform dot products with the two first elements of each row
+    __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask);
+    __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask);
+    __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask);
+
+    __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2);
+
+    // Dot product between last columns of the rows and Z
+    __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols
+    __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one
+
+    __m128d r3z = _mm_add_pd(r3, lastColz);
+
+    _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy));
+
+    outputData[2] = _mm_cvtsd_f64(r3z);
+    inputData += 3;
+    outputData += 3;
+  }
+
+
+#else
+  double *r0 = rowPtrs[0];
+  double *r1 = rowPtrs[1];
+  double *r2 = rowPtrs[2];
+
+  for (unsigned int i = 0; i < input.getRows(); ++i) {
+    outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2];
+    outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2];
+    outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2];
+    inputData += 3;
+    outputData += 3;
+  }
+#endif
+}
+
+
 const unsigned int vpRotationMatrix::constr_val_3 = 3;
 /*!
   Initialize the rotation matrix as identity.
diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index 9a41b7c0f1..e3b6c00f20 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -250,6 +250,7 @@ TEST_CASE("Point projection", "project")
 
 
       vpColVector x(4, 1);
+      vpColVector res(4);
 
       vpMatrix outputR(N, 3);
       double t1r = vpTime::measureTimeMs();
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 5031a2efcc..51455e8059 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -559,6 +559,74 @@ TEST_CASE("Default constructor", "[rotation matrix]")
 #endif
 }
 
+
+
+TEST_CASE("Vector rotation", "project")
+{
+  std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
+  for (unsigned int N : NS) {
+    std::cout << "Running for N = " << N << std::endl;
+    std::vector<double> timeProject, timeMult, timeNaive;
+    vpUniRand r(42);
+    for (unsigned int trial = 0; trial < 100000; ++trial) {
+      vpRotationMatrix M;
+      vpMatrix input(N, 3);
+      for (unsigned int i = 0; i< N; ++i) {
+        input[i][0] = r.uniform(0.0, 1.0);
+        input[i][1] = r.uniform(0.0, 1.0);
+        input[i][2] = r.uniform(0.0, 1.0);
+      }
+      vpMatrix output(N, 3);
+
+      double t1 = vpTime::measureTimeMs();
+      M.rotateVectors(input, output);
+      double t2 = vpTime::measureTimeMs();
+      timeProject.push_back(t2 - t1);
+
+
+      vpColVector x(3, 1);
+      vpColVector res(3);
+      vpMatrix outputR(N, 3);
+      double t1r = vpTime::measureTimeMs();
+
+      for (unsigned int i = 0; i < input.getRows(); ++i) {
+        x[0] = input[i][0];
+        x[1] = input[i][1];
+        x[2] = input[i][2];
+
+        res = M * x;
+        outputR[i][0] = res[0];
+        outputR[i][1] = res[1];
+        outputR[i][2] = res[2];
+      }
+      double t2r = vpTime::measureTimeMs();
+      timeNaive.push_back(t2r - t1r);
+
+
+      vpMatrix input4(3, N);
+      vpMatrix output4(3, N);
+      double t14 = vpTime::measureTimeMs();
+
+      vpMatrix::mult2Matrices((vpMatrix)M, input4, output4);
+      double t24 = vpTime::measureTimeMs();
+      timeMult.push_back(t24 - t14);
+
+      if (outputR != output) {
+        std::cerr << "Output diff = " << (output - outputR)<< std::endl;
+
+
+      }
+
+      CHECK((outputR == output));
+    }
+    std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
+    std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl;
+    std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
+    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl;
+
+  }
+}
+
 int main(int argc, char *argv[])
 {
   Catch::Session session;

From 729110a80b90448bbb75a4a2f4339e65fabedbd0 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Tue, 2 Dec 2025 13:39:57 +0100
Subject: [PATCH 03/24] AVX implem for rotation matmul

---
 .../math/transformation/vpRotationMatrix.cpp  | 41 ++++++++++++++++---
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index dd66958167..e38ded51f4 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -55,7 +55,6 @@
 #include <immintrin.h>
 #include <smmintrin.h>
 
-
 #define VISP_HAVE_SSE2 1
 #endif
 
@@ -96,9 +95,6 @@ BEGIN_VISP_NAMESPACE
 
 void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) const
 {
-
-
-
   output.resize(input.getRows(), input.getCols(), false, false);
   if (input.getCols() != 3) {
     throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
@@ -106,7 +102,42 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) co
   double *inputData = input.data;
   double *outputData = output.data;
 
-#if USE_SSE
+#if defined(__AVX__)
+  __m256d rows[] = {
+    _mm256_set_pd(0.0, rowPtrs[2][0], rowPtrs[1][0], rowPtrs[0][0]),
+    _mm256_set_pd(0.0, rowPtrs[2][1], rowPtrs[1][1], rowPtrs[0][1]),
+    _mm256_set_pd(0.0, rowPtrs[2][2], rowPtrs[1][2], rowPtrs[0][2]),
+  };
+
+  double result[4];
+
+  for (unsigned int i = 0; i < input.getRows(); ++i) {
+    const __m256d xyz = _mm256_set_pd(0.0, inputData[2], inputData[1], inputData[0]);
+
+#if defined(__FMA__)
+    __m256d dp = _mm256_mul_pd(rows[0], xyz);
+    dp = _mm256_fmadd_pd(rows[1], xyz, dp);
+    dp = _mm256_fmadd_pd(rows[2], xyz, dp);
+#else
+    __m256d col1 = _mm256_mul_pd(rows[0], xyz);
+    __m256d col2 = _mm256_mul_pd(rows[1], xyz);
+    __m256d col3 = _mm256_mul_pd(rows[2], xyz);
+
+    __m256d dp = _mm256_add_pd(col1, col2);
+    dp = _mm256_add_pd(dp, col3);
+#endif
+
+    _mm256_storeu_pd(result, dp);
+
+    outputData[0] = result[0];
+    outputData[1] = result[1];
+    outputData[2] = result[2];
+
+    inputData += 3;
+    outputData += 3;
+  }
+
+#elif USE_SSE
   __m128d rowStarts[] = {
     _mm_loadu_pd(rowPtrs[0]),
     _mm_loadu_pd(rowPtrs[1]),

From e230a7218f714e84d2b84906756650f70622428b Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Tue, 2 Dec 2025 15:45:33 +0100
Subject: [PATCH 04/24] Fix intrinsics usage, performance improvement

---
 .../math/transformation/vpRotationMatrix.cpp  | 83 +++++++++++++------
 1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index e38ded51f4..cd8ee56d12 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -102,7 +102,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) co
   double *inputData = input.data;
   double *outputData = output.data;
 
-#if defined(__AVX__)
+#if defined(__AVX__) && 0 == 1
   __m256d rows[] = {
     _mm256_set_pd(0.0, rowPtrs[2][0], rowPtrs[1][0], rowPtrs[0][0]),
     _mm256_set_pd(0.0, rowPtrs[2][1], rowPtrs[1][1], rowPtrs[0][1]),
@@ -138,40 +138,75 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) co
   }
 
 #elif USE_SSE
-  __m128d rowStarts[] = {
-    _mm_loadu_pd(rowPtrs[0]),
-    _mm_loadu_pd(rowPtrs[1]),
-    _mm_loadu_pd(rowPtrs[2])
-  };
 
-  __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]);
-  __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]);
 
-  const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1
-  const int dotProdFirstElemMask = 0x11;
+  __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]);
+  __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]);
+  __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]);
 
-  for (unsigned int i = 0; i < input.getRows(); ++i) {
-    __m128d xy = _mm_loadu_pd(inputData);
-    __m128d zz = _mm_set_pd(inputData[2], inputData[2]);
-    // Perform dot products with the two first elements of each row
-    __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask);
-    __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask);
-    __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask);
+  // Preload third column as scalars
+  double c0 = rowPtrs[0][2];
+  double c1 = rowPtrs[1][2];
+  double c2 = rowPtrs[2][2];
+
+  for (unsigned i = 0; i < input.getRows(); ++i) {
 
-    __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2);
+    const __m128d xy = _mm_loadu_pd(inputData);
 
-    // Dot product between last columns of the rows and Z
-    __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols
-    __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one
+    const double z = inputData[2];
+    // const __m128d zz = _mm_set_pd(z, z);
 
-    __m128d r3z = _mm_add_pd(r3, lastColz);
+    __m128d mul0 = _mm_mul_pd(row01_lo, xy);
+    __m128d mul1 = _mm_mul_pd(row01_hi, xy);
+    __m128d r01 = _mm_hadd_pd(mul0, mul1);
+    __m128d mul2 = _mm_mul_pd(row2_xy, xy);
 
-    _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy));
+    // Adding scalars at the end without using simd is faster
+    // __m128d mulzz = _mm_mul_pd(row01_z, zz);
+    // r01 = _mm_add_pd(r01, mulzz);
+
+    // store result
+    outputData[0] = _mm_cvtsd_f64(r01) + c0 * z;
+    outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z;
+    outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z;
 
-    outputData[2] = _mm_cvtsd_f64(r3z);
     inputData += 3;
     outputData += 3;
   }
+    // __m128d rowStarts[] = {
+    //   _mm_loadu_pd(rowPtrs[0]),
+    //   _mm_loadu_pd(rowPtrs[1]),
+    //   _mm_loadu_pd(rowPtrs[2])
+    // };
+
+    // __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]);
+    // __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]);
+
+    // const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1
+    // const int dotProdFirstElemMask = 0x11;
+
+    // for (unsigned int i = 0; i < input.getRows(); ++i) {
+    //   __m128d xy = _mm_loadu_pd(inputData);
+    //   __m128d zz = _mm_set_pd(inputData[2], inputData[2]);
+    //   // Perform dot products with the two first elements of each row
+    //   __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask);
+    //   __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask);
+    //   __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask);
+
+    //   __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2);
+
+    //   // Dot product between last columns of the rows and Z
+    //   __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols
+    //   __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one
+
+    //   __m128d r3z = _mm_add_pd(r3, lastColz);
+
+    //   _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy));
+
+    //   outputData[2] = _mm_cvtsd_f64(r3z);
+    //   inputData += 3;
+    //   outputData += 3;
+    // }
 
 
 #else

From 1ff0b576ebca50bc7f984f598aa6d22a81b653b1 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Wed, 3 Dec 2025 19:14:12 +0100
Subject: [PATCH 05/24] Simd version of the rbt dense depth, AVX matmul version
 for 3xN input matrices

---
 .../include/visp3/core/vpHomogeneousMatrix.h  |   2 +-
 .../include/visp3/core/vpRotationMatrix.h     |   2 +-
 .../transformation/vpHomogeneousMatrix.cpp    | 350 ++++++++++++------
 .../math/transformation/vpRotationMatrix.cpp  | 293 ++++++++-------
 .../include/visp3/rbt/vpRBDenseDepthTracker.h |  93 +----
 .../src/features/vpRBDenseDepthTracker.cpp    | 268 ++++++++++++--
 6 files changed, 635 insertions(+), 373 deletions(-)

diff --git a/modules/core/include/visp3/core/vpHomogeneousMatrix.h b/modules/core/include/visp3/core/vpHomogeneousMatrix.h
index d5a7b696ee..68f33ed3b4 100644
--- a/modules/core/include/visp3/core/vpHomogeneousMatrix.h
+++ b/modules/core/include/visp3/core/vpHomogeneousMatrix.h
@@ -381,7 +381,7 @@ class VISP_EXPORT vpHomogeneousMatrix : public vpArray2D<double>
   vpHomogeneousMatrix &operator<<(double val);
   vpHomogeneousMatrix &operator,(double val);
 
-  void project(const vpMatrix &inputPoints, vpMatrix &outputPoints) const;
+  void project(const vpMatrix &inputPoints, vpMatrix &outputPoints, bool isTransposed) const;
 
   void orthogonalizeRotation();
 
diff --git a/modules/core/include/visp3/core/vpRotationMatrix.h b/modules/core/include/visp3/core/vpRotationMatrix.h
index 96dc62969d..38ec6f2a05 100644
--- a/modules/core/include/visp3/core/vpRotationMatrix.h
+++ b/modules/core/include/visp3/core/vpRotationMatrix.h
@@ -190,7 +190,7 @@ class VISP_EXPORT vpRotationMatrix : public vpArray2D<double>
 
   void printVector();
 
-  void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs) const;
+  void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs, bool isTransposed) const;
 
   /*!
     This function is not applicable to a rotation matrix that is always a
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 354010c19c..4867583025 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -42,128 +42,242 @@
 #include <visp3/core/vpMatrix.h>
 #include <visp3/core/vpPoint.h>
 #include <visp3/core/vpQuaternionVector.h>
+#include <visp3/core/vpSIMDUtils.h>
 
 
+BEGIN_VISP_NAMESPACE
+const unsigned int vpHomogeneousMatrix::constr_value_4 = 4;
 
+void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool transposed) const
+{
+  output.resize(input.getRows(), input.getCols(), false, false);
+  if (transposed) {
 
-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#include <emmintrin.h>
-#include <immintrin.h>
+    if (input.getCols() != 3) {
+      throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
+    }
+    double *inputData = input.data;
+    double *outputData = output.data;
+#if defined(VISP_HAVE_AVX)
+    __m256d rows[] = {
+      _mm256_loadu_pd(rowPtrs[0]),
+      _mm256_loadu_pd(rowPtrs[1]),
+      _mm256_loadu_pd(rowPtrs[2]),
+      _mm256_loadu_pd(rowPtrs[3]),
 
-#define VISP_HAVE_SSE2 1
-#endif
+    };
 
-// https://stackoverflow.com/a/40765925
-#if !defined(__FMA__) && defined(__AVX2__)
-#define __FMA__ 1
-#endif
 
-#if defined _WIN32 && defined(_M_ARM64)
-#define _ARM64_DISTINCT_NEON_TYPES
-#include <Intrin.h>
-#include <arm_neon.h>
-#define VISP_HAVE_NEON 1
-#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
-#include <arm_neon.h>
-#define VISP_HAVE_NEON 1
-#else
-#define VISP_HAVE_NEON 0
-#endif
+    double result[4];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 1.0);
+
+      const __m256d rs[] = {
+        _mm256_mul_pd(rows[0], xyzw),
+        _mm256_mul_pd(rows[1], xyzw),
+        _mm256_mul_pd(rows[2], xyzw),
+      };
+
+      __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]);
+
+
+      _mm256_storeu_pd(result, rs_half_sum1);
+      __m128d hi = _mm256_extractf128_pd(rs[2], 1);   // [A3, A2]
+      __m128d lo = _mm256_castpd256_pd128(rs[2]);     // [A1, A0]
 
-#define USE_SIMD_CODE 1
+      __m128d sum2 = _mm_add_pd(lo, hi);          // [A3+A1, A2+A0]
+      __m128d sum1 = _mm_hadd_pd(sum2, sum2);     // [total, total]
+      double total = _mm_cvtsd_f64(sum1);
 
-#if VISP_HAVE_SSE2 && USE_SIMD_CODE
-#define USE_SSE 1
+
+      outputData[0] = result[0] + result[2];
+      outputData[1] = result[1] + result[3];
+      outputData[2] = total;
+
+      inputData += 3;
+      outputData += 3;
+    }
+
+#elif (VISP_HAVE_SSE2)
+    __m128d rows[] = {
+      _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
+      _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
+      _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2)
+    };
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+
+      __m128d r1 = _mm_set_pd(inputData[1], inputData[0]);
+      __m128d r2 = _mm_set_pd(1.0, inputData[2]);
+
+      for (unsigned int j = 0; j < 3; ++j) {
+        __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
+#if !defined(VISP_HAVE_FMA)
+        __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
+        __m128d add = _mm_add_pd(m1, m2);
 #else
-#define USE_SSE 0
+        __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
 #endif
-
-#if VISP_HAVE_NEON && USE_SIMD_CODE
-#define USE_NEON 1
+        __m128d sum = _mm_hadd_pd(add, add);
+        outputData[j] = _mm_cvtsd_f64(sum);
+      }
+      inputData += 3;
+      outputData += 3;
+    }
 #else
-#define USE_NEON 0
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3];
+      output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3];
+      output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3];
+      inputData += 3;
+    }
+#endif
+  }
+  else {
+    if (input.getRows() != 3) {
+      throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
+    }
+    double *inputX = input[0];
+    double *inputY = input[1];
+    double *inputZ = input[2];
+    double *outputX = output[0];
+    double *outputY = output[1];
+    double *outputZ = output[2];
+
+#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
+
+#if defined(VISP_HAVE_AVX2)
+
+    using Register = __m512d;
+    constexpr int numLanes = 8;
+    constexpr auto &set1 = _mm512_set1_pd;
+    constexpr auto &loadu = _mm512_loadu_pd;
+    constexpr auto &mul = _mm512_mul_pd;
+    constexpr auto &add = _mm512_add_pd;
+    constexpr auto &storeu = _mm512_storeu_pd;
+#if defined(VISP_HAVE_FMA)
+    constexpr auto &fma = _mm512_fmadd_pd;
+#endif
+#elif defined(VISP_HAVE_AVX)
+    using Register = __m256d;
+    constexpr int numLanes = 4;
+
+    constexpr auto &set1 = _mm256_set1_pd;
+    constexpr auto &loadu = _mm256_loadu_pd;
+    constexpr auto &mul = _mm256_mul_pd;
+    constexpr auto &add = _mm256_add_pd;
+    constexpr auto &storeu = _mm256_storeu_pd;
+#if defined(VISP_HAVE_FMA)
+    constexpr auto &fma = _mm256_fmadd_pd;
+#endif
+#elif VISP_HAVE_SSE2
+    using Register = __m128d;
+    constexpr int numLanes = 2;
+
+    constexpr auto &set1 = _mm_set1_pd;
+    constexpr auto &loadu = _mm_loadu_pd;
+    constexpr auto &mul = _mm_mul_pd;
+    constexpr auto &add = _mm_add_pd;
+    constexpr auto &storeu = _mm_storeu_pd;
+#if defined(VISP_HAVE_FMA)
+    constexpr auto &fma = _mm_fmadd_pd;
+#endif
 #endif
 
+    Register elems[12];
+    for (unsigned int i = 0; i < 12; ++i) {
+      elems[i] = set1(data[i]);
+    }
 
+    for (int i = 0; i <= static_cast<int>(input.getCols()) - numLanes; i += numLanes) {
+      const Register x4 = loadu(inputX);
+      const Register y4 = loadu(inputY);
+      const Register z4 = loadu(inputZ);
 
+#if defined(VISP_HAVE_FMA)
+      Register dp1 = mul(x4, elems[0]);
+      dp1 = fma(y4, elems[1], dp1);
+      dp1 = fma(z4, elems[2], dp1);
+      dp1 = add(elems[3], dp1);
 
-BEGIN_VISP_NAMESPACE
-const unsigned int vpHomogeneousMatrix::constr_value_4 = 4;
+      Register dp2 = mul(x4, elems[4]);
+      dp2 = fma(y4, elems[5], dp2);
+      dp2 = fma(z4, elems[6], dp2);
+      dp2 = add(elems[7], dp2);
+
+      Register dp3 = mul(x4, elems[8]);
+      dp3 = fma(y4, elems[9], dp3);
+      dp3 = fma(z4, elems[10], dp3);
+      dp3 = add(elems[11], dp3);
 
-#if defined(__AVX__)
-inline
-double hsum_double_avx(__m256d v)
-{
-  __m128d vlow = _mm256_castpd256_pd128(v);
-  __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128
-  vlow = _mm_add_pd(vlow, vhigh);     // reduce down to 128
-  __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
-  return  _mm_cvtsd_f64(_mm_add_sd(vlow, high64));  // reduce to scalar
-}
-#endif
 
-void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output) const
-{
-  output.resize(input.getRows(), input.getCols(), false, false);
-  if (input.getCols() != 3) {
-    throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
-  }
-  double *inputData = input.data;
-  double *outputData = output.data;
-#if (USE_SSE)
-  __m128d rows[] = {
-    _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
-    _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
-    _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2)
-  };
-  for (unsigned int i = 0; i < input.getRows(); ++i) {
-
-    __m128d r1 = _mm_set_pd(inputData[1], inputData[0]);
-    __m128d r2 = _mm_set_pd(1.0, inputData[2]);
-
-    for (unsigned int j = 0; j < 3; ++j) {
-      __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
-#if !defined(__FMA__)
-      __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
-      __m128d add = _mm_add_pd(m1, m2);
 #else
-      __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
+      const Register muls[] = {
+        mul(x4, elems[0]),
+        mul(y4, elems[1]),
+        mul(z4, elems[2]),
+
+        mul(x4, elems[4]),
+        mul(y4, elems[5]),
+        mul(z4, elems[6]),
+
+        mul(x4, elems[8]),
+        mul(y4, elems[9]),
+        mul(z4, elems[10]),
+      };
+
+      Register dp1 = add(add(muls[0], muls[1]), muls[2]);
+      Register dp2 = add(add(muls[3], muls[4]), muls[5]);
+      Register dp3 = add(add(muls[6], muls[7]), muls[8]);
+
+      dp1 = add(dp1, elems[3]);
+      dp2 = add(dp2, elems[7]);
+      dp3 = add(dp3, elems[11]);
 #endif
-      __m128d sum = _mm_hadd_pd(add, add);
-      outputData[j] = _mm_cvtsd_f64(sum);
-    }
-    inputData += 3;
-    outputData += 3;
-  }
 
-#elif defined(__AVX__)
-  __m256d rows[] = {
-    _mm256_loadu_pd(rowPtrs[0]),
-    _mm256_loadu_pd(rowPtrs[1]),
-    _mm256_loadu_pd(rowPtrs[2])
-  };
-
-  for (unsigned int i = 0; i < input.getRows(); ++i) {
-    __m256d r = _mm256_set_pd(1.0, inputData[2], inputData[1], inputData[0]);
-    for (unsigned int j = 0; j < 3; ++j) {
-      __m256d mulres = _mm256_mul_pd(rows[j], r);
-      output[i][j] = hsum_double_avx(mulres);
+      storeu(outputX, dp1);
+      storeu(outputY, dp2);
+      storeu(outputZ, dp3);
+
+
+      inputX += numLanes; inputY += numLanes; inputZ += numLanes;
+      outputX += numLanes; outputY += numLanes; outputZ += numLanes;
+
+    }
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = (input.getCols() / numLanes) * numLanes; i < input.getCols(); ++i) {
+      // std::cout << "i = " << i << std::endl;
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3];
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3];
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3];
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
     }
-    inputData += 3;
-  }
 
 #else
-  double *r0 = rowPtrs[0];
-  double *r1 = rowPtrs[1];
-  double *r2 = rowPtrs[2];
-
-  for (unsigned int i = 0; i < input.getRows(); ++i) {
-    output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3];
-    output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3];
-    output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3];
-    inputData += 3;
-  }
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3];
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3];
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3];
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
 #endif
+
+  }
 }
 
 
@@ -775,39 +889,39 @@ vpColVector vpHomogeneousMatrix::operator*(const vpColVector &v) const
   }
   vpColVector p(rowNum, 0.0);
 
-#if USE_SSE
-  __m128d rows[] = {
-    _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
-    _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
-    _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2),
-    _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2)
+// #if USE_SSE
+//   __m128d rows[] = {
+//     _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
+//     _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
+//     _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2),
+//     _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2)
 
-  };
+//   };
 
-  __m128d r1 = _mm_loadu_pd(v.data);
-  __m128d r2 = _mm_loadu_pd(v.data + 2);
+//   __m128d r1 = _mm_loadu_pd(v.data);
+//   __m128d r2 = _mm_loadu_pd(v.data + 2);
 
-  for (unsigned int j = 0; j < 4; ++j) {
-    __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
-#if !defined(__FMA__)
-    __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
-    __m128d add = _mm_add_pd(m1, m2);
-#else
-    __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
-#endif
-    __m128d sum = _mm_hadd_pd(add, add);
-    p[j] = _mm_cvtsd_f64(sum);
-  }
+//   for (unsigned int j = 0; j < 4; ++j) {
+//     __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
+// #if !defined(__FMA__)
+//     __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
+//     __m128d add = _mm_add_pd(m1, m2);
+// #else
+//     __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
+// #endif
+//     __m128d sum = _mm_hadd_pd(add, add);
+//     p[j] = _mm_cvtsd_f64(sum);
+//   }
 
 
-#else
+// #else
 
   for (unsigned int j = 0; j < val_4; ++j) {
     for (unsigned int i = 0; i < val_4; ++i) {
       p[i] += rowPtrs[i][j] * v[j];
     }
   }
-#endif
+// #endif
   return p;
 }
 
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index cd8ee56d12..fa5e78e9c3 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -40,6 +40,8 @@
 #include <visp3/core/vpMath.h>
 #include <visp3/core/vpMatrix.h>
 
+#include <visp3/core/vpSIMDUtils.h>
+
 // Rotation classes
 #include <visp3/core/vpRotationMatrix.h>
 
@@ -49,179 +51,208 @@
 // Debug trace
 #include <math.h>
 
+BEGIN_VISP_NAMESPACE
 
-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <smmintrin.h>
 
-#define VISP_HAVE_SSE2 1
-#endif
+void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const
+{
+  output.resize(input.getRows(), input.getCols(), false, false);
+  if (isTransposed) {
+    if (input.getCols() != 3) {
+      throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
+    }
+    double *inputData = input.data;
+    double *outputData = output.data;
+#if defined(VISP_HAVE_AVX)
+    __m256d rows[] = {
+      _mm256_setr_pd(rowPtrs[0][0], rowPtrs[0][1], rowPtrs[0][2], 0.0),
+      _mm256_setr_pd(rowPtrs[1][0], rowPtrs[1][1], rowPtrs[1][2], 0.0),
+      _mm256_setr_pd(rowPtrs[2][0], rowPtrs[2][1], rowPtrs[2][2], 0.0),
+    };
 
-// https://stackoverflow.com/a/40765925
-#if !defined(__FMA__) && defined(__AVX2__)
-#define __FMA__ 1
-#endif
+    double result[4];
 
-#if defined _WIN32 && defined(_M_ARM64)
-#define _ARM64_DISTINCT_NEON_TYPES
-#include <Intrin.h>
-#include <arm_neon.h>
-#define VISP_HAVE_NEON 1
-#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
-#include <arm_neon.h>
-#define VISP_HAVE_NEON 1
-#else
-#define VISP_HAVE_NEON 0
-#endif
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 0.0);
 
-#define USE_SIMD_CODE 1
+      const __m256d rs[] = {
+        _mm256_mul_pd(rows[0], xyzw),
+        _mm256_mul_pd(rows[1], xyzw),
+        _mm256_mul_pd(rows[2], xyzw),
+      };
 
-#if VISP_HAVE_SSE2 && USE_SIMD_CODE
-#define USE_SSE 1
-#else
-#define USE_SSE 0
-#endif
+      __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]);
 
-#if VISP_HAVE_NEON && USE_SIMD_CODE
-#define USE_NEON 1
-#else
-#define USE_NEON 0
-#endif
 
+      _mm256_storeu_pd(result, rs_half_sum1);
+      __m128d hi = _mm256_extractf128_pd(rs[2], 1);   // [A3, A2]
+      __m128d lo = _mm256_castpd256_pd128(rs[2]);     // [A1, A0]
 
-BEGIN_VISP_NAMESPACE
+      __m128d sum2 = _mm_add_pd(lo, hi);          // [A3+A1, A2+A0]
+      __m128d sum1 = _mm_hadd_pd(sum2, sum2);     // [total, total]
+      double total = _mm_cvtsd_f64(sum1);
 
 
-void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) const
-{
-  output.resize(input.getRows(), input.getCols(), false, false);
-  if (input.getCols() != 3) {
-    throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
-  }
-  double *inputData = input.data;
-  double *outputData = output.data;
+      outputData[0] = result[0] + result[2];
+      outputData[1] = result[1] + result[3];
+      outputData[2] = total;
 
-#if defined(__AVX__) && 0 == 1
-  __m256d rows[] = {
-    _mm256_set_pd(0.0, rowPtrs[2][0], rowPtrs[1][0], rowPtrs[0][0]),
-    _mm256_set_pd(0.0, rowPtrs[2][1], rowPtrs[1][1], rowPtrs[0][1]),
-    _mm256_set_pd(0.0, rowPtrs[2][2], rowPtrs[1][2], rowPtrs[0][2]),
-  };
+      inputData += 3;
+      outputData += 3;
+    }
 
-  double result[4];
+#elif VISP_HAVE_SSE2
 
-  for (unsigned int i = 0; i < input.getRows(); ++i) {
-    const __m256d xyz = _mm256_set_pd(0.0, inputData[2], inputData[1], inputData[0]);
 
-#if defined(__FMA__)
-    __m256d dp = _mm256_mul_pd(rows[0], xyz);
-    dp = _mm256_fmadd_pd(rows[1], xyz, dp);
-    dp = _mm256_fmadd_pd(rows[2], xyz, dp);
-#else
-    __m256d col1 = _mm256_mul_pd(rows[0], xyz);
-    __m256d col2 = _mm256_mul_pd(rows[1], xyz);
-    __m256d col3 = _mm256_mul_pd(rows[2], xyz);
+    __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]);
+    __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]);
+    __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]);
 
-    __m256d dp = _mm256_add_pd(col1, col2);
-    dp = _mm256_add_pd(dp, col3);
-#endif
+    // Preload third column as scalars
+    double c0 = rowPtrs[0][2];
+    double c1 = rowPtrs[1][2];
+    double c2 = rowPtrs[2][2];
 
-    _mm256_storeu_pd(result, dp);
 
-    outputData[0] = result[0];
-    outputData[1] = result[1];
-    outputData[2] = result[2];
+    for (unsigned i = 0; i < input.getRows(); ++i) {
 
-    inputData += 3;
-    outputData += 3;
-  }
+      const __m128d xy = _mm_loadu_pd(inputData);
 
-#elif USE_SSE
+      const double z = inputData[2];
+      // const __m128d zz = _mm_set_pd(z, z);
 
+      __m128d mul0 = _mm_mul_pd(row01_lo, xy);
+      __m128d mul1 = _mm_mul_pd(row01_hi, xy);
+      __m128d r01 = _mm_hadd_pd(mul0, mul1);
+      __m128d mul2 = _mm_mul_pd(row2_xy, xy);
 
-  __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]);
-  __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]);
-  __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]);
+      // Adding scalars at the end without using simd is faster
+      // __m128d mulzz = _mm_mul_pd(row01_z, zz);
+      // r01 = _mm_add_pd(r01, mulzz);
 
-  // Preload third column as scalars
-  double c0 = rowPtrs[0][2];
-  double c1 = rowPtrs[1][2];
-  double c2 = rowPtrs[2][2];
+      // store result
+      outputData[0] = _mm_cvtsd_f64(r01) + c0 * z;
+      outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z;
+      outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z;
 
-  for (unsigned i = 0; i < input.getRows(); ++i) {
+      inputData += 3;
+      outputData += 3;
+    }
 
-    const __m128d xy = _mm_loadu_pd(inputData);
+#else
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2];
+      outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2];
+      outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2];
+      inputData += 3;
+      outputData += 3;
+    }
+#endif
+  }
 
-    const double z = inputData[2];
-    // const __m128d zz = _mm_set_pd(z, z);
 
-    __m128d mul0 = _mm_mul_pd(row01_lo, xy);
-    __m128d mul1 = _mm_mul_pd(row01_hi, xy);
-    __m128d r01 = _mm_hadd_pd(mul0, mul1);
-    __m128d mul2 = _mm_mul_pd(row2_xy, xy);
 
-    // Adding scalars at the end without using simd is faster
-    // __m128d mulzz = _mm_mul_pd(row01_z, zz);
-    // r01 = _mm_add_pd(r01, mulzz);
+  else { // 3xN matrix
+    if (input.getRows() != 3) {
+      throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
+    }
+    double *inputX = input[0];
+    double *inputY = input[1];
+    double *inputZ = input[2];
+    double *outputX = output[0];
+    double *outputY = output[1];
+    double *outputZ = output[2];
 
-    // store result
-    outputData[0] = _mm_cvtsd_f64(r01) + c0 * z;
-    outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z;
-    outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z;
 
-    inputData += 3;
-    outputData += 3;
-  }
-    // __m128d rowStarts[] = {
-    //   _mm_loadu_pd(rowPtrs[0]),
-    //   _mm_loadu_pd(rowPtrs[1]),
-    //   _mm_loadu_pd(rowPtrs[2])
-    // };
+#if defined(VISP_HAVE_AVX)
 
-    // __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]);
-    // __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]);
+    __m256d elems[9];
+    for (unsigned int i = 0; i < 9; ++i) {
+      elems[i] = _mm256_set1_pd(data[i]);
+    }
+    for (int i = 0; i <= static_cast<int>(input.getCols()) - 4; i += 4) {
+      const __m256d x4 = _mm256_loadu_pd(inputX);
+      const __m256d y4 = _mm256_loadu_pd(inputY);
+      const __m256d z4 = _mm256_loadu_pd(inputZ);
+
+#if defined(VISP_HAVE_FMA)
+      __m256d dp1 = _mm256_mul_pd(x4, elems[0]);
+      dp1 = _mm256_fmadd_pd(y4, elems[1], dp1);
+      dp1 = _mm256_fmadd_pd(z4, elems[2], dp1);
+      __m256d dp2 = _mm256_mul_pd(x4, elems[3]);
+      dp2 = _mm256_fmadd_pd(y4, elems[4], dp2);
+      dp2 = _mm256_fmadd_pd(z4, elems[5], dp2);
+      __m256d dp3 = _mm256_mul_pd(x4, elems[6]);
+      dp3 = _mm256_fmadd_pd(y4, elems[7], dp3);
+      dp3 = _mm256_fmadd_pd(z4, elems[8], dp3);
 
-    // const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1
-    // const int dotProdFirstElemMask = 0x11;
+#else
+      const __m256d muls = {
+        _mm256_mul_pd(x4, elems[0]),
+        _mm256_mul_pd(y4, elems[1]),
+        _mm256_mul_pd(z4, elems[2]),
+
+        _mm256_mul_pd(x4, elems[3]),
+        _mm256_mul_pd(y4, elems[4]),
+        _mm256_mul_pd(z4, elems[5]),
+
+        _mm256_mul_pd(x4, elems[6]),
+        _mm256_mul_pd(y4, elems[7]),
+        _mm256_mul_pd(z4, elems[8]),
+      }
 
-    // for (unsigned int i = 0; i < input.getRows(); ++i) {
-    //   __m128d xy = _mm_loadu_pd(inputData);
-    //   __m128d zz = _mm_set_pd(inputData[2], inputData[2]);
-    //   // Perform dot products with the two first elements of each row
-    //   __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask);
-    //   __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask);
-    //   __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask);
+      __m256d dp1 = _mm256_add_pd(_mm256_add_pd(muls[0], muls[1]), muls[2]);
+      __m256d dp2 = _mm256_add_pd(_mm256_add_pd(muls[3], muls[4]), muls[5]);
+      __m256d dp3 = _mm256_add_pd(_mm256_add_pd(muls[6], muls[7]), muls[8]);
 
-    //   __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2);
 
-    //   // Dot product between last columns of the rows and Z
-    //   __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols
-    //   __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one
+#endif
 
-    //   __m128d r3z = _mm_add_pd(r3, lastColz);
+      _mm256_storeu_pd(outputX, dp1);
+      _mm256_storeu_pd(outputY, dp2);
+      _mm256_storeu_pd(outputZ, dp3);
 
-    //   _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy));
 
-    //   outputData[2] = _mm_cvtsd_f64(r3z);
-    //   inputData += 3;
-    //   outputData += 3;
-    // }
+      inputX += 4; inputY += 4; inputZ += 4;
+      outputX += 4; outputY += 4; outputZ += 4;
 
+    }
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = (input.getCols() / 4) * 4; i < input.getCols(); ++i) {
+      // std::cout << "i = " << i << std::endl;
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z;
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z;
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z;
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
 
 #else
-  double *r0 = rowPtrs[0];
-  double *r1 = rowPtrs[1];
-  double *r2 = rowPtrs[2];
-
-  for (unsigned int i = 0; i < input.getRows(); ++i) {
-    outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2];
-    outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2];
-    outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2];
-    inputData += 3;
-    outputData += 3;
-  }
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z;
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z;
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z;
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
 #endif
+  }
+
+
+
 }
 
 
diff --git a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
index cd8136b900..9539dc6aa1 100644
--- a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
+++ b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
@@ -179,10 +179,8 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
       for (vpMatrix *m: matrices) {
         m->resize(3, numPoints, false, false);
       }
-      m_valid.resize(numPoints);
 
       for (unsigned int i = 0; i < numPoints; ++i) {
-        m_valid[i] = true;
         for (unsigned int j = 0; j < 3; ++j) {
           m_oXt[j][i] = points[i].oX[j];
           m_oNt[j][i] = points[i].objectNormal[j];
@@ -190,89 +188,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
         }
       }
     }
-    inline void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, const vpImage<float> &depth, vpColVector &e, vpMatrix &L)
-    {
-      const vpRotationMatrix cRo = cMo.getRotationMatrix();
-      const vpTranslationVector t = cMo.getTranslationVector();
-      vpMatrix::mult2Matrices(cRo, m_oXt, m_cXt);
-      vpMatrix::mult2Matrices(cRo, m_oNt, m_cNt);
-
-      const unsigned int numPoints = m_oXt.getCols();
-      e.resize(numPoints, false);
-      L.resize(numPoints, 6, false, false);
-
-#ifdef VISP_HAVE_OPENMP
-#pragma omp parallel for
-#endif
-      for (int i = 0; i < static_cast<int>(numPoints); ++i) {
-
-        //Step 1: update and filter out points that are no longer valid
-        {
-
-          for (unsigned int j = 0; j < 3; ++j) {
-            m_cXt[j][i] += t[j];
-          }
-          // Plane points away from the camera: this surface is no longer visible due to rotation
-          if (m_cNt[2][i] >= 0.0) {
-            m_valid[i] = false;
-            continue;
-          }
-          double x, y, u, v;
-          x = m_cXt[0][i] / m_cXt[2][i];
-          y = m_cXt[1][i] / m_cXt[2][i];
-
-          vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v);
-          // Point is no longer in image: depth value cannot be sampled
-          if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) {
-            m_valid[i] = false;
-            continue;
-          }
-          const double Z = depth[static_cast<unsigned int>(v)][static_cast<unsigned int>(u)];
-          // Z value in the depth image from the camera is invalid
-          if (Z <= 0.0) {
-            m_valid[i] = false;
-            continue;
-          }
-
-          m_valid[i] = true;
-          m_observations[0][i] = x * Z;
-          m_observations[1][i] = y * Z;
-          m_observations[2][i] = Z;
-        }
-        // Step 2: update Jacobian and error for valid points
-        {
-          const double X = m_cXt[0][i], Y = m_cXt[1][i], Z = m_cXt[2][i];
-          const double nX = m_cNt[0][i], nY = m_cNt[1][i], nZ = m_cNt[2][i];
+    void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo,
+       const vpImage<float> &depth, vpColVector &e, vpMatrix &L);
 
-          const double D = -((nX * X) + (nY  * Y) + (nZ * Z));
-          double projNormal = nX * m_observations[0][i] + nY  * m_observations[1][i] + nZ * m_observations[2][i];
-
-          e[i] = D + projNormal;
-
-          L[i][0] = nX;
-          L[i][1] = nY;
-          L[i][2] = nZ;
-          L[i][3] = nZ * Y - nY * Z;
-          L[i][4] = nX * Z - nZ * X;
-          L[i][5] = nY * X - nX * Y;
-
-        }
-      }
-
-      // Disable invalid points
-      for (unsigned int i = 0; i < numPoints; ++i) {
-        if (!m_valid[i]) {
-          e[i] = 0.0;
-
-          L[i][0] = 0;
-          L[i][1] = 0;
-          L[i][2] = 0;
-          L[i][3] = 0;
-          L[i][4] = 0;
-          L[i][5] = 0;
-        }
-      }
-    }
 
     const vpMatrix &getPointsObject() const { return m_oXt; }
     const vpMatrix &getNormalsObject() const { return m_oNt; }
@@ -285,7 +203,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
     vpMatrix m_oNt; // a 3xN matrix containing the 3D normals on the object
     vpMatrix m_cXt;
     vpMatrix m_cNt;
-    std::vector<bool> m_valid;
+    std::vector<unsigned int> m_invalid;
 
   };
 
@@ -304,7 +222,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
       {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT, "weight"},
       {vpRBDenseDepthTracker::vpDisplayType::DT_ERROR, "error"},
       {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT_AND_ERROR, "weightAndError"}
-    });
+});
 
   virtual void loadJsonConfiguration(const nlohmann::json &j) VP_OVERRIDE
   {
@@ -330,6 +248,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
   bool m_useMask;
   float m_minMaskConfidence;
   vpDisplayType m_displayType;
+
+  vpMatrix m_Lt;
+
 };
 
 END_VISP_NAMESPACE
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 2a69b7bfa7..bd17100062 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -30,6 +30,7 @@
 
 #include <visp3/rbt/vpRBDenseDepthTracker.h>
 #include <visp3/core/vpMeterPixelConversion.h>
+#include <visp3/core/vpSIMDUtils.h>
 #include <visp3/core/vpDisplay.h>
 
 #ifdef VISP_HAVE_OPENMP
@@ -185,7 +186,12 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
     return;
   }
 
-  m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_L);
+  double t1 = vpTime::measureTimeMs();
+  m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt);
+  m_L = m_Lt.t();
+
+  double t2 = vpTime::measureTimeMs();
+  std::cout << "Update took: " << (t2 - t1) << std::endl;
 
   m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
   m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
@@ -199,53 +205,243 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
       m_L[i][dof] *= m_weights[i];
     }
   }
-
   m_LTL = m_L.AtA();
   computeJTR(m_L, m_weighted_error, m_LTR);
   m_vvsConverged = false;
 }
 
-void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage<unsigned char> &/*I*/,
-                                    const vpImage<vpRGBa> &/*IRGB*/, const vpImage<unsigned char> &depth) const
+void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end)
 {
-  switch (m_displayType) {
-  case DT_SIMPLE:
-  {
-    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-      vpColor c(0, 255, 0);
-      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
-    }
-    break;
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp parallel for
+#endif
+  for (unsigned int i = start; i < end; ++i) {
+    const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i];
+    const double nX = cNt[0][i], nY = cNt[1][i], nZ = cNt[2][i];
+
+    const double D = -((nX * X) + (nY  * Y) + (nZ * Z));
+    const double projNormal = nX * obsT[0][i] + nY  * obsT[1][i] + nZ * obsT[2][i];
+
+    e[i] = D + projNormal;
+
+    Lt[0][i] = nX;
+    Lt[1][i] = nY;
+    Lt[2][i] = nZ;
+    Lt[3][i] = nZ * Y - nY * Z;
+    Lt[4][i] = nX * Z - nZ * X;
+    Lt[5][i] = nY * X - nX * Y;
   }
-  case DT_WEIGHT:
-  {
-    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-      vpColor c(0, static_cast<unsigned char>(m_weights[i] * 255), 0);
-      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
-    }
-    break;
+
+}
+
+
+#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
+void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, const  vpMatrix &obsT, vpColVector &e, vpMatrix &Lt)
+{
+
+#if defined(VISP_HAVE_AVX2)
+
+  using Register = __m512d;
+  constexpr int numLanes = 8;
+  constexpr auto &loadu = _mm512_loadu_pd;
+  constexpr auto &mul = _mm512_mul_pd;
+  constexpr auto &sub = _mm512_sub_pd;
+  constexpr auto &storeu = _mm512_storeu_pd;
+#if defined(VISP_HAVE_FMA)
+  constexpr auto &fma = _mm512_fmadd_pd;
+#else
+  constexpr auto &add = _mm512_add_pd;
+#endif
+#elif defined(VISP_HAVE_AVX)
+  using Register = __m256d;
+  constexpr int numLanes = 4;
+
+  constexpr auto &loadu = _mm256_loadu_pd;
+  constexpr auto &mul = _mm256_mul_pd;
+  constexpr auto &sub = _mm256_sub_pd;
+
+  constexpr auto &storeu = _mm256_storeu_pd;
+#if defined(VISP_HAVE_FMA)
+  constexpr auto &fma = _mm256_fmadd_pd;
+#else
+  constexpr auto &add = _mm256_add_pd;
+#endif
+#elif VISP_HAVE_SSE2
+  using Register = __m128d;
+  constexpr int numLanes = 2;
+
+  constexpr auto &loadu = _mm_loadu_pd;
+  constexpr auto &mul = _mm_mul_pd;
+  constexpr auto &sub = _mm_sub_pd;
+
+  constexpr auto &storeu = _mm_storeu_pd;
+#if defined(VISP_HAVE_FMA)
+  constexpr auto &fma = _mm_fmadd_pd;
+#else
+  constexpr auto &add = _mm_add_pd;
+#endif
+#endif
+
+  for (int i = 0; i <= static_cast<int>(cXt.getCols()) - numLanes; i += numLanes) {
+    const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i);
+    const Register nX = loadu(cNt[0] + i), nY = loadu(cNt[1] + i), nZ = loadu(cNt[2] + i);
+    const Register obsX = loadu(obsT[0] + i), obsY = loadu(obsT[1] + i), obsZ = loadu(obsT[2] + i);
+
+    Register D = mul(nX, X);
+    Register projNormal = mul(obsX, X);
+#if defined(VISP_HAVE_FMA)
+    D = fma(nY, Y, D);
+    D = fma(nZ, Z, D);
+
+    projNormal = fma(nY, obsY, projNormal);
+    projNormal = fma(nZ, obsZ, projNormal);
+#else
+    D = add(mul(nY, Y), D);
+    D = add(mul(nZ, Z), D);
+
+    projNormal = add(mul(nY, obsY), projNormal);
+    projNormal = add(mul(nZ, obsZ), projNormal);
+#endif
+    storeu(e.data + i, sub(projNormal, D));
+
+    storeu(Lt[0] + i, nX);
+    storeu(Lt[1] + i, nY);
+    storeu(Lt[2] + i, nZ);
+
+    storeu(Lt[3] + i, sub(mul(nZ, Y), mul(nY, Z)));
+    storeu(Lt[4] + i, sub(mul(nX, Z), mul(nZ, X)));
+    storeu(Lt[5] + i, sub(mul(nY, X), mul(nX, Y)));
   }
-  case DT_ERROR:
+  errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / numLanes) * numLanes, cXt.getCols());
+}
+#endif
+
+
+
+void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
+  const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo,
+  const vpImage<float> &depth, vpColVector &e, vpMatrix &Lt)
+{
+  const vpRotationMatrix cRo = cMo.getRotationMatrix();
+  cMo.project(m_oXt, m_cXt, false);
+  cRo.rotateVectors(m_oNt, m_cNt, false);
+
+  m_invalid.clear();
+
+  const unsigned int numPoints = m_oXt.getCols();
+  e.resize(numPoints, false);
+  Lt.resize(6, numPoints, false, false);
+
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp parallel
   {
-    double maxError = m_error.getMaxValue();
-    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-      vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255), 0, 0);
-      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
+#endif
+    std::vector<unsigned int> localInvalid;
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp for
+#endif
+
+    for (int i = 0; i < static_cast<int>(numPoints); ++i) {
+      //Step 1: update and filter out points that are no longer valid
+      {
+        // Plane points away from the camera: this surface is no longer visible due to rotation
+        if (m_cNt[2][i] >= 0.0) {
+          localInvalid.push_back(i);
+          continue;
+        }
+        double x, y, u, v;
+        x = m_cXt[0][i] / m_cXt[2][i];
+        y = m_cXt[1][i] / m_cXt[2][i];
+
+        vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v);
+        // Point is no longer in image: depth value cannot be sampled
+        if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) {
+          localInvalid.push_back(i);
+          continue;
+        }
+        const double Z = depth[static_cast<unsigned int>(v)][static_cast<unsigned int>(u)];
+        // Z value in the depth image from the camera is invalid
+        if (Z <= 0.0) {
+          localInvalid.push_back(i);
+          continue;
+        }
+
+        m_observations[0][i] = x * Z;
+        m_observations[1][i] = y * Z;
+        m_observations[2][i] = Z;
+      }
+    }
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp critical
+    {
+      m_invalid.insert(m_invalid.end(), localInvalid.begin(), localInvalid.end());
     }
-    break;
   }
-  case DT_WEIGHT_AND_ERROR:
+#else
+    m_invalid = std::move(localInvalid);
+#endif
+
+
+#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
+    errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt);
+#else
+    errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints);
+#endif
+
+  // Disable invalid points
+    for (unsigned int i : m_invalid) {
+      e[i] = 0.0;
+
+      Lt[0][i] = 0;
+      Lt[1][i] = 0;
+      Lt[2][i] = 0;
+      Lt[3][i] = 0;
+      Lt[4][i] = 0;
+      Lt[5][i] = 0;
+    }
+}
+
+  void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage<unsigned char> &/*I*/,
+                                      const vpImage<vpRGBa> &/*IRGB*/, const vpImage<unsigned char> &depth) const
   {
-    double maxError = m_error.getMaxValue();
-    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-      vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255.0), static_cast<unsigned char>(m_weights[i] * 255), 0);
-      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
+    switch (m_displayType) {
+    case DT_SIMPLE:
+    {
+      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+        vpColor c(0, 255, 0);
+        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
+      }
+      break;
+    }
+    case DT_WEIGHT:
+    {
+      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+        vpColor c(0, static_cast<unsigned char>(m_weights[i] * 255), 0);
+        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
+      }
+      break;
+    }
+    case DT_ERROR:
+    {
+      double maxError = m_error.getMaxValue();
+      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+        vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255), 0, 0);
+        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
+      }
+      break;
+    }
+    case DT_WEIGHT_AND_ERROR:
+    {
+      double maxError = m_error.getMaxValue();
+      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+        vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255.0), static_cast<unsigned char>(m_weights[i] * 255), 0);
+        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
+      }
+      break;
+    }
+    default:
+      throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented");
     }
-    break;
-  }
-  default:
-    throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented");
   }
-}
 
-END_VISP_NAMESPACE
+  END_VISP_NAMESPACE

From 690c718bf77a06e905f232ff81325c61e528f030 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Wed, 3 Dec 2025 19:14:57 +0100
Subject: [PATCH 06/24] Export SIMD intrinsics utils in a separate header

---
 modules/core/include/visp3/core/vpSIMDUtils.h | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 modules/core/include/visp3/core/vpSIMDUtils.h

diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h
new file mode 100644
index 0000000000..0342fbf06b
--- /dev/null
+++ b/modules/core/include/visp3/core/vpSIMDUtils.h
@@ -0,0 +1,89 @@
+/*
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See https://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Rotation matrix.
+ */
+
+/*!
+  \file vpSIMDUtils.h
+  \brief Header that defines and includes useful SIMD routines and macros
+*/
+#include <visp3/core/vpConfig.h>
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#define VISP_HAVE_SSE2 1
+#endif
+
+#if defined __AVX2__
+#define VISP_HAVE_AVX2 1
+#endif
+
+#if defined __AVX__
+#define VISP_HAVE_AVX 1
+#endif
+
+// https://stackoverflow.com/a/40765925
+#if !defined(__FMA__) && defined(__AVX2__)
+#define __FMA__ 1
+#endif
+
+
+#if defined(__FMA__)
+#define VISP_HAVE_FMA
+#endif
+
+#if defined _WIN32 && defined(_M_ARM64)
+#define _ARM64_DISTINCT_NEON_TYPES
+#include <Intrin.h>
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#else
+#define VISP_HAVE_NEON 0
+#endif
+
+
+#if VISP_HAVE_SSE2 && USE_SIMD_CODE
+#define USE_SSE 1
+#else
+#define USE_SSE 0
+#endif
+
+#if VISP_HAVE_NEON && USE_SIMD_CODE
+#define USE_NEON 1
+#else
+#define USE_NEON 0
+#endif

From 4c88d39b0cf4ab4c124cfacbb8fa68b862047ae3 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Thu, 4 Dec 2025 17:56:04 +0100
Subject: [PATCH 07/24] Fix test, improve vpRBDenseDepth

---
 modules/core/include/visp3/core/vpSIMDUtils.h | 145 ++++++++++++++++++
 .../transformation/vpHomogeneousMatrix.cpp    |  76 +--------
 .../core/test/math/catchHomogeneousMatrix.cpp |  62 ++++++--
 modules/core/test/math/catchRotation.cpp      |  61 +++++---
 .../src/features/vpRBDenseDepthTracker.cpp    |  82 ++++------
 5 files changed, 262 insertions(+), 164 deletions(-)

diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h
index 0342fbf06b..397def00a8 100644
--- a/modules/core/include/visp3/core/vpSIMDUtils.h
+++ b/modules/core/include/visp3/core/vpSIMDUtils.h
@@ -35,6 +35,9 @@
   \file vpSIMDUtils.h
   \brief Header that defines and includes useful SIMD routines and macros
 */
+
+#ifndef VP_SIMD_UTILS_H
+#define VP_SIMD_UTILS_H
 #include <visp3/core/vpConfig.h>
 
 #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
@@ -87,3 +90,145 @@
 #else
 #define USE_NEON 0
 #endif
+
+namespace vpSIMD
+{
+#if defined(VISP_HAVE_AVX2)
+using Register = __m512d;
+
+inline const int numLanes = 8;
+inline Register add(const Register a, const Register b)
+{
+  return _mm512_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm512_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm512_mul_pd(a, b);
+}
+
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm512_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm512_loadu_pd(data);
+}
+
+
+inline Register set1(double v)
+{
+  return _mm512_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm512_storeu_pd(data, a);
+}
+
+#elif defined(VISP_HAVE_AVX)
+using Register = __m256d;
+inline const int numLanes = 4;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm256_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm256_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm256_mul_pd(a, b);
+}
+
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm256_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm256_loadu_pd(data);
+}
+
+
+inline Register set1(double v)
+{
+  return _mm256_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm256_storeu_pd(data, a);
+}
+
+#elif VISP_HAVE_SSE2
+using Register = __m128d;
+inline const int numLanes = 2;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm_mul_pd(a, b);
+}
+
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm_storeu_pd(data, a);
+}
+
+#endif
+
+}
+
+#endif // VP_SIMD_UTILS_H
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 4867583025..9501e533b2 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -150,43 +150,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
 
 #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
 
-#if defined(VISP_HAVE_AVX2)
-
-    using Register = __m512d;
-    constexpr int numLanes = 8;
-    constexpr auto &set1 = _mm512_set1_pd;
-    constexpr auto &loadu = _mm512_loadu_pd;
-    constexpr auto &mul = _mm512_mul_pd;
-    constexpr auto &add = _mm512_add_pd;
-    constexpr auto &storeu = _mm512_storeu_pd;
-#if defined(VISP_HAVE_FMA)
-    constexpr auto &fma = _mm512_fmadd_pd;
-#endif
-#elif defined(VISP_HAVE_AVX)
-    using Register = __m256d;
-    constexpr int numLanes = 4;
-
-    constexpr auto &set1 = _mm256_set1_pd;
-    constexpr auto &loadu = _mm256_loadu_pd;
-    constexpr auto &mul = _mm256_mul_pd;
-    constexpr auto &add = _mm256_add_pd;
-    constexpr auto &storeu = _mm256_storeu_pd;
-#if defined(VISP_HAVE_FMA)
-    constexpr auto &fma = _mm256_fmadd_pd;
-#endif
-#elif VISP_HAVE_SSE2
-    using Register = __m128d;
-    constexpr int numLanes = 2;
-
-    constexpr auto &set1 = _mm_set1_pd;
-    constexpr auto &loadu = _mm_loadu_pd;
-    constexpr auto &mul = _mm_mul_pd;
-    constexpr auto &add = _mm_add_pd;
-    constexpr auto &storeu = _mm_storeu_pd;
-#if defined(VISP_HAVE_FMA)
-    constexpr auto &fma = _mm_fmadd_pd;
-#endif
-#endif
+    using namespace vpSIMD;
 
     Register elems[12];
     for (unsigned int i = 0; i < 12; ++i) {
@@ -198,52 +162,26 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       const Register y4 = loadu(inputY);
       const Register z4 = loadu(inputZ);
 
-#if defined(VISP_HAVE_FMA)
       Register dp1 = mul(x4, elems[0]);
-      dp1 = fma(y4, elems[1], dp1);
-      dp1 = fma(z4, elems[2], dp1);
+      dp1 = vpSIMD::fma(y4, elems[1], dp1);
+      dp1 = vpSIMD::fma(z4, elems[2], dp1);
       dp1 = add(elems[3], dp1);
 
       Register dp2 = mul(x4, elems[4]);
-      dp2 = fma(y4, elems[5], dp2);
-      dp2 = fma(z4, elems[6], dp2);
+      dp2 = vpSIMD::fma(y4, elems[5], dp2);
+      dp2 = vpSIMD::fma(z4, elems[6], dp2);
       dp2 = add(elems[7], dp2);
 
       Register dp3 = mul(x4, elems[8]);
-      dp3 = fma(y4, elems[9], dp3);
-      dp3 = fma(z4, elems[10], dp3);
+      dp3 = vpSIMD::fma(y4, elems[9], dp3);
+      dp3 = vpSIMD::fma(z4, elems[10], dp3);
       dp3 = add(elems[11], dp3);
 
 
-#else
-      const Register muls[] = {
-        mul(x4, elems[0]),
-        mul(y4, elems[1]),
-        mul(z4, elems[2]),
-
-        mul(x4, elems[4]),
-        mul(y4, elems[5]),
-        mul(z4, elems[6]),
-
-        mul(x4, elems[8]),
-        mul(y4, elems[9]),
-        mul(z4, elems[10]),
-      };
-
-      Register dp1 = add(add(muls[0], muls[1]), muls[2]);
-      Register dp2 = add(add(muls[3], muls[4]), muls[5]);
-      Register dp3 = add(add(muls[6], muls[7]), muls[8]);
-
-      dp1 = add(dp1, elems[3]);
-      dp2 = add(dp2, elems[7]);
-      dp3 = add(dp3, elems[11]);
-#endif
-
       storeu(outputX, dp1);
       storeu(outputY, dp2);
       storeu(outputZ, dp3);
 
-
       inputX += numLanes; inputY += numLanes; inputZ += numLanes;
       outputX += numLanes; outputY += numLanes; outputZ += numLanes;
 
diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index e3b6c00f20..47aef0bb9f 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -231,23 +231,32 @@ TEST_CASE("Point projection", "project")
   std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
   for (unsigned int N : NS) {
     std::cout << "Running for N = " << N << std::endl;
-    std::vector<double> timeProject, timeMult, timeNaive;
+    std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
     vpUniRand r(42);
     for (unsigned int trial = 0; trial < 100000; ++trial) {
-      vpHomogeneousMatrix M;
-      vpMatrix input(N, 3);
+      vpHomogeneousMatrix M(
+        r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, 1.0),
+        r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI));
+      vpMatrix inputT(N, 3);
       for (unsigned int i = 0; i< N; ++i) {
-        input[i][0] = r.uniform(0.0, 1.0);
-        input[i][1] = r.uniform(0.0, 1.0);
-        input[i][2] = r.uniform(0.0, 1.0);
+        inputT[i][0] = r.uniform(0.0, 1.0);
+        inputT[i][1] = r.uniform(0.0, 1.0);
+        inputT[i][2] = r.uniform(0.0, 1.0);
       }
-      vpMatrix output(N, 3);
+      vpMatrix input = inputT.t();
+      vpMatrix outputT(N, 3);
+      vpMatrix output(3, N);
+
 
       double t1 = vpTime::measureTimeMs();
-      M.project(input, output);
+      M.project(inputT, outputT, true);
       double t2 = vpTime::measureTimeMs();
-      timeProject.push_back(t2 - t1);
+      timeProjectTransposed.push_back(t2 - t1);
 
+      t1 = vpTime::measureTimeMs();
+      M.project(input, output, false);
+      t2 = vpTime::measureTimeMs();
+      timeProject.push_back(t2 - t1);
 
       vpColVector x(4, 1);
       vpColVector res(4);
@@ -255,10 +264,10 @@ TEST_CASE("Point projection", "project")
       vpMatrix outputR(N, 3);
       double t1r = vpTime::measureTimeMs();
 
-      for (unsigned int i = 0; i < input.getRows(); ++i) {
-        x[0] = input[i][0];
-        x[1] = input[i][1];
-        x[2] = input[i][2];
+      for (unsigned int i = 0; i < inputT.getRows(); ++i) {
+        x[0] = inputT[i][0];
+        x[1] = inputT[i][1];
+        x[2] = inputT[i][2];
 
         vpColVector res = M * x;
         outputR[i][0] = res[0];
@@ -270,6 +279,12 @@ TEST_CASE("Point projection", "project")
 
 
       vpMatrix input4(4, N);
+      for (unsigned int i = 0; i < inputT.getRows(); ++i) {
+        input4[0][i] = inputT[i][0];
+        input4[1][i] = inputT[i][1];
+        input4[2][i] = inputT[i][2];
+        input4[3][i] = 1;
+      }
       vpMatrix output4(4, N);
       double t14 = vpTime::measureTimeMs();
 
@@ -277,12 +292,29 @@ TEST_CASE("Point projection", "project")
       double t24 = vpTime::measureTimeMs();
       timeMult.push_back(t24 - t14);
 
-      CHECK((outputR == output));
+      double errorT = (outputR - outputT).frobeniusNorm() / N;
+      double error = (outputR - output.t()).frobeniusNorm() / N;
+
+      if (errorT > 1e-10 || error > 1e-10) {
+        std::cout << "M = " << M << std::endl;
+        std::cerr << "Naive outputT = "  << std::endl << outputR << std::endl;
+        std::cerr << "Mult outputT = "  << std::endl << output4.t() << std::endl;
+        std::cerr << "Project output transposed version = "  << std::endl << outputT << std::endl;
+        std::cerr << "Project output = "  << std::endl << outputT<< std::endl;
+
+
+        std::cerr << "diff transposed = "  << std::endl << outputR - outputT;
+        std::cerr << "diff = "  << std::endl << outputR - output.t();
+
+        FAIL();
+      }
     }
+    std::cout << "Optimized version (transposed) took: " << vpMath::getMean(timeProjectTransposed) << " +-" << vpMath::getStdev(timeProjectTransposed) <<  "ms" << std::endl;
     std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
+
     std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl;
     std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
-    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl;
+    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl;
 
   }
 
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 51455e8059..00e396ee48 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -566,21 +566,27 @@ TEST_CASE("Vector rotation", "project")
   std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
   for (unsigned int N : NS) {
     std::cout << "Running for N = " << N << std::endl;
-    std::vector<double> timeProject, timeMult, timeNaive;
+    std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
     vpUniRand r(42);
     for (unsigned int trial = 0; trial < 100000; ++trial) {
-      vpRotationMatrix M;
-      vpMatrix input(N, 3);
+      vpRotationMatrix M(r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI));
+      vpMatrix inputT(N, 3);
+
       for (unsigned int i = 0; i< N; ++i) {
-        input[i][0] = r.uniform(0.0, 1.0);
-        input[i][1] = r.uniform(0.0, 1.0);
-        input[i][2] = r.uniform(0.0, 1.0);
+        inputT[i][0] = r.uniform(0.0, 1.0);
+        inputT[i][1] = r.uniform(0.0, 1.0);
+        inputT[i][2] = r.uniform(0.0, 1.0);
       }
-      vpMatrix output(N, 3);
-
+      vpMatrix input = inputT.t();
+      vpMatrix outputT(N, 3);
+      vpMatrix output(3, N);
       double t1 = vpTime::measureTimeMs();
-      M.rotateVectors(input, output);
+      M.rotateVectors(inputT, outputT, true);
       double t2 = vpTime::measureTimeMs();
+      timeProjectTransposed.push_back(t2 - t1);
+      t1 = vpTime::measureTimeMs();
+      M.rotateVectors(input, output, false);
+      t2 = vpTime::measureTimeMs();
       timeProject.push_back(t2 - t1);
 
 
@@ -589,10 +595,10 @@ TEST_CASE("Vector rotation", "project")
       vpMatrix outputR(N, 3);
       double t1r = vpTime::measureTimeMs();
 
-      for (unsigned int i = 0; i < input.getRows(); ++i) {
-        x[0] = input[i][0];
-        x[1] = input[i][1];
-        x[2] = input[i][2];
+      for (unsigned int i = 0; i < inputT.getRows(); ++i) {
+        x[0] = inputT[i][0];
+        x[1] = inputT[i][1];
+        x[2] = inputT[i][2];
 
         res = M * x;
         outputR[i][0] = res[0];
@@ -602,27 +608,34 @@ TEST_CASE("Vector rotation", "project")
       double t2r = vpTime::measureTimeMs();
       timeNaive.push_back(t2r - t1r);
 
-
-      vpMatrix input4(3, N);
-      vpMatrix output4(3, N);
+      vpMatrix outputMM(3, N);
       double t14 = vpTime::measureTimeMs();
 
-      vpMatrix::mult2Matrices((vpMatrix)M, input4, output4);
+      vpMatrix::mult2Matrices((vpMatrix)M, input, outputMM);
       double t24 = vpTime::measureTimeMs();
       timeMult.push_back(t24 - t14);
 
-      if (outputR != output) {
-        std::cerr << "Output diff = " << (output - outputR)<< std::endl;
+      double errorT = (outputR - outputT).frobeniusNorm() / N;
+      double error = (outputR - output.t()).frobeniusNorm() / N;
 
+      if (errorT > 1e-10) {
+        std::cerr << "Transposed version failed" << std::endl;
+        FAIL();
+      }
 
+      if (error > 1e-10) {
+        std::cerr << "Optimized 3xN version failed" << std::endl;
+        FAIL();
       }
 
-      CHECK((outputR == output));
+
+
     }
-    std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
-    std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl;
-    std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
-    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl;
+    std::cout << "Optimized (transposed) version took: " << vpMath::getMean(timeProjectTransposed) << " +- " << vpMath::getStdev(timeProjectTransposed) <<  "ms" << std::endl;
+    std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +- " << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
+    std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +- " << vpMath::getStdev(timeMult) << "ms" << std::endl;
+    std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +- " << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
+    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl;
 
   }
 }
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index bd17100062..5fffe2e32d 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -193,11 +193,16 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
   double t2 = vpTime::measureTimeMs();
   std::cout << "Update took: " << (t2 - t1) << std::endl;
 
+  t1 = vpTime::measureTimeMs();
   m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
-  m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
 
+  m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
+  t2 = vpTime::measureTimeMs();
+  std::cout << "M estimator took: " << (t2 - t1) << std::endl;
 
+  t1 = vpTime::measureTimeMs();
 
+#pragma omp parallel for
   for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
     m_weighted_error[i] = m_error[i] * m_weights[i];
     m_covWeightDiag[i] = m_weights[i] * m_weights[i];
@@ -205,15 +210,26 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
       m_L[i][dof] *= m_weights[i];
     }
   }
-  m_LTL = m_L.AtA();
+  t2 = vpTime::measureTimeMs();
+  std::cout << "weight mult took: " << (t2 - t1) << std::endl;
+  t1 = vpTime::measureTimeMs();
+  m_LTL.resize(6, 6);
+  m_L.AtA(m_LTL);
+  t2 = vpTime::measureTimeMs();
+  std::cout << "LTL took: " << (t2 - t1) << std::endl;
+  // m_LTR = m_Lt * m_weighted_error;
+  t1 = vpTime::measureTimeMs();
   computeJTR(m_L, m_weighted_error, m_LTR);
+  t2 = vpTime::measureTimeMs();
+  std::cout << "LTR took: " << (t2 - t1) << std::endl;
+  std::cout << std::endl;
   m_vvsConverged = false;
 }
 
 void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end)
 {
 #if defined(VISP_HAVE_OPENMP)
-#pragma omp parallel for
+#pragma omp parallel for schedule(static, 2048)
 #endif
   for (unsigned int i = start; i < end; ++i) {
     const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i];
@@ -231,7 +247,6 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con
     Lt[4][i] = nX * Z - nZ * X;
     Lt[5][i] = nY * X - nX * Y;
   }
-
 }
 
 
@@ -239,49 +254,11 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con
 void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, const  vpMatrix &obsT, vpColVector &e, vpMatrix &Lt)
 {
 
-#if defined(VISP_HAVE_AVX2)
-
-  using Register = __m512d;
-  constexpr int numLanes = 8;
-  constexpr auto &loadu = _mm512_loadu_pd;
-  constexpr auto &mul = _mm512_mul_pd;
-  constexpr auto &sub = _mm512_sub_pd;
-  constexpr auto &storeu = _mm512_storeu_pd;
-#if defined(VISP_HAVE_FMA)
-  constexpr auto &fma = _mm512_fmadd_pd;
-#else
-  constexpr auto &add = _mm512_add_pd;
-#endif
-#elif defined(VISP_HAVE_AVX)
-  using Register = __m256d;
-  constexpr int numLanes = 4;
-
-  constexpr auto &loadu = _mm256_loadu_pd;
-  constexpr auto &mul = _mm256_mul_pd;
-  constexpr auto &sub = _mm256_sub_pd;
-
-  constexpr auto &storeu = _mm256_storeu_pd;
-#if defined(VISP_HAVE_FMA)
-  constexpr auto &fma = _mm256_fmadd_pd;
-#else
-  constexpr auto &add = _mm256_add_pd;
-#endif
-#elif VISP_HAVE_SSE2
-  using Register = __m128d;
-  constexpr int numLanes = 2;
-
-  constexpr auto &loadu = _mm_loadu_pd;
-  constexpr auto &mul = _mm_mul_pd;
-  constexpr auto &sub = _mm_sub_pd;
+  using namespace vpSIMD;
 
-  constexpr auto &storeu = _mm_storeu_pd;
-#if defined(VISP_HAVE_FMA)
-  constexpr auto &fma = _mm_fmadd_pd;
-#else
-  constexpr auto &add = _mm_add_pd;
-#endif
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp parallel for
 #endif
-
   for (int i = 0; i <= static_cast<int>(cXt.getCols()) - numLanes; i += numLanes) {
     const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i);
     const Register nX = loadu(cNt[0] + i), nY = loadu(cNt[1] + i), nZ = loadu(cNt[2] + i);
@@ -289,19 +266,12 @@ void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, co
 
     Register D = mul(nX, X);
     Register projNormal = mul(obsX, X);
-#if defined(VISP_HAVE_FMA)
-    D = fma(nY, Y, D);
-    D = fma(nZ, Z, D);
+    D = vpSIMD::fma(nY, Y, D);
+    D = vpSIMD::fma(nZ, Z, D);
 
-    projNormal = fma(nY, obsY, projNormal);
-    projNormal = fma(nZ, obsZ, projNormal);
-#else
-    D = add(mul(nY, Y), D);
-    D = add(mul(nZ, Z), D);
+    projNormal = vpSIMD::fma(nY, obsY, projNormal);
+    projNormal = vpSIMD::fma(nZ, obsZ, projNormal);
 
-    projNormal = add(mul(nY, obsY), projNormal);
-    projNormal = add(mul(nZ, obsZ), projNormal);
-#endif
     storeu(e.data + i, sub(projNormal, D));
 
     storeu(Lt[0] + i, nX);

From 77adf9d73dce7b5fccfd8dcdb09f1df0b3784cd9 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Thu, 4 Dec 2025 18:00:38 +0100
Subject: [PATCH 08/24] Remove debug prints

---
 .../src/features/vpRBDenseDepthTracker.cpp    | 21 ++-----------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 5fffe2e32d..7c28b552d6 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -190,19 +190,12 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
   m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt);
   m_L = m_Lt.t();
 
-  double t2 = vpTime::measureTimeMs();
-  std::cout << "Update took: " << (t2 - t1) << std::endl;
-
   t1 = vpTime::measureTimeMs();
   m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
-
   m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
-  t2 = vpTime::measureTimeMs();
-  std::cout << "M estimator took: " << (t2 - t1) << std::endl;
-
-  t1 = vpTime::measureTimeMs();
-
+#if defined(VISP_HAVE_OPENMP)
 #pragma omp parallel for
+#endif
   for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
     m_weighted_error[i] = m_error[i] * m_weights[i];
     m_covWeightDiag[i] = m_weights[i] * m_weights[i];
@@ -210,19 +203,9 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
       m_L[i][dof] *= m_weights[i];
     }
   }
-  t2 = vpTime::measureTimeMs();
-  std::cout << "weight mult took: " << (t2 - t1) << std::endl;
-  t1 = vpTime::measureTimeMs();
   m_LTL.resize(6, 6);
   m_L.AtA(m_LTL);
-  t2 = vpTime::measureTimeMs();
-  std::cout << "LTL took: " << (t2 - t1) << std::endl;
-  // m_LTR = m_Lt * m_weighted_error;
-  t1 = vpTime::measureTimeMs();
   computeJTR(m_L, m_weighted_error, m_LTR);
-  t2 = vpTime::measureTimeMs();
-  std::cout << "LTR took: " << (t2 - t1) << std::endl;
-  std::cout << std::endl;
   m_vvsConverged = false;
 }
 

From 1f3c7c88881dc0dd7c4d11b324f7277fc4fff4f2 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Fri, 5 Dec 2025 15:38:16 +0100
Subject: [PATCH 09/24] Move initVVS to cpp file, resize matrix there

---
 .../include/visp3/rbt/vpRBDenseDepthTracker.h |   2 +-
 .../src/features/vpRBDenseDepthTracker.cpp    | 139 ++++++++++--------
 2 files changed, 79 insertions(+), 62 deletions(-)

diff --git a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
index 9539dc6aa1..07282c2377 100644
--- a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
+++ b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
@@ -151,7 +151,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
 
   void extractFeatures(const vpRBFeatureTrackerInput &frame, const vpRBFeatureTrackerInput &previousFrame, const vpHomogeneousMatrix &cMo) VP_OVERRIDE;
   void trackFeatures(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { }
-  void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { }
+  void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE;
   void computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int iteration) VP_OVERRIDE;
 
   void display(const vpCameraParameters &cam, const vpImage<unsigned char> &I, const vpImage<vpRGBa> &IRGB, const vpImage<unsigned char> &depth) const VP_OVERRIDE;
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 7c28b552d6..87fc23c564 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -32,6 +32,7 @@
 #include <visp3/core/vpMeterPixelConversion.h>
 #include <visp3/core/vpSIMDUtils.h>
 #include <visp3/core/vpDisplay.h>
+#include <visp3/mbt/vpMbtTukeyEstimator.h>
 
 #ifdef VISP_HAVE_OPENMP
 #include <omp.h>
@@ -173,6 +174,17 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame
   }
 }
 
+void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/)
+{
+  m_LTL.resize(6, 6, false, false);
+  m_LTR.resize(6, false);
+  m_L.resize(m_numFeatures, 6, false, false);
+  m_Lt.resize(6, m_numFeatures, false, false);
+  m_error.resize(m_numFeatures, false);
+  m_weighted_error.resize(m_numFeatures, false);
+  m_weights.resize(m_numFeatures, false);
+}
+
 void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int /*iteration*/)
 {
   if (m_numFeatures == 0) {
@@ -188,14 +200,21 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
 
   double t1 = vpTime::measureTimeMs();
   m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt);
-  m_L = m_Lt.t();
+  m_Lt.transpose(m_L);
+  std::cout << "Update took: " << (vpTime::measureTimeMs() - t1) << std::endl;
 
   t1 = vpTime::measureTimeMs();
-  m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
-  m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
-#if defined(VISP_HAVE_OPENMP)
-#pragma omp parallel for
-#endif
+  // m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
+  // m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
+
+  vpMbtTukeyEstimator<double> robust;
+  // m_robust.setMinMedianAbsoluteDeviation();
+  robust.MEstimator(m_error, m_weights, (frame.renders.zFar - frame.renders.zNear) * 0.01);
+  std::cout << "MEstimator took: " << (vpTime::measureTimeMs() - t1) << std::endl;
+
+// #if defined(VISP_HAVE_OPENMP)
+// #pragma omp parallel for
+// #endif
   for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
     m_weighted_error[i] = m_error[i] * m_weights[i];
     m_covWeightDiag[i] = m_weights[i] * m_weights[i];
@@ -240,7 +259,7 @@ void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, co
   using namespace vpSIMD;
 
 #if defined(VISP_HAVE_OPENMP)
-#pragma omp parallel for
+#pragma omp parallel for schedule(static, 1024)
 #endif
   for (int i = 0; i <= static_cast<int>(cXt.getCols()) - numLanes; i += numLanes) {
     const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i);
@@ -284,11 +303,10 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
   const unsigned int numPoints = m_oXt.getCols();
   e.resize(numPoints, false);
   Lt.resize(6, numPoints, false, false);
-
 #if defined(VISP_HAVE_OPENMP)
 #pragma omp parallel
-  {
 #endif
+  {
     std::vector<unsigned int> localInvalid;
 #if defined(VISP_HAVE_OPENMP)
 #pragma omp for
@@ -329,72 +347,71 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
     {
       m_invalid.insert(m_invalid.end(), localInvalid.begin(), localInvalid.end());
     }
-  }
 #else
     m_invalid = std::move(localInvalid);
 #endif
-
+  }
 
 #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
-    errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt);
+  errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt);
 #else
-    errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints);
+  errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints);
 #endif
-
   // Disable invalid points
-    for (unsigned int i : m_invalid) {
-      e[i] = 0.0;
-
-      Lt[0][i] = 0;
-      Lt[1][i] = 0;
-      Lt[2][i] = 0;
-      Lt[3][i] = 0;
-      Lt[4][i] = 0;
-      Lt[5][i] = 0;
-    }
+  for (unsigned int i : m_invalid) {
+    e[i] = 0.0;
+
+    Lt[0][i] = 0;
+    Lt[1][i] = 0;
+    Lt[2][i] = 0;
+    Lt[3][i] = 0;
+    Lt[4][i] = 0;
+    Lt[5][i] = 0;
+  }
+
 }
 
-  void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage<unsigned char> &/*I*/,
-                                      const vpImage<vpRGBa> &/*IRGB*/, const vpImage<unsigned char> &depth) const
+void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage<unsigned char> &/*I*/,
+                                    const vpImage<vpRGBa> &/*IRGB*/, const vpImage<unsigned char> &depth) const
+{
+  switch (m_displayType) {
+  case DT_SIMPLE:
   {
-    switch (m_displayType) {
-    case DT_SIMPLE:
-    {
-      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-        vpColor c(0, 255, 0);
-        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
-      }
-      break;
-    }
-    case DT_WEIGHT:
-    {
-      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-        vpColor c(0, static_cast<unsigned char>(m_weights[i] * 255), 0);
-        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
-      }
-      break;
+    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+      vpColor c(0, 255, 0);
+      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
     }
-    case DT_ERROR:
-    {
-      double maxError = m_error.getMaxValue();
-      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-        vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255), 0, 0);
-        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
-      }
-      break;
+    break;
+  }
+  case DT_WEIGHT:
+  {
+    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+      vpColor c(0, static_cast<unsigned char>(m_weights[i] * 255), 0);
+      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
     }
-    case DT_WEIGHT_AND_ERROR:
-    {
-      double maxError = m_error.getMaxValue();
-      for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
-        vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255.0), static_cast<unsigned char>(m_weights[i] * 255), 0);
-        vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
-      }
-      break;
+    break;
+  }
+  case DT_ERROR:
+  {
+    double maxError = m_error.getMaxValue();
+    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+      vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255), 0, 0);
+      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
     }
-    default:
-      throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented");
+    break;
+  }
+  case DT_WEIGHT_AND_ERROR:
+  {
+    double maxError = m_error.getMaxValue();
+    for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
+      vpColor c(static_cast<unsigned int>((m_error[i] / maxError) * 255.0), static_cast<unsigned char>(m_weights[i] * 255), 0);
+      vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1);
     }
+    break;
   }
+  default:
+    throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented");
+  }
+}
 
-  END_VISP_NAMESPACE
+END_VISP_NAMESPACE

From cc729b6115b9421133b4719fe862ec20b68ddf59 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Fri, 5 Dec 2025 18:20:16 +0100
Subject: [PATCH 10/24] Add ENABLE_NATIVE_ARCH option for gcc

---
 CMakeLists.txt                       |  3 +++
 cmake/AddExtraCompilationFlags.cmake | 37 ++++++++++++++++------------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e208d9a3a..a7f52d161a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -381,6 +381,9 @@ VP_OPTION(ENABLE_SSSE3 "" "" "Enable SSSE3 instructions" "" ON IF ((MSVC OR CMAK
 if(X86_64)
   VP_OPTION(ENABLE_AVX   "" "" "Enable AVX instructions"   "" OFF) # should be explicitly enabled, used in matrix transpose code
 endif()
+if(CMAKE_COMPILER_IS_GNUCXX)
+  VP_OPTION(ENABLE_NATIVE_ARCH  "" "" "Enable all available CPU instruction sets"   "" OFF)
+endif()
 
 #----------------------------------------------------------------------
 # BLAS / LAPACK
diff --git a/cmake/AddExtraCompilationFlags.cmake b/cmake/AddExtraCompilationFlags.cmake
index 35f0a269dc..4eba86f73d 100644
--- a/cmake/AddExtraCompilationFlags.cmake
+++ b/cmake/AddExtraCompilationFlags.cmake
@@ -171,25 +171,30 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     add_extra_compiler_options("-fvisibility-inlines-hidden")
   endif()
 
-  if(ENABLE_AVX AND X86_64)
-    add_extra_compiler_options("-mavx")
+  if(ENABLE_NATIVE_ARCH)
+    add_extra_compiler_options("-march=native")
   else()
-    if(ENABLE_SSE2)
-      add_extra_compiler_options("-msse2")
-    elseif(X86 OR X86_64)
-      add_extra_compiler_options("-mno-sse2")
-    endif()
 
-    if(ENABLE_SSE3)
-      add_extra_compiler_options("-msse3")
-    elseif(X86 OR X86_64)
-      #add_extra_compiler_options("-mno-sse3")
-    endif()
+    if(ENABLE_AVX AND X86_64)
+      add_extra_compiler_options("-mavx")
+    else()
+      if(ENABLE_SSE2)
+        add_extra_compiler_options("-msse2")
+      elseif(X86 OR X86_64)
+        add_extra_compiler_options("-mno-sse2")
+      endif()
+
+      if(ENABLE_SSE3)
+        add_extra_compiler_options("-msse3")
+      elseif(X86 OR X86_64)
+        #add_extra_compiler_options("-mno-sse3")
+      endif()
 
-    if(ENABLE_SSSE3)
-      add_extra_compiler_options("-mssse3")
-    elseif(X86 OR X86_64)
-      add_extra_compiler_options("-mno-ssse3")
+      if(ENABLE_SSSE3)
+        add_extra_compiler_options("-mssse3")
+      elseif(X86 OR X86_64)
+        add_extra_compiler_options("-mno-ssse3")
+      endif()
     endif()
   endif()
 

From a05c32905fcd1e90481cea631195e6c7283758f7 Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Fri, 5 Dec 2025 18:21:08 +0100
Subject: [PATCH 11/24] Remove reference to MBT tukey estimator, disable prints

---
 modules/core/include/visp3/core/vpSIMDUtils.h |  4 +-
 modules/core/src/math/robust/vpRobust.cpp     | 12 ++--
 .../math/transformation/vpRotationMatrix.cpp  | 66 +++++++------------
 .../src/features/vpRBDenseDepthTracker.cpp    | 21 ++----
 4 files changed, 38 insertions(+), 65 deletions(-)

diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h
index 397def00a8..2d08d4c787 100644
--- a/modules/core/include/visp3/core/vpSIMDUtils.h
+++ b/modules/core/include/visp3/core/vpSIMDUtils.h
@@ -96,8 +96,8 @@ namespace vpSIMD
 #if defined(VISP_HAVE_AVX2)
 using Register = __m512d;
 
-inline const int numLanes = 8;
-inline Register add(const Register a, const Register b)
+inline constexpr int numLanes = 8;
+inline const Register add(const Register a, const Register b)
 {
   return _mm512_add_pd(a, b);
 }
diff --git a/modules/core/src/math/robust/vpRobust.cpp b/modules/core/src/math/robust/vpRobust.cpp
index 27fe9159c6..56be0a0c59 100644
--- a/modules/core/src/math/robust/vpRobust.cpp
+++ b/modules/core/src/math/robust/vpRobust.cpp
@@ -107,13 +107,14 @@ vpRobust &vpRobust::operator=(const vpRobust &&other)
 void vpRobust::resize(unsigned int n_data)
 {
   if (n_data != m_size) {
-    m_normres.resize(n_data);
-    m_sorted_normres.resize(n_data);
-    m_sorted_residues.resize(n_data);
+    m_normres.resize(n_data, false);
+    m_sorted_normres.resize(n_data, false);
+    m_sorted_residues.resize(n_data, false);
     m_size = n_data;
   }
 }
 
+
 // ===================================================================
 /*!
 
@@ -143,6 +144,8 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector
 
   // Calculate median
   med = select(m_sorted_residues, 0, n_data - 1, ind_med);
+
+  // med = select(m_sorted_residues, 0, n_data - 1, ind_med);
   // --comment: residualMedian = med
 
   // Normalize residues
@@ -153,6 +156,7 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector
 
   // Calculate MAD
   normmedian = select(m_sorted_normres, 0, n_data - 1, ind_med);
+
   // normalizedResidualMedian = normmedian ;
   // 1.48 keeps scale estimate consistent for a normal probability dist.
   m_mad = 1.4826 * normmedian; // median Absolute Deviation
@@ -534,7 +538,7 @@ double vpRobust::constrainedChi(vpRobustEstimatorType method, double x)
   default: {
     throw(vpException(vpException::fatalError, "Unsupported robust estimator type in vpRobust::constrainedChi()"));
   }
-  }
+}
 
   return -1;
 }
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index fa5e78e9c3..8808039ed0 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -157,6 +157,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
 
 
   else { // 3xN matrix
+    using namespace vpSIMD;
     if (input.getRows() != 3) {
       throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
     }
@@ -168,64 +169,43 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     double *outputZ = output[2];
 
 
-#if defined(VISP_HAVE_AVX)
+#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2)
 
-    __m256d elems[9];
+    Register elems[9];
     for (unsigned int i = 0; i < 9; ++i) {
-      elems[i] = _mm256_set1_pd(data[i]);
+      elems[i] = set1(data[i]);
     }
-    for (int i = 0; i <= static_cast<int>(input.getCols()) - 4; i += 4) {
-      const __m256d x4 = _mm256_loadu_pd(inputX);
-      const __m256d y4 = _mm256_loadu_pd(inputY);
-      const __m256d z4 = _mm256_loadu_pd(inputZ);
-
-#if defined(VISP_HAVE_FMA)
-      __m256d dp1 = _mm256_mul_pd(x4, elems[0]);
-      dp1 = _mm256_fmadd_pd(y4, elems[1], dp1);
-      dp1 = _mm256_fmadd_pd(z4, elems[2], dp1);
-      __m256d dp2 = _mm256_mul_pd(x4, elems[3]);
-      dp2 = _mm256_fmadd_pd(y4, elems[4], dp2);
-      dp2 = _mm256_fmadd_pd(z4, elems[5], dp2);
-      __m256d dp3 = _mm256_mul_pd(x4, elems[6]);
-      dp3 = _mm256_fmadd_pd(y4, elems[7], dp3);
-      dp3 = _mm256_fmadd_pd(z4, elems[8], dp3);
+    for (int i = 0; i <= static_cast<int>(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
+      const Register x4 = loadu(inputX);
+      const Register y4 = loadu(inputY);
+      const Register z4 = loadu(inputZ);
 
-#else
-      const __m256d muls = {
-        _mm256_mul_pd(x4, elems[0]),
-        _mm256_mul_pd(y4, elems[1]),
-        _mm256_mul_pd(z4, elems[2]),
-
-        _mm256_mul_pd(x4, elems[3]),
-        _mm256_mul_pd(y4, elems[4]),
-        _mm256_mul_pd(z4, elems[5]),
-
-        _mm256_mul_pd(x4, elems[6]),
-        _mm256_mul_pd(y4, elems[7]),
-        _mm256_mul_pd(z4, elems[8]),
-      }
+      Register dp1 = mul(x4, elems[0]);
+      dp1 = vpSIMD::fma(y4, elems[1], dp1);
+      dp1 = vpSIMD::fma(z4, elems[2], dp1);
+      Register dp2 = mul(x4, elems[3]);
+      dp2 = vpSIMD::fma(y4, elems[4], dp2);
+      dp2 = vpSIMD::fma(z4, elems[5], dp2);
+      Register dp3 = mul(x4, elems[6]);
+      dp3 = vpSIMD::fma(y4, elems[7], dp3);
+      dp3 = vpSIMD::fma(z4, elems[8], dp3);
 
-      __m256d dp1 = _mm256_add_pd(_mm256_add_pd(muls[0], muls[1]), muls[2]);
-      __m256d dp2 = _mm256_add_pd(_mm256_add_pd(muls[3], muls[4]), muls[5]);
-      __m256d dp3 = _mm256_add_pd(_mm256_add_pd(muls[6], muls[7]), muls[8]);
 
 
-#endif
-
-      _mm256_storeu_pd(outputX, dp1);
-      _mm256_storeu_pd(outputY, dp2);
-      _mm256_storeu_pd(outputZ, dp3);
+      storeu(outputX, dp1);
+      storeu(outputY, dp2);
+      storeu(outputZ, dp3);
 
 
-      inputX += 4; inputY += 4; inputZ += 4;
-      outputX += 4; outputY += 4; outputZ += 4;
+      inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes;
+      outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes;
 
     }
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
     double *r2 = rowPtrs[2];
 
-    for (unsigned int i = (input.getCols() / 4) * 4; i < input.getCols(); ++i) {
+    for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) {
       // std::cout << "i = " << i << std::endl;
       double X = *inputX, Y = *inputY, Z = *inputZ;
       *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z;
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 87fc23c564..81af464e3b 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -32,7 +32,6 @@
 #include <visp3/core/vpMeterPixelConversion.h>
 #include <visp3/core/vpSIMDUtils.h>
 #include <visp3/core/vpDisplay.h>
-#include <visp3/mbt/vpMbtTukeyEstimator.h>
 
 #ifdef VISP_HAVE_OPENMP
 #include <omp.h>
@@ -154,7 +153,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame
     }
 
     pointsPerThread[threadIdx] = std::move(localPoints);
-  }
+    }
   for (const std::vector<vpDepthPoint> &points: pointsPerThread) {
     m_depthPoints.insert(m_depthPoints.end(), std::make_move_iterator(points.begin()), std::make_move_iterator(points.end()));
   }
@@ -172,7 +171,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame
   else {
     m_numFeatures = 0;
   }
-}
+  }
 
 void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/)
 {
@@ -201,16 +200,10 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
   double t1 = vpTime::measureTimeMs();
   m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt);
   m_Lt.transpose(m_L);
-  std::cout << "Update took: " << (vpTime::measureTimeMs() - t1) << std::endl;
 
   t1 = vpTime::measureTimeMs();
-  // m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
-  // m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
-
-  vpMbtTukeyEstimator<double> robust;
-  // m_robust.setMinMedianAbsoluteDeviation();
-  robust.MEstimator(m_error, m_weights, (frame.renders.zFar - frame.renders.zNear) * 0.01);
-  std::cout << "MEstimator took: " << (vpTime::measureTimeMs() - t1) << std::endl;
+  m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
+  m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
 
 // #if defined(VISP_HAVE_OPENMP)
 // #pragma omp parallel for
@@ -231,7 +224,7 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
 void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end)
 {
 #if defined(VISP_HAVE_OPENMP)
-#pragma omp parallel for schedule(static, 2048)
+#pragma omp parallel for
 #endif
   for (unsigned int i = start; i < end; ++i) {
     const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i];
@@ -297,7 +290,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
   const vpRotationMatrix cRo = cMo.getRotationMatrix();
   cMo.project(m_oXt, m_cXt, false);
   cRo.rotateVectors(m_oNt, m_cNt, false);
-
   m_invalid.clear();
 
   const unsigned int numPoints = m_oXt.getCols();
@@ -323,7 +315,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
         double x, y, u, v;
         x = m_cXt[0][i] / m_cXt[2][i];
         y = m_cXt[1][i] / m_cXt[2][i];
-
         vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v);
         // Point is no longer in image: depth value cannot be sampled
         if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) {
@@ -336,7 +327,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
           localInvalid.push_back(i);
           continue;
         }
-
         m_observations[0][i] = x * Z;
         m_observations[1][i] = y * Z;
         m_observations[2][i] = Z;
@@ -368,7 +358,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
     Lt[4][i] = 0;
     Lt[5][i] = 0;
   }
-
 }
 
 void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage<unsigned char> &/*I*/,

From d847bfb6990d54a2f5771923265499730a1e6a4a Mon Sep 17 00:00:00 2001
From: Samuel Felton <samuel.felton@irisa.fr>
Date: Sat, 6 Dec 2025 21:29:32 +0100
Subject: [PATCH 12/24] Fix SSE3 flag check

---
 modules/core/src/math/transformation/vpHomogeneousMatrix.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 9501e533b2..0d218c5ea2 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -99,7 +99,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       outputData += 3;
     }
 
-#elif (VISP_HAVE_SSE2)
+#elif (VISP_HAVE_SSE3)
     __m128d rows[] = {
       _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
       _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),

From c4bc02aa143749b7ccc07570aeba2c51c7fa4b2e Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 10:09:55 +0100
Subject: [PATCH 13/24] Update copyright headers

---
 CMakeLists.txt                                | 2 +-
 modules/core/include/visp3/core/vpSIMDUtils.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 986d7d4792..edb0af45da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 #############################################################################
 #
 # ViSP, open source Visual Servoing Platform software.
-# Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+# Copyright (C) 2005 - 2025 by Inria. All rights reserved.
 #
 # This software is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h
index 2d08d4c787..b46c9c917e 100644
--- a/modules/core/include/visp3/core/vpSIMDUtils.h
+++ b/modules/core/include/visp3/core/vpSIMDUtils.h
@@ -1,6 +1,6 @@
 /*
  * ViSP, open source Visual Servoing Platform software.
- * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
  *
  * This software is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -28,7 +28,7 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Rotation matrix.
+ * SIMD utilities.
  */
 
 /*!

From 49e8c83bc6ef229294b3b15146afa296d9f3e227 Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 10:11:50 +0100
Subject: [PATCH 14/24] Fix warning unused variable
 modules/core/src/math/transformation/vpHomogeneousMatrix.cpp:60:13: warning:
 unused variable 'outputData' [-Wunused-variable]    60 |     double
 *outputData = output.data;       |             ^~~~~~~~~~

---
 .../core/src/math/transformation/vpHomogeneousMatrix.cpp   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 0d218c5ea2..6a8022732c 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -57,8 +57,8 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
     }
     double *inputData = input.data;
-    double *outputData = output.data;
 #if defined(VISP_HAVE_AVX)
+    double *outputData = output.data;
     __m256d rows[] = {
       _mm256_loadu_pd(rowPtrs[0]),
       _mm256_loadu_pd(rowPtrs[1]),
@@ -100,6 +100,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
     }
 
 #elif (VISP_HAVE_SSE3)
+    double *outputData = output.data;
     __m128d rows[] = {
       _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
       _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
@@ -123,7 +124,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       }
       inputData += 3;
       outputData += 3;
-    }
+  }
 #else
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
@@ -136,7 +137,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       inputData += 3;
     }
 #endif
-  }
+}
   else {
     if (input.getRows() != 3) {
       throw vpException(vpException::dimensionError, "Expected input to have 3 rows");

From ae5966b6dd25697766b13f50f6feced7ec7bfcfb Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 10:22:53 +0100
Subject: [PATCH 15/24] Remove useless empty lines

---
 modules/core/include/visp3/core/vpSIMDUtils.h |  6 ------
 modules/core/src/math/robust/vpRobust.cpp     |  3 +--
 .../transformation/vpHomogeneousMatrix.cpp    | 21 +++++++------------
 .../math/transformation/vpRotationMatrix.cpp  | 21 +------------------
 .../core/test/math/catchHomogeneousMatrix.cpp |  5 +----
 modules/core/test/math/catchRotation.cpp      |  6 ------
 .../src/features/vpRBDenseDepthTracker.cpp    |  8 ++-----
 7 files changed, 13 insertions(+), 57 deletions(-)

diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h
index b46c9c917e..a54f3c5550 100644
--- a/modules/core/include/visp3/core/vpSIMDUtils.h
+++ b/modules/core/include/visp3/core/vpSIMDUtils.h
@@ -78,7 +78,6 @@
 #define VISP_HAVE_NEON 0
 #endif
 
-
 #if VISP_HAVE_SSE2 && USE_SIMD_CODE
 #define USE_SSE 1
 #else
@@ -112,7 +111,6 @@ inline Register mul(const Register a, const Register b)
   return _mm512_mul_pd(a, b);
 }
 
-
 inline Register fma(const Register a, const Register b, const Register c)
 {
 #if defined(VISP_HAVE_FMA)
@@ -127,7 +125,6 @@ inline Register loadu(const double *const data)
   return _mm512_loadu_pd(data);
 }
 
-
 inline Register set1(double v)
 {
   return _mm512_set1_pd(v);
@@ -157,7 +154,6 @@ inline Register mul(const Register a, const Register b)
   return _mm256_mul_pd(a, b);
 }
 
-
 inline Register fma(const Register a, const Register b, const Register c)
 {
 #if defined(VISP_HAVE_FMA)
@@ -172,7 +168,6 @@ inline Register loadu(const double *const data)
   return _mm256_loadu_pd(data);
 }
 
-
 inline Register set1(double v)
 {
   return _mm256_set1_pd(v);
@@ -202,7 +197,6 @@ inline Register mul(const Register a, const Register b)
   return _mm_mul_pd(a, b);
 }
 
-
 inline Register fma(const Register a, const Register b, const Register c)
 {
 #if defined(VISP_HAVE_FMA)
diff --git a/modules/core/src/math/robust/vpRobust.cpp b/modules/core/src/math/robust/vpRobust.cpp
index 56be0a0c59..08b0c8da09 100644
--- a/modules/core/src/math/robust/vpRobust.cpp
+++ b/modules/core/src/math/robust/vpRobust.cpp
@@ -114,7 +114,6 @@ void vpRobust::resize(unsigned int n_data)
   }
 }
 
-
 // ===================================================================
 /*!
 
@@ -538,7 +537,7 @@ double vpRobust::constrainedChi(vpRobustEstimatorType method, double x)
   default: {
     throw(vpException(vpException::fatalError, "Unsupported robust estimator type in vpRobust::constrainedChi()"));
   }
-}
+  }
 
   return -1;
 }
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 6a8022732c..0154c26e35 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -67,7 +67,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
 
     };
 
-
     double result[4];
 
     for (unsigned int i = 0; i < input.getRows(); ++i) {
@@ -90,7 +89,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       __m128d sum1 = _mm_hadd_pd(sum2, sum2);     // [total, total]
       double total = _mm_cvtsd_f64(sum1);
 
-
       outputData[0] = result[0] + result[2];
       outputData[1] = result[1] + result[3];
       outputData[2] = total;
@@ -124,7 +122,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       }
       inputData += 3;
       outputData += 3;
-  }
+    }
 #else
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
@@ -137,7 +135,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       inputData += 3;
     }
 #endif
-}
+    }
   else {
     if (input.getRows() != 3) {
       throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
@@ -178,7 +176,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       dp3 = vpSIMD::fma(z4, elems[10], dp3);
       dp3 = add(elems[11], dp3);
 
-
       storeu(outputX, dp1);
       storeu(outputY, dp2);
       storeu(outputZ, dp3);
@@ -200,7 +197,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       ++inputX; ++inputY; ++inputZ;
       ++outputX; ++outputY; ++outputZ;
     }
-
 #else
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
@@ -215,15 +211,14 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       ++outputX; ++outputY; ++outputZ;
     }
 #endif
-
   }
-}
+  }
 
 
-/*!
-  Construct an homogeneous matrix from a translation vector and quaternion
-  rotation vector.
- */
+  /*!
+    Construct an homogeneous matrix from a translation vector and quaternion
+    rotation vector.
+   */
 vpHomogeneousMatrix::vpHomogeneousMatrix(const vpTranslationVector &t, const vpQuaternionVector &q)
   : vpArray2D<double>(constr_value_4, constr_value_4)
 {
@@ -1578,8 +1573,8 @@ void vpHomogeneousMatrix::parse_json(const nlohmann::json &j)
     const bool converted = convertFromTypeAndBuildFrom<vpHomogeneousMatrix, vpPoseVector>(j, *this);
     if (!converted) {
       from_json(j, *asArray);
-    }
   }
+}
   else { // Generic 2D array conversion
     from_json(j, *asArray);
   }
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index 8808039ed0..a2cbce8c26 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -1,6 +1,6 @@
 /*
  * ViSP, open source Visual Servoing Platform software.
- * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
  *
  * This software is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -53,7 +53,6 @@
 
 BEGIN_VISP_NAMESPACE
 
-
 void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const
 {
   output.resize(input.getRows(), input.getCols(), false, false);
@@ -92,7 +91,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
       __m128d sum1 = _mm_hadd_pd(sum2, sum2);     // [total, total]
       double total = _mm_cvtsd_f64(sum1);
 
-
       outputData[0] = result[0] + result[2];
       outputData[1] = result[1] + result[3];
       outputData[2] = total;
@@ -102,8 +100,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     }
 
 #elif VISP_HAVE_SSE2
-
-
     __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]);
     __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]);
     __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]);
@@ -113,7 +109,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     double c1 = rowPtrs[1][2];
     double c2 = rowPtrs[2][2];
 
-
     for (unsigned i = 0; i < input.getRows(); ++i) {
 
       const __m128d xy = _mm_loadu_pd(inputData);
@@ -138,7 +133,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
       inputData += 3;
       outputData += 3;
     }
-
 #else
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
@@ -153,9 +147,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     }
 #endif
   }
-
-
-
   else { // 3xN matrix
     using namespace vpSIMD;
     if (input.getRows() != 3) {
@@ -168,7 +159,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     double *outputY = output[1];
     double *outputZ = output[2];
 
-
 #if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2)
 
     Register elems[9];
@@ -190,16 +180,12 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
       dp3 = vpSIMD::fma(y4, elems[7], dp3);
       dp3 = vpSIMD::fma(z4, elems[8], dp3);
 
-
-
       storeu(outputX, dp1);
       storeu(outputY, dp2);
       storeu(outputZ, dp3);
 
-
       inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes;
       outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes;
-
     }
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
@@ -214,7 +200,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
       ++inputX; ++inputY; ++inputZ;
       ++outputX; ++outputY; ++outputZ;
     }
-
 #else
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
@@ -230,12 +215,8 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     }
 #endif
   }
-
-
-
 }
 
-
 const unsigned int vpRotationMatrix::constr_val_3 = 3;
 /*!
   Initialize the rotation matrix as identity.
diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index 47aef0bb9f..098d859845 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -1,6 +1,6 @@
 /*
  * ViSP, open source Visual Servoing Platform software.
- * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
  *
  * This software is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -247,7 +247,6 @@ TEST_CASE("Point projection", "project")
       vpMatrix outputT(N, 3);
       vpMatrix output(3, N);
 
-
       double t1 = vpTime::measureTimeMs();
       M.project(inputT, outputT, true);
       double t2 = vpTime::measureTimeMs();
@@ -277,7 +276,6 @@ TEST_CASE("Point projection", "project")
       double t2r = vpTime::measureTimeMs();
       timeNaive.push_back(t2r - t1r);
 
-
       vpMatrix input4(4, N);
       for (unsigned int i = 0; i < inputT.getRows(); ++i) {
         input4[0][i] = inputT[i][0];
@@ -317,7 +315,6 @@ TEST_CASE("Point projection", "project")
     std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl;
 
   }
-
 }
 
 int main(int argc, char *argv[])
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 00e396ee48..9deaec4dc9 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -559,8 +559,6 @@ TEST_CASE("Default constructor", "[rotation matrix]")
 #endif
 }
 
-
-
 TEST_CASE("Vector rotation", "project")
 {
   std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
@@ -627,16 +625,12 @@ TEST_CASE("Vector rotation", "project")
         std::cerr << "Optimized 3xN version failed" << std::endl;
         FAIL();
       }
-
-
-
     }
     std::cout << "Optimized (transposed) version took: " << vpMath::getMean(timeProjectTransposed) << " +- " << vpMath::getStdev(timeProjectTransposed) <<  "ms" << std::endl;
     std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +- " << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
     std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +- " << vpMath::getStdev(timeMult) << "ms" << std::endl;
     std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +- " << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
     std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl;
-
   }
 }
 
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 81af464e3b..ae9c0aa6e6 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -153,7 +153,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame
     }
 
     pointsPerThread[threadIdx] = std::move(localPoints);
-    }
+  }
   for (const std::vector<vpDepthPoint> &points: pointsPerThread) {
     m_depthPoints.insert(m_depthPoints.end(), std::make_move_iterator(points.begin()), std::make_move_iterator(points.end()));
   }
@@ -171,7 +171,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame
   else {
     m_numFeatures = 0;
   }
-  }
+}
 
 void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/)
 {
@@ -244,11 +244,9 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con
   }
 }
 
-
 #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
 void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, const  vpMatrix &obsT, vpColVector &e, vpMatrix &Lt)
 {
-
   using namespace vpSIMD;
 
 #if defined(VISP_HAVE_OPENMP)
@@ -281,8 +279,6 @@ void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, co
 }
 #endif
 
-
-
 void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
   const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo,
   const vpImage<float> &depth, vpColVector &e, vpMatrix &Lt)

From 3f3bac2c6478ec50e3e714ed5fa5c310db79705c Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 10:25:01 +0100
Subject: [PATCH 16/24] Fix warning variable set but not used
 modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp:200:10: warning:
 variable 't1' set but not used [-Wunused-but-set-variable]   200 |   double
 t1 = vpTime::measureTimeMs();       |          ^

---
 modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index ae9c0aa6e6..2772c9adc4 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -197,11 +197,9 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
     return;
   }
 
-  double t1 = vpTime::measureTimeMs();
   m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt);
   m_Lt.transpose(m_L);
 
-  t1 = vpTime::measureTimeMs();
   m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
   m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
 

From adc26ab8f97480192076f253ffbe29993a58e533 Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 10:39:44 +0100
Subject: [PATCH 17/24] Remove vpSIMD namespace from doxygen doc

---
 modules/core/include/visp3/core/vpSIMDUtils.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h
index a54f3c5550..7d81d01a77 100644
--- a/modules/core/include/visp3/core/vpSIMDUtils.h
+++ b/modules/core/include/visp3/core/vpSIMDUtils.h
@@ -40,6 +40,8 @@
 #define VP_SIMD_UTILS_H
 #include <visp3/core/vpConfig.h>
 
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
 #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
 #include <emmintrin.h>
 #include <immintrin.h>
@@ -225,4 +227,5 @@ inline void storeu(double *data, const Register a)
 
 }
 
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 #endif // VP_SIMD_UTILS_H

From c6ff19a448a741b03bef1f998fda4b86e252add0 Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 11:57:17 +0100
Subject: [PATCH 18/24] Fix bug when input vector is not transposed and AVX or
 SSE2 not available Document vpRotationMatrix::rotateVectors() method

---
 .../core/src/math/transformation/vpRotationMatrix.cpp    | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index a2cbce8c26..5f665d6ca5 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -53,6 +53,13 @@
 
 BEGIN_VISP_NAMESPACE
 
+/*!
+ * Rotate a set of vectors by this rotation matrix.
+ * \param input : Input matrix containing vectors to rotate. By default, each column is a 3D vector with 3 rows.
+ * When `isTransposed` is true, each row is a 3D vector with 3 columns.
+ * \param output : Output matrix containing rotated vectors. Has the same layout and size as the input matrix.
+ * \param isTransposed : If true, input matrix is transposed (each row is a 3D vector).
+**/
 void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const
 {
   output.resize(input.getRows(), input.getCols(), false, false);
@@ -205,7 +212,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
     double *r1 = rowPtrs[1];
     double *r2 = rowPtrs[2];
 
-    for (unsigned int i = 0; i < input.getRows(); ++i) {
+    for (unsigned int i = 0; i < input.getCols(); ++i) {
       double X = *inputX, Y = *inputY, Z = *inputZ;
       *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z;
       *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z;

From 51e89a34781652bdf7c45b95df3f4c5a0cd457eb Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 12:07:16 +0100
Subject: [PATCH 19/24] Fix bug when input vector is not transposed and AVX or
 SSE3 not available Document vpHomogeneousMatrix::project() method

---
 .../transformation/vpHomogeneousMatrix.cpp    | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 0154c26e35..9e295b25b1 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -48,6 +48,14 @@
 BEGIN_VISP_NAMESPACE
 const unsigned int vpHomogeneousMatrix::constr_value_4 = 4;
 
+/**
+ * Project a set of 3D points using the homogeneous matrix.
+ * The input points can be stored either by row or by column.
+ * \param input : Input matrix containing 3D points to project. By default, each column is a 3D point with 3 rows.
+ * When `transposed` is true, each row is a 3D point with 3 columns.
+ * \param output : Output matrix containing projected points. Has the same layout and size as the input matrix.
+ * \param transposed : If true, input matrix is transposed (each row is a 3D point).
+ */
 void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool transposed) const
 {
   output.resize(input.getRows(), input.getCols(), false, false);
@@ -135,7 +143,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
       inputData += 3;
     }
 #endif
-    }
+  }
   else {
     if (input.getRows() != 3) {
       throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
@@ -202,7 +210,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
     double *r1 = rowPtrs[1];
     double *r2 = rowPtrs[2];
 
-    for (unsigned int i = 0; i < input.getRows(); ++i) {
+    for (unsigned int i = 0; i < input.getCols(); ++i) {
       double X = *inputX, Y = *inputY, Z = *inputZ;
       *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3];
       *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3];
@@ -212,13 +220,13 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
     }
 #endif
   }
-  }
+}
 
 
-  /*!
-    Construct an homogeneous matrix from a translation vector and quaternion
-    rotation vector.
-   */
+/*!
+  Construct an homogeneous matrix from a translation vector and quaternion
+  rotation vector.
+ */
 vpHomogeneousMatrix::vpHomogeneousMatrix(const vpTranslationVector &t, const vpQuaternionVector &q)
   : vpArray2D<double>(constr_value_4, constr_value_4)
 {
@@ -1573,8 +1581,8 @@ void vpHomogeneousMatrix::parse_json(const nlohmann::json &j)
     const bool converted = convertFromTypeAndBuildFrom<vpHomogeneousMatrix, vpPoseVector>(j, *this);
     if (!converted) {
       from_json(j, *asArray);
+    }
   }
-}
   else { // Generic 2D array conversion
     from_json(j, *asArray);
   }

From 09fc8cd7fdafff70359b5e3b248d64602846e7cb Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 12:08:07 +0100
Subject: [PATCH 20/24] Cleanup tests to help debugging

---
 .../core/test/math/catchHomogeneousMatrix.cpp | 25 +++++---
 modules/core/test/math/catchRotation.cpp      | 58 ++++++++++++-------
 2 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index 098d859845..de2f1d8182 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -63,6 +63,7 @@ bool test_matrix_equal(const vpHomogeneousMatrix &M1, const vpHomogeneousMatrix
 
 TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneousMatrix]")
 {
+  std::cout << "\n== Test vpHomogeneousMatrix re-orthogonalize rotation matrix ==" << std::endl;
   CHECK_NOTHROW([]() {
     vpHomogeneousMatrix M { 0.9835,  -0.0581, 0.1716, 0.0072, -0.0489, -0.9972,
                           -0.0571, 0.0352,  0.1744, 0.0478, -0.9835, 0.9470 };
@@ -122,6 +123,7 @@ TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneou
 
 TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatrix]")
 {
+  std::cout << "\n== Test vpRotationMatrix re-orthogonalize rotation matrix ==" << std::endl;
   CHECK_NOTHROW(
       []() { vpRotationMatrix R { 0.9835, -0.0581, 0.1716, -0.0489, -0.9972, -0.0571, 0.1744, 0.0478, -0.9835 }; }());
 
@@ -165,6 +167,7 @@ TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatri
 
 TEST_CASE("ENU to NED conversion", "[enu2ned]")
 {
+  std::cout << "\n== Test ENU to NED conversion ==" << std::endl;
   vpHomogeneousMatrix enu_M_flu { 0, -1, 0, 0.2, 1, 0, 0, 1., 0, 0, 1, 0.3 };
   std::cout << "enu_M_flu:\n" << enu_M_flu << std::endl;
 
@@ -206,6 +209,7 @@ TEST_CASE("ENU to NED conversion", "[enu2ned]")
 
 TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]")
 {
+  std::cout << "\n== Test vpHomogenousMatrix * vpRotationMatrix ==" << std::endl;
   // Test rotation_matrix * homogeneous_matrix
   vpHomogeneousMatrix _1_M_2_ {
        0.9835, -0.0581,  0.1716, 0.0072,
@@ -228,7 +232,12 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]")
 
 TEST_CASE("Point projection", "project")
 {
+  std::cout << "\n== Test point projection ==" << std::endl;
+#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpHomogeneousMatrix::project()
   std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
+#else
+  std::vector<unsigned int> NS = { 1, 10 };
+#endif
   for (unsigned int N : NS) {
     std::cout << "Running for N = " << N << std::endl;
     std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
@@ -286,7 +295,7 @@ TEST_CASE("Point projection", "project")
       vpMatrix output4(4, N);
       double t14 = vpTime::measureTimeMs();
 
-      vpMatrix::mult2Matrices((vpMatrix)M, input4, output4);
+      vpMatrix::mult2Matrices(static_cast<vpMatrix>(M), input4, output4);
       double t24 = vpTime::measureTimeMs();
       timeMult.push_back(t24 - t14);
 
@@ -295,14 +304,12 @@ TEST_CASE("Point projection", "project")
 
       if (errorT > 1e-10 || error > 1e-10) {
         std::cout << "M = " << M << std::endl;
-        std::cerr << "Naive outputT = "  << std::endl << outputR << std::endl;
-        std::cerr << "Mult outputT = "  << std::endl << output4.t() << std::endl;
-        std::cerr << "Project output transposed version = "  << std::endl << outputT << std::endl;
-        std::cerr << "Project output = "  << std::endl << outputT<< std::endl;
-
-
-        std::cerr << "diff transposed = "  << std::endl << outputR - outputT;
-        std::cerr << "diff = "  << std::endl << outputR - output.t();
+        std::cerr << "Naive outputT = " << outputR << std::endl;
+        std::cerr << "Mult outputT = " << output4.t() << std::endl;
+        std::cerr << "Project output transposed version = " << outputT << std::endl;
+        std::cerr << "Project output = " << outputT << std::endl;
+        std::cerr << "Diff transposed = " << outputR - outputT << std::endl;
+        std::cerr << "Diff = " << outputR - output.t() << std::endl;
 
         FAIL();
       }
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 9deaec4dc9..874e78af57 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -134,6 +134,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
 {
   SECTION("Theta u initialization")
   {
+    std::cout << "\n== Test theta u initialization ==" << std::endl;
     vpThetaUVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -169,6 +170,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Rxyz initialization")
   {
+    std::cout << "\n== Test Rxyz initialization ==" << std::endl;
     vpRxyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -203,6 +205,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("rzyx initialization")
   {
+    std::cout << "\n== Test rzyx initialization ==" << std::endl;
     vpRzyxVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -237,6 +240,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("rzyz initialiation")
   {
+    std::cout << "\n== Test rzyz initialization ==" << std::endl;
     vpRzyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -271,6 +275,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Test quaternion initialization", "[quaternion]")
   {
+    std::cout << "\n== Test quaternion initialization ==" << std::endl;
     vpQuaternionVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(4, vpMath::rad(10));
     vpColVector bench3(4, vpMath::rad(10));
@@ -305,41 +310,41 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Conversions")
   {
+    std::cout << "\n== Test conversions ==" << std::endl;
     vpRotationMatrix R;
     for (int i = -10; i < 10; i++) {
       for (int j = -10; j < 10; j++) {
         vpThetaUVector tu(vpMath::rad(90 + i), vpMath::rad(170 + j), vpMath::rad(45));
         tu.buildFrom(vpRotationMatrix(tu)); // put some coherence into rotation convention
 
-        std::cout << "Initialization " << std::endl;
+        std::cout << "\n** Initialization with theta_u: " << tu.t()  << std::endl;
 
         double theta;
         vpColVector u;
         tu.extract(theta, u);
 
-        std::cout << "theta=" << vpMath::deg(theta) << std::endl;
-        std::cout << "u=" << u << std::endl;
+        std::cout << "Extracted theta: " << vpMath::deg(theta) << " deg" << std::endl;
+        std::cout << "Extracted u: " << u.t() << std::endl;
 
         std::cout << "From vpThetaUVector to vpRotationMatrix " << std::endl;
         R.buildFrom(tu);
 
-        std::cout << "Matrix R";
+        std::cout << "Matrix R: \n" << R << std::endl;
         CHECK(R.isARotationMatrix());
 
-        std::cout << R << std::endl;
-
         std::cout << "From vpRotationMatrix to vpQuaternionVector " << std::endl;
         vpQuaternionVector q(R);
+        std::cout << "Quaternion: " << q.t() << std::endl;
         CHECK(q.magnitude() == Catch::Approx(1.0).margin(1e-4));
-        std::cout << q << std::endl;
 
+        std::cout << "From vpQuaternionVector to vpRotationMatrix  " << std::endl;
         R.buildFrom(q);
+        std::cout << "Matrix R: \n" << R << std::endl;
         CHECK(R.isARotationMatrix());
-        std::cout << "From vpQuaternionVector to vpRotationMatrix  " << std::endl;
 
         std::cout << "From vpRotationMatrix to vpRxyzVector " << std::endl;
         vpRxyzVector RxyzbuildR(R);
-        std::cout << RxyzbuildR << std::endl;
+        std::cout << "Rxyz: " << RxyzbuildR.t() << std::endl;
 
         std::cout << "From vpRxyzVector to vpThetaUVector " << std::endl;
         std::cout << "  use From vpRxyzVector to vpRotationMatrix " << std::endl;
@@ -348,15 +353,14 @@ TEST_CASE("Common rotation operations", "[rotation]")
         vpThetaUVector tubuildEu;
         tubuildEu.buildFrom(R);
 
-        std::cout << std::endl;
-        std::cout << "result : should equivalent to the first one " << std::endl;
+        std::cout << "Result : should equivalent to the first one " << std::endl;
 
         double theta2;
         vpColVector u2;
 
         tubuildEu.extract(theta2, u2);
-        std::cout << "theta=" << vpMath::deg(theta2) << std::endl;
-        std::cout << "u=" << u2 << std::endl;
+        std::cout << "Extracted theta: " << vpMath::deg(theta2) << " deg" << std::endl;
+        std::cout << "Extracted u: " << u2.t() << std::endl;
 
         CHECK(vpMath::abs(theta2 - theta) < std::numeric_limits<double>::epsilon() * 1e10);
         CHECK(vpMath::abs(u[0] - u2[0]) < std::numeric_limits<double>::epsilon() * 1e10);
@@ -366,35 +370,39 @@ TEST_CASE("Common rotation operations", "[rotation]")
     }
     SECTION("Conversion from and to rzyz vector")
     {
+      std::cout << "\n== Test conversion from and to rzyz vector ==" << std::endl;
       vpRzyzVector rzyz(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45));
-      std::cout << "Initialization vpRzyzVector " << std::endl;
-      std::cout << rzyz << std::endl;
+      std::cout << "\n** Initialization with rzyz: " << rzyz.t() << std::endl;
       std::cout << "From vpRzyzVector to vpRotationMatrix  " << std::endl;
+
       R.buildFrom(rzyz);
+      std::cout << "Matrix R: \n" << R << std::endl;
       CHECK(R.isARotationMatrix());
       std::cout << "From vpRotationMatrix to vpRzyzVector " << std::endl;
       vpRzyzVector rzyz_final;
       rzyz_final.buildFrom(R);
+      std::cout << "rzyz: " << rzyz_final.t() << std::endl;
       CHECK(test("rzyz", rzyz_final, vpColVector(rzyz)));
-      std::cout << rzyz_final << std::endl;
     }
     SECTION("Conversion from and to rzyx vector")
     {
+      std::cout << "\n== Test conversion from and to rzyx vector ==" << std::endl;
       vpRzyxVector rzyx(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45));
-      std::cout << "Initialization vpRzyxVector " << std::endl;
-      std::cout << rzyx << std::endl;
+      std::cout << "\n** Initialization with rzyx: " << rzyx.t() << std::endl;
       std::cout << "From vpRzyxVector to vpRotationMatrix  " << std::endl;
       R.buildFrom(rzyx);
+      std::cout << "Matrix R: \n" << R << std::endl;
       CHECK(R.isARotationMatrix());
-      std::cout << R << std::endl;
       std::cout << "From vpRotationMatrix to vpRzyxVector " << std::endl;
       vpRzyxVector rzyx_final;
       rzyx_final.buildFrom(R);
+      std::cout << "rzyx: " << rzyx_final.t() << std::endl;
       bool ret = test("rzyx", rzyx_final, vpColVector(rzyx));
       if (ret == false) {
         // Euler angle representation is not unique
         std::cout << "Rzyx vector differ. Test rotation matrix..." << std::endl;
         vpRotationMatrix RR(rzyx_final);
+        std::cout << "Matrix R: \n" << RR << std::endl;
         if (R == RR) {
           std::cout << "Rzyx vector differ but rotation matrix is valid" << std::endl;
           ret = true;
@@ -406,6 +414,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Rotation matrix extraction from homogeneous matrix and multiplication")
   {
+    std::cout << "\n== Test rotation matrix extraction from homogeneous matrix and multiplication ==" << std::endl;
     // Test rotation_matrix * homogeneous_matrix
     vpHomogeneousMatrix  _1_M_2_truth;
     _1_M_2_truth[0][0] = 0.9835;
@@ -443,6 +452,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
 
 TEST_CASE("Theta u multiplication", "[theta.u]")
 {
+  std::cout << "\n== Test theta u multiplication ==" << std::endl;
   const int nTrials = 100;
   const uint64_t seed = 0x123456789;
   vpUniRand rng(seed);
@@ -469,6 +479,7 @@ TEST_CASE("Theta u multiplication", "[theta.u]")
 
 TEST_CASE("Quaternion multiplication", "[quaternion]")
 {
+  std::cout << "\n== Test quaternion multiplication ==" << std::endl;
   const int nTrials = 100;
   const uint64_t seed = 0x123456789;
   vpUniRand rng(seed);
@@ -496,6 +507,7 @@ TEST_CASE("Quaternion multiplication", "[quaternion]")
 
 TEST_CASE("Default constructor", "[rotation matrix]")
 {
+  std::cout << "\n== Test default constructor ==" << std::endl;
   vpRotationMatrix I;
   SECTION("rotation matrix")
   {
@@ -561,9 +573,14 @@ TEST_CASE("Default constructor", "[rotation matrix]")
 
 TEST_CASE("Vector rotation", "project")
 {
+  std::cout << "\n== Test rotation of vectors ==" << std::endl;
+#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors()
   std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
+#else
+  std::vector<unsigned int> NS = { 1, 10 };
+#endif
   for (unsigned int N : NS) {
-    std::cout << "Running for N = " << N << std::endl;
+    std::cout << "** Running for N = " << N << std::endl;
     std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
     vpUniRand r(42);
     for (unsigned int trial = 0; trial < 100000; ++trial) {
@@ -587,7 +604,6 @@ TEST_CASE("Vector rotation", "project")
       t2 = vpTime::measureTimeMs();
       timeProject.push_back(t2 - t1);
 
-
       vpColVector x(3, 1);
       vpColVector res(3);
       vpMatrix outputR(N, 3);

From f9f59e2ff48c2de2c58e05fce0b663337be24320 Mon Sep 17 00:00:00 2001
From: Fabien Spindler <Fabien.Spindler@inria.fr>
Date: Sun, 7 Dec 2025 15:33:38 +0100
Subject: [PATCH 21/24] Fix _mm_hadd_pd() usage that requires SSE3 on pixi
 windows CI modules\core\src\math\transformation\vpRotationMatrix.cpp(128,21):
 error: always_inline function '_mm_hadd_pd' requires target feature 'sse3',
 but would be inlined into function 'rotateVectors' that is compiled without
 support for 'sse3'   128 |       __m128d r01 = _mm_hadd_pd(mul0, mul1);      
 |                     ^

---
 modules/core/src/math/transformation/vpRotationMatrix.cpp | 2 +-
 modules/core/test/math/catchRotation.cpp                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index 5f665d6ca5..a497f3c5c1 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -106,7 +106,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
       outputData += 3;
     }
 
-#elif VISP_HAVE_SSE2
+#elif VISP_HAVE_SSE3
     __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]);
     __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]);
     __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]);
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 874e78af57..820bca14fb 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -574,7 +574,7 @@ TEST_CASE("Default constructor", "[rotation matrix]")
 TEST_CASE("Vector rotation", "project")
 {
   std::cout << "\n== Test rotation of vectors ==" << std::endl;
-#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors()
+#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors()
   std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
 #else
   std::vector<unsigned int> NS = { 1, 10 };

From 7502eace1c1f3afd869ef610677f88173486b32e Mon Sep 17 00:00:00 2001
From: Fabien Spindler <fabien.spindler@inria.fr>
Date: Fri, 12 Dec 2025 09:23:41 +0100
Subject: [PATCH 22/24] Make vpSIMDUtils.h private to not expose SIMD code to
 the user For the moment this file is duplicated in core and rbt modules. We
 should find a solution to avoid code duplication.

---
 .../math/matrix/private}/vpSIMDUtils.h        |   0
 .../transformation/vpHomogeneousMatrix.cpp    |   3 +-
 .../math/transformation/vpRotationMatrix.cpp  |   2 +-
 .../rbt/src/core/private/vpSIMDUtils.h        | 231 ++++++++++++++++++
 .../src/features/vpRBDenseDepthTracker.cpp    |   3 +-
 5 files changed, 236 insertions(+), 3 deletions(-)
 rename modules/core/{include/visp3/core => src/math/matrix/private}/vpSIMDUtils.h (100%)
 create mode 100644 modules/tracker/rbt/src/core/private/vpSIMDUtils.h

diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/src/math/matrix/private/vpSIMDUtils.h
similarity index 100%
rename from modules/core/include/visp3/core/vpSIMDUtils.h
rename to modules/core/src/math/matrix/private/vpSIMDUtils.h
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 9e295b25b1..9b864926af 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -42,7 +42,8 @@
 #include <visp3/core/vpMatrix.h>
 #include <visp3/core/vpPoint.h>
 #include <visp3/core/vpQuaternionVector.h>
-#include <visp3/core/vpSIMDUtils.h>
+
+#include "../matrix/private/vpSIMDUtils.h"
 
 
 BEGIN_VISP_NAMESPACE
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index a497f3c5c1..d16dd86205 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -40,7 +40,7 @@
 #include <visp3/core/vpMath.h>
 #include <visp3/core/vpMatrix.h>
 
-#include <visp3/core/vpSIMDUtils.h>
+#include "../matrix/private/vpSIMDUtils.h"
 
 // Rotation classes
 #include <visp3/core/vpRotationMatrix.h>
diff --git a/modules/tracker/rbt/src/core/private/vpSIMDUtils.h b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h
new file mode 100644
index 0000000000..7d81d01a77
--- /dev/null
+++ b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h
@@ -0,0 +1,231 @@
+/*
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See https://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * SIMD utilities.
+ */
+
+/*!
+  \file vpSIMDUtils.h
+  \brief Header that defines and includes useful SIMD routines and macros
+*/
+
+#ifndef VP_SIMD_UTILS_H
+#define VP_SIMD_UTILS_H
+#include <visp3/core/vpConfig.h>
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#define VISP_HAVE_SSE2 1
+#endif
+
+#if defined __AVX2__
+#define VISP_HAVE_AVX2 1
+#endif
+
+#if defined __AVX__
+#define VISP_HAVE_AVX 1
+#endif
+
+// https://stackoverflow.com/a/40765925
+#if !defined(__FMA__) && defined(__AVX2__)
+#define __FMA__ 1
+#endif
+
+
+#if defined(__FMA__)
+#define VISP_HAVE_FMA
+#endif
+
+#if defined _WIN32 && defined(_M_ARM64)
+#define _ARM64_DISTINCT_NEON_TYPES
+#include <Intrin.h>
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#else
+#define VISP_HAVE_NEON 0
+#endif
+
+#if VISP_HAVE_SSE2 && USE_SIMD_CODE
+#define USE_SSE 1
+#else
+#define USE_SSE 0
+#endif
+
+#if VISP_HAVE_NEON && USE_SIMD_CODE
+#define USE_NEON 1
+#else
+#define USE_NEON 0
+#endif
+
+namespace vpSIMD
+{
+#if defined(VISP_HAVE_AVX2)
+using Register = __m512d;
+
+inline constexpr int numLanes = 8;
+inline const Register add(const Register a, const Register b)
+{
+  return _mm512_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm512_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm512_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm512_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm512_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm512_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm512_storeu_pd(data, a);
+}
+
+#elif defined(VISP_HAVE_AVX)
+using Register = __m256d;
+inline const int numLanes = 4;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm256_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm256_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm256_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm256_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm256_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm256_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm256_storeu_pd(data, a);
+}
+
+#elif VISP_HAVE_SSE2
+using Register = __m128d;
+inline const int numLanes = 2;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm_storeu_pd(data, a);
+}
+
+#endif
+
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // VP_SIMD_UTILS_H
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 2772c9adc4..31abf019dd 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -30,9 +30,10 @@
 
 #include <visp3/rbt/vpRBDenseDepthTracker.h>
 #include <visp3/core/vpMeterPixelConversion.h>
-#include <visp3/core/vpSIMDUtils.h>
 #include <visp3/core/vpDisplay.h>
 
+#include "../core/private/vpSIMDUtils.h"
+
 #ifdef VISP_HAVE_OPENMP
 #include <omp.h>
 #endif

From 1f6079784835618ab05ac279b2a40778af3c3db5 Mon Sep 17 00:00:00 2001
From: Fabien Spindler <fabien.spindler@inria.fr>
Date: Fri, 12 Dec 2025 10:48:04 +0100
Subject: [PATCH 23/24] Remove  to make code more explicit

---
 .../transformation/vpHomogeneousMatrix.cpp    | 39 +++++++++----------
 .../math/transformation/vpRotationMatrix.cpp  | 23 ++++++-----
 .../src/features/vpRBDenseDepthTracker.cpp    | 30 +++++++-------
 3 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 9b864926af..175bf5ceaa 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -157,47 +157,44 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool
     double *outputZ = output[2];
 
 #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
-
-    using namespace vpSIMD;
-
-    Register elems[12];
+    vpSIMD::Register elems[12];
     for (unsigned int i = 0; i < 12; ++i) {
-      elems[i] = set1(data[i]);
+      elems[i] = vpSIMD::set1(data[i]);
     }
 
-    for (int i = 0; i <= static_cast<int>(input.getCols()) - numLanes; i += numLanes) {
-      const Register x4 = loadu(inputX);
-      const Register y4 = loadu(inputY);
-      const Register z4 = loadu(inputZ);
+    for (int i = 0; i <= static_cast<int>(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
+      const vpSIMD::Register x4 = vpSIMD::loadu(inputX);
+      const vpSIMD::Register y4 = vpSIMD::loadu(inputY);
+      const vpSIMD::Register z4 = vpSIMD::loadu(inputZ);
 
-      Register dp1 = mul(x4, elems[0]);
+      vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]);
       dp1 = vpSIMD::fma(y4, elems[1], dp1);
       dp1 = vpSIMD::fma(z4, elems[2], dp1);
-      dp1 = add(elems[3], dp1);
+      dp1 = vpSIMD::add(elems[3], dp1);
 
-      Register dp2 = mul(x4, elems[4]);
+      vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[4]);
       dp2 = vpSIMD::fma(y4, elems[5], dp2);
       dp2 = vpSIMD::fma(z4, elems[6], dp2);
-      dp2 = add(elems[7], dp2);
+      dp2 = vpSIMD::add(elems[7], dp2);
 
-      Register dp3 = mul(x4, elems[8]);
+      vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[8]);
       dp3 = vpSIMD::fma(y4, elems[9], dp3);
       dp3 = vpSIMD::fma(z4, elems[10], dp3);
-      dp3 = add(elems[11], dp3);
+      dp3 = vpSIMD::add(elems[11], dp3);
 
-      storeu(outputX, dp1);
-      storeu(outputY, dp2);
-      storeu(outputZ, dp3);
+      vpSIMD::storeu(outputX, dp1);
+      vpSIMD::storeu(outputY, dp2);
+      vpSIMD::storeu(outputZ, dp3);
 
-      inputX += numLanes; inputY += numLanes; inputZ += numLanes;
-      outputX += numLanes; outputY += numLanes; outputZ += numLanes;
+      inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes;
+      outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes;
 
     }
     double *r0 = rowPtrs[0];
     double *r1 = rowPtrs[1];
     double *r2 = rowPtrs[2];
 
-    for (unsigned int i = (input.getCols() / numLanes) * numLanes; i < input.getCols(); ++i) {
+    for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) {
       // std::cout << "i = " << i << std::endl;
       double X = *inputX, Y = *inputY, Z = *inputZ;
       *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3];
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index d16dd86205..bd6337a70f 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -155,7 +155,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
 #endif
   }
   else { // 3xN matrix
-    using namespace vpSIMD;
     if (input.getRows() != 3) {
       throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
     }
@@ -168,28 +167,28 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo
 
 #if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2)
 
-    Register elems[9];
+    vpSIMD::Register elems[9];
     for (unsigned int i = 0; i < 9; ++i) {
-      elems[i] = set1(data[i]);
+      elems[i] = vpSIMD::set1(data[i]);
     }
     for (int i = 0; i <= static_cast<int>(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
-      const Register x4 = loadu(inputX);
-      const Register y4 = loadu(inputY);
-      const Register z4 = loadu(inputZ);
+      const vpSIMD::Register x4 = vpSIMD::loadu(inputX);
+      const vpSIMD::Register y4 = vpSIMD::loadu(inputY);
+      const vpSIMD::Register z4 = vpSIMD::loadu(inputZ);
 
-      Register dp1 = mul(x4, elems[0]);
+      vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]);
       dp1 = vpSIMD::fma(y4, elems[1], dp1);
       dp1 = vpSIMD::fma(z4, elems[2], dp1);
-      Register dp2 = mul(x4, elems[3]);
+      vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[3]);
       dp2 = vpSIMD::fma(y4, elems[4], dp2);
       dp2 = vpSIMD::fma(z4, elems[5], dp2);
-      Register dp3 = mul(x4, elems[6]);
+      vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[6]);
       dp3 = vpSIMD::fma(y4, elems[7], dp3);
       dp3 = vpSIMD::fma(z4, elems[8], dp3);
 
-      storeu(outputX, dp1);
-      storeu(outputY, dp2);
-      storeu(outputZ, dp3);
+      vpSIMD::storeu(outputX, dp1);
+      vpSIMD::storeu(outputY, dp2);
+      vpSIMD::storeu(outputZ, dp3);
 
       inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes;
       outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes;
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 31abf019dd..384a3d6119 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -246,35 +246,33 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con
 #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
 void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, const  vpMatrix &obsT, vpColVector &e, vpMatrix &Lt)
 {
-  using namespace vpSIMD;
-
 #if defined(VISP_HAVE_OPENMP)
 #pragma omp parallel for schedule(static, 1024)
 #endif
-  for (int i = 0; i <= static_cast<int>(cXt.getCols()) - numLanes; i += numLanes) {
-    const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i);
-    const Register nX = loadu(cNt[0] + i), nY = loadu(cNt[1] + i), nZ = loadu(cNt[2] + i);
-    const Register obsX = loadu(obsT[0] + i), obsY = loadu(obsT[1] + i), obsZ = loadu(obsT[2] + i);
+  for (int i = 0; i <= static_cast<int>(cXt.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
+    const vpSIMD::Register X = vpSIMD::loadu(cXt[0] + i), Y = vpSIMD::loadu(cXt[1] + i), Z = vpSIMD::loadu(cXt[2] + i);
+    const vpSIMD::Register nX = vpSIMD::loadu(cNt[0] + i), nY = vpSIMD::loadu(cNt[1] + i), nZ = vpSIMD::loadu(cNt[2] + i);
+    const vpSIMD::Register obsX = vpSIMD::loadu(obsT[0] + i), obsY = vpSIMD::loadu(obsT[1] + i), obsZ = vpSIMD::loadu(obsT[2] + i);
 
-    Register D = mul(nX, X);
-    Register projNormal = mul(obsX, X);
+    vpSIMD::Register D = vpSIMD::mul(nX, X);
+    vpSIMD::Register projNormal = vpSIMD::mul(obsX, X);
     D = vpSIMD::fma(nY, Y, D);
     D = vpSIMD::fma(nZ, Z, D);
 
     projNormal = vpSIMD::fma(nY, obsY, projNormal);
     projNormal = vpSIMD::fma(nZ, obsZ, projNormal);
 
-    storeu(e.data + i, sub(projNormal, D));
+    vpSIMD::storeu(e.data + i, vpSIMD::sub(projNormal, D));
 
-    storeu(Lt[0] + i, nX);
-    storeu(Lt[1] + i, nY);
-    storeu(Lt[2] + i, nZ);
+    vpSIMD::storeu(Lt[0] + i, nX);
+    vpSIMD::storeu(Lt[1] + i, nY);
+    vpSIMD::storeu(Lt[2] + i, nZ);
 
-    storeu(Lt[3] + i, sub(mul(nZ, Y), mul(nY, Z)));
-    storeu(Lt[4] + i, sub(mul(nX, Z), mul(nZ, X)));
-    storeu(Lt[5] + i, sub(mul(nY, X), mul(nX, Y)));
+    vpSIMD::storeu(Lt[3] + i, vpSIMD::sub(vpSIMD::mul(nZ, Y), vpSIMD::mul(nY, Z)));
+    vpSIMD::storeu(Lt[4] + i, vpSIMD::sub(vpSIMD::mul(nX, Z), vpSIMD::mul(nZ, X)));
+    vpSIMD::storeu(Lt[5] + i, vpSIMD::sub(vpSIMD::mul(nY, X), vpSIMD::mul(nX, Y)));
   }
-  errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / numLanes) * numLanes, cXt.getCols());
+  errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes, cXt.getCols());
 }
 #endif
 

From 671b8839afec140d25077d2a0aaccddb8e70c69a Mon Sep 17 00:00:00 2001
From: Fabien Spindler <fabien.spindler@inria.fr>
Date: Fri, 12 Dec 2025 11:29:03 +0100
Subject: [PATCH 24/24] Make test independent from SIMD instruction set

---
 .../core/test/math/catchHomogeneousMatrix.cpp | 31 +++++++++----------
 modules/core/test/math/catchRotation.cpp      | 29 ++++++++---------
 2 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index de2f1d8182..a6e11fcb0c 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -233,28 +233,25 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]")
 TEST_CASE("Point projection", "project")
 {
   std::cout << "\n== Test point projection ==" << std::endl;
-#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpHomogeneousMatrix::project()
-  std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
-#else
-  std::vector<unsigned int> NS = { 1, 10 };
-#endif
-  for (unsigned int N : NS) {
-    std::cout << "Running for N = " << N << std::endl;
+  std::map<unsigned int, unsigned int> map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} };
+
+  for (const auto &[vecsize, trials] : map_vecsize_trials) {
+    std::cout << "** Running for vector size = " << vecsize << std::endl;
     std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
     vpUniRand r(42);
-    for (unsigned int trial = 0; trial < 100000; ++trial) {
+    for (unsigned int trial = 0; trial < trials; ++trial) {
       vpHomogeneousMatrix M(
         r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, 1.0),
         r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI));
-      vpMatrix inputT(N, 3);
-      for (unsigned int i = 0; i< N; ++i) {
+      vpMatrix inputT(vecsize, 3);
+      for (unsigned int i = 0; i< vecsize; ++i) {
         inputT[i][0] = r.uniform(0.0, 1.0);
         inputT[i][1] = r.uniform(0.0, 1.0);
         inputT[i][2] = r.uniform(0.0, 1.0);
       }
       vpMatrix input = inputT.t();
-      vpMatrix outputT(N, 3);
-      vpMatrix output(3, N);
+      vpMatrix outputT(vecsize, 3);
+      vpMatrix output(3, vecsize);
 
       double t1 = vpTime::measureTimeMs();
       M.project(inputT, outputT, true);
@@ -269,7 +266,7 @@ TEST_CASE("Point projection", "project")
       vpColVector x(4, 1);
       vpColVector res(4);
 
-      vpMatrix outputR(N, 3);
+      vpMatrix outputR(vecsize, 3);
       double t1r = vpTime::measureTimeMs();
 
       for (unsigned int i = 0; i < inputT.getRows(); ++i) {
@@ -285,22 +282,22 @@ TEST_CASE("Point projection", "project")
       double t2r = vpTime::measureTimeMs();
       timeNaive.push_back(t2r - t1r);
 
-      vpMatrix input4(4, N);
+      vpMatrix input4(4, vecsize);
       for (unsigned int i = 0; i < inputT.getRows(); ++i) {
         input4[0][i] = inputT[i][0];
         input4[1][i] = inputT[i][1];
         input4[2][i] = inputT[i][2];
         input4[3][i] = 1;
       }
-      vpMatrix output4(4, N);
+      vpMatrix output4(4, vecsize);
       double t14 = vpTime::measureTimeMs();
 
       vpMatrix::mult2Matrices(static_cast<vpMatrix>(M), input4, output4);
       double t24 = vpTime::measureTimeMs();
       timeMult.push_back(t24 - t14);
 
-      double errorT = (outputR - outputT).frobeniusNorm() / N;
-      double error = (outputR - output.t()).frobeniusNorm() / N;
+      double errorT = (outputR - outputT).frobeniusNorm() / vecsize;
+      double error = (outputR - output.t()).frobeniusNorm() / vecsize;
 
       if (errorT > 1e-10 || error > 1e-10) {
         std::cout << "M = " << M << std::endl;
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 820bca14fb..ba84966fa7 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -574,27 +574,24 @@ TEST_CASE("Default constructor", "[rotation matrix]")
 TEST_CASE("Vector rotation", "project")
 {
   std::cout << "\n== Test rotation of vectors ==" << std::endl;
-#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors()
-  std::vector<unsigned int> NS = { 1, 10, 100, 1000, 10000 };
-#else
-  std::vector<unsigned int> NS = { 1, 10 };
-#endif
-  for (unsigned int N : NS) {
-    std::cout << "** Running for N = " << N << std::endl;
+  std::map<unsigned int, unsigned int> map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} };
+
+  for (const auto &[vecsize, trials] : map_vecsize_trials) {
+    std::cout << "** Running for vector size = " << vecsize << std::endl;
     std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
     vpUniRand r(42);
-    for (unsigned int trial = 0; trial < 100000; ++trial) {
+    for (unsigned int trial = 0; trial < trials; ++trial) {
       vpRotationMatrix M(r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI));
-      vpMatrix inputT(N, 3);
+      vpMatrix inputT(vecsize, 3);
 
-      for (unsigned int i = 0; i< N; ++i) {
+      for (unsigned int i = 0; i< vecsize; ++i) {
         inputT[i][0] = r.uniform(0.0, 1.0);
         inputT[i][1] = r.uniform(0.0, 1.0);
         inputT[i][2] = r.uniform(0.0, 1.0);
       }
       vpMatrix input = inputT.t();
-      vpMatrix outputT(N, 3);
-      vpMatrix output(3, N);
+      vpMatrix outputT(vecsize, 3);
+      vpMatrix output(3, vecsize);
       double t1 = vpTime::measureTimeMs();
       M.rotateVectors(inputT, outputT, true);
       double t2 = vpTime::measureTimeMs();
@@ -606,7 +603,7 @@ TEST_CASE("Vector rotation", "project")
 
       vpColVector x(3, 1);
       vpColVector res(3);
-      vpMatrix outputR(N, 3);
+      vpMatrix outputR(vecsize, 3);
       double t1r = vpTime::measureTimeMs();
 
       for (unsigned int i = 0; i < inputT.getRows(); ++i) {
@@ -622,15 +619,15 @@ TEST_CASE("Vector rotation", "project")
       double t2r = vpTime::measureTimeMs();
       timeNaive.push_back(t2r - t1r);
 
-      vpMatrix outputMM(3, N);
+      vpMatrix outputMM(3, vecsize);
       double t14 = vpTime::measureTimeMs();
 
       vpMatrix::mult2Matrices((vpMatrix)M, input, outputMM);
       double t24 = vpTime::measureTimeMs();
       timeMult.push_back(t24 - t14);
 
-      double errorT = (outputR - outputT).frobeniusNorm() / N;
-      double error = (outputR - output.t()).frobeniusNorm() / N;
+      double errorT = (outputR - outputT).frobeniusNorm() / vecsize;
+      double error = (outputR - output.t()).frobeniusNorm() / vecsize;
 
       if (errorT > 1e-10) {
         std::cerr << "Transposed version failed" << std::endl;