diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f3ad49afe..edb0af45da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 #############################################################################
 #
 # ViSP, open source Visual Servoing Platform software.
-# Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+# Copyright (C) 2005 - 2025 by Inria. All rights reserved.
 #
 # This software is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -381,6 +381,9 @@ VP_OPTION(ENABLE_SSSE3 "" "" "Enable SSSE3 instructions" "" ON IF ((MSVC OR CMAK
 if(X86_64)
   VP_OPTION(ENABLE_AVX   "" "" "Enable AVX instructions"   "" OFF) # should be explicitly enabled, used in matrix transpose code
 endif()
+if(CMAKE_COMPILER_IS_GNUCXX)
+  VP_OPTION(ENABLE_NATIVE_ARCH  "" "" "Enable all available CPU instruction sets"   "" OFF)
+endif()
 
 #----------------------------------------------------------------------
 # BLAS / LAPACK
diff --git a/cmake/AddExtraCompilationFlags.cmake b/cmake/AddExtraCompilationFlags.cmake
index 35f0a269dc..4eba86f73d 100644
--- a/cmake/AddExtraCompilationFlags.cmake
+++ b/cmake/AddExtraCompilationFlags.cmake
@@ -171,25 +171,30 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     add_extra_compiler_options("-fvisibility-inlines-hidden")
   endif()
 
-  if(ENABLE_AVX AND X86_64)
-    add_extra_compiler_options("-mavx")
+  if(ENABLE_NATIVE_ARCH)
+    add_extra_compiler_options("-march=native")
   else()
-    if(ENABLE_SSE2)
-      add_extra_compiler_options("-msse2")
-    elseif(X86 OR X86_64)
-      add_extra_compiler_options("-mno-sse2")
-    endif()
 
-    if(ENABLE_SSE3)
-      add_extra_compiler_options("-msse3")
-    elseif(X86 OR X86_64)
-      #add_extra_compiler_options("-mno-sse3")
-    endif()
+    if(ENABLE_AVX AND X86_64)
+      add_extra_compiler_options("-mavx")
+    else()
+      if(ENABLE_SSE2)
+        add_extra_compiler_options("-msse2")
+      elseif(X86 OR X86_64)
+        add_extra_compiler_options("-mno-sse2")
+      endif()
+
+      if(ENABLE_SSE3)
+        add_extra_compiler_options("-msse3")
+      elseif(X86 OR X86_64)
+        #add_extra_compiler_options("-mno-sse3")
+      endif()
 
-    if(ENABLE_SSSE3)
-      add_extra_compiler_options("-mssse3")
-    elseif(X86 OR X86_64)
-      add_extra_compiler_options("-mno-ssse3")
+      if(ENABLE_SSSE3)
+        add_extra_compiler_options("-mssse3")
+      elseif(X86 OR X86_64)
+        add_extra_compiler_options("-mno-ssse3")
+      endif()
     endif()
   endif()
 
diff --git a/modules/core/include/visp3/core/vpHomogeneousMatrix.h b/modules/core/include/visp3/core/vpHomogeneousMatrix.h
index 76950d9a82..68f33ed3b4 100644
--- a/modules/core/include/visp3/core/vpHomogeneousMatrix.h
+++ b/modules/core/include/visp3/core/vpHomogeneousMatrix.h
@@ -381,6 +381,8 @@ class VISP_EXPORT vpHomogeneousMatrix : public vpArray2D<double>
   vpHomogeneousMatrix &operator<<(double val);
   vpHomogeneousMatrix &operator,(double val);
 
+  void project(const vpMatrix &inputPoints, vpMatrix &outputPoints, bool isTransposed) const;
+
   void orthogonalizeRotation();
 
   void print() const;
diff --git a/modules/core/include/visp3/core/vpRotationMatrix.h b/modules/core/include/visp3/core/vpRotationMatrix.h
index fbbf0b6f0e..38ec6f2a05 100644
--- a/modules/core/include/visp3/core/vpRotationMatrix.h
+++ b/modules/core/include/visp3/core/vpRotationMatrix.h
@@ -190,6 +190,8 @@ class VISP_EXPORT vpRotationMatrix : public vpArray2D<double>
 
   void printVector();
 
+  void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs, bool isTransposed) const;
+
   /*!
     This function is not applicable to a rotation matrix that is always a
     3-by-3 matrix.
diff --git a/modules/core/src/math/matrix/private/vpSIMDUtils.h b/modules/core/src/math/matrix/private/vpSIMDUtils.h
new file mode 100644
index 0000000000..7d81d01a77
--- /dev/null
+++ b/modules/core/src/math/matrix/private/vpSIMDUtils.h
@@ -0,0 +1,231 @@
+/*
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See https://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * SIMD utilities.
+ */
+
+/*!
+  \file vpSIMDUtils.h
+  \brief Header that defines and includes useful SIMD routines and macros
+*/
+
+#ifndef VP_SIMD_UTILS_H
+#define VP_SIMD_UTILS_H
+#include <visp3/core/vpConfig.h>
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#define VISP_HAVE_SSE2 1
+#endif
+
+#if defined __AVX2__
+#define VISP_HAVE_AVX2 1
+#endif
+
+#if defined __AVX__
+#define VISP_HAVE_AVX 1
+#endif
+
+// https://stackoverflow.com/a/40765925
+#if !defined(__FMA__) && defined(__AVX2__)
+#define __FMA__ 1
+#endif
+
+
+#if defined(__FMA__)
+#define VISP_HAVE_FMA
+#endif
+
+#if defined _WIN32 && defined(_M_ARM64)
+#define _ARM64_DISTINCT_NEON_TYPES
+#include <Intrin.h>
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#else
+#define VISP_HAVE_NEON 0
+#endif
+
+#if VISP_HAVE_SSE2 && USE_SIMD_CODE
+#define USE_SSE 1
+#else
+#define USE_SSE 0
+#endif
+
+#if VISP_HAVE_NEON && USE_SIMD_CODE
+#define USE_NEON 1
+#else
+#define USE_NEON 0
+#endif
+
+namespace vpSIMD
+{
+#if defined(VISP_HAVE_AVX2)
+using Register = __m512d;
+
+inline constexpr int numLanes = 8;
+inline const Register add(const Register a, const Register b)
+{
+  return _mm512_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm512_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm512_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm512_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm512_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm512_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm512_storeu_pd(data, a);
+}
+
+#elif defined(VISP_HAVE_AVX)
+using Register = __m256d;
+inline const int numLanes = 4;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm256_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm256_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm256_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm256_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm256_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm256_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm256_storeu_pd(data, a);
+}
+
+#elif VISP_HAVE_SSE2
+using Register = __m128d;
+inline const int numLanes = 2;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm_storeu_pd(data, a);
+}
+
+#endif
+
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // VP_SIMD_UTILS_H
diff --git a/modules/core/src/math/robust/vpRobust.cpp b/modules/core/src/math/robust/vpRobust.cpp
index 27fe9159c6..08b0c8da09 100644
--- a/modules/core/src/math/robust/vpRobust.cpp
+++ b/modules/core/src/math/robust/vpRobust.cpp
@@ -107,9 +107,9 @@ vpRobust &vpRobust::operator=(const vpRobust &&other)
 void vpRobust::resize(unsigned int n_data)
 {
   if (n_data != m_size) {
-    m_normres.resize(n_data);
-    m_sorted_normres.resize(n_data);
-    m_sorted_residues.resize(n_data);
+    m_normres.resize(n_data, false);
+    m_sorted_normres.resize(n_data, false);
+    m_sorted_residues.resize(n_data, false);
     m_size = n_data;
   }
 }
@@ -143,6 +143,8 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector
 
   // Calculate median
   med = select(m_sorted_residues, 0, n_data - 1, ind_med);
+
+  // med = select(m_sorted_residues, 0, n_data - 1, ind_med);
   // --comment: residualMedian = med
 
   // Normalize residues
@@ -153,6 +155,7 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector
 
   // Calculate MAD
   normmedian = select(m_sorted_normres, 0, n_data - 1, ind_med);
+
   // normalizedResidualMedian = normmedian ;
   // 1.48 keeps scale estimate consistent for a normal probability dist.
   m_mad = 1.4826 * normmedian; // median Absolute Deviation
diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
index 8e17bea6a5..175bf5ceaa 100644
--- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
+++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp
@@ -43,8 +43,184 @@
 #include <visp3/core/vpPoint.h>
 #include <visp3/core/vpQuaternionVector.h>
 
+#include "../matrix/private/vpSIMDUtils.h"
+
+
 BEGIN_VISP_NAMESPACE
 const unsigned int vpHomogeneousMatrix::constr_value_4 = 4;
+
+/**
+ * Project a set of 3D points using the homogeneous matrix.
+ * The input points can be stored either by row or by column.
+ * \param input : Input matrix containing 3D points to project. By default, each column is a 3D point with 3 rows.
+ * When `transposed` is true, each row is a 3D point with 3 columns.
+ * \param output : Output matrix containing projected points. Has the same layout and size as the input matrix.
+ * \param transposed : If true, input matrix is transposed (each row is a 3D point).
+ */
+void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool transposed) const
+{
+  output.resize(input.getRows(), input.getCols(), false, false);
+  if (transposed) {
+
+    if (input.getCols() != 3) {
+      throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
+    }
+    double *inputData = input.data;
+#if defined(VISP_HAVE_AVX)
+    double *outputData = output.data;
+    __m256d rows[] = {
+      _mm256_loadu_pd(rowPtrs[0]),
+      _mm256_loadu_pd(rowPtrs[1]),
+      _mm256_loadu_pd(rowPtrs[2]),
+      _mm256_loadu_pd(rowPtrs[3]),
+
+    };
+
+    double result[4];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 1.0);
+
+      const __m256d rs[] = {
+        _mm256_mul_pd(rows[0], xyzw),
+        _mm256_mul_pd(rows[1], xyzw),
+        _mm256_mul_pd(rows[2], xyzw),
+      };
+
+      __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]);
+
+
+      _mm256_storeu_pd(result, rs_half_sum1);
+      __m128d hi = _mm256_extractf128_pd(rs[2], 1);   // [A3, A2]
+      __m128d lo = _mm256_castpd256_pd128(rs[2]);     // [A1, A0]
+
+      __m128d sum2 = _mm_add_pd(lo, hi);          // [A3+A1, A2+A0]
+      __m128d sum1 = _mm_hadd_pd(sum2, sum2);     // [total, total]
+      double total = _mm_cvtsd_f64(sum1);
+
+      outputData[0] = result[0] + result[2];
+      outputData[1] = result[1] + result[3];
+      outputData[2] = total;
+
+      inputData += 3;
+      outputData += 3;
+    }
+
+#elif (VISP_HAVE_SSE3)
+    double *outputData = output.data;
+    __m128d rows[] = {
+      _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
+      _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
+      _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2)
+    };
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+
+      __m128d r1 = _mm_set_pd(inputData[1], inputData[0]);
+      __m128d r2 = _mm_set_pd(1.0, inputData[2]);
+
+      for (unsigned int j = 0; j < 3; ++j) {
+        __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
+#if !defined(VISP_HAVE_FMA)
+        __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
+        __m128d add = _mm_add_pd(m1, m2);
+#else
+        __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
+#endif
+        __m128d sum = _mm_hadd_pd(add, add);
+        outputData[j] = _mm_cvtsd_f64(sum);
+      }
+      inputData += 3;
+      outputData += 3;
+    }
+#else
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3];
+      output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3];
+      output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3];
+      inputData += 3;
+    }
+#endif
+  }
+  else {
+    if (input.getRows() != 3) {
+      throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
+    }
+    double *inputX = input[0];
+    double *inputY = input[1];
+    double *inputZ = input[2];
+    double *outputX = output[0];
+    double *outputY = output[1];
+    double *outputZ = output[2];
+
+#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
+    vpSIMD::Register elems[12];
+    for (unsigned int i = 0; i < 12; ++i) {
+      elems[i] = vpSIMD::set1(data[i]);
+    }
+
+    for (int i = 0; i <= static_cast<int>(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
+      const vpSIMD::Register x4 = vpSIMD::loadu(inputX);
+      const vpSIMD::Register y4 = vpSIMD::loadu(inputY);
+      const vpSIMD::Register z4 = vpSIMD::loadu(inputZ);
+
+      vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]);
+      dp1 = vpSIMD::fma(y4, elems[1], dp1);
+      dp1 = vpSIMD::fma(z4, elems[2], dp1);
+      dp1 = vpSIMD::add(elems[3], dp1);
+
+      vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[4]);
+      dp2 = vpSIMD::fma(y4, elems[5], dp2);
+      dp2 = vpSIMD::fma(z4, elems[6], dp2);
+      dp2 = vpSIMD::add(elems[7], dp2);
+
+      vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[8]);
+      dp3 = vpSIMD::fma(y4, elems[9], dp3);
+      dp3 = vpSIMD::fma(z4, elems[10], dp3);
+      dp3 = vpSIMD::add(elems[11], dp3);
+
+      vpSIMD::storeu(outputX, dp1);
+      vpSIMD::storeu(outputY, dp2);
+      vpSIMD::storeu(outputZ, dp3);
+
+      inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes;
+      outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes;
+
+    }
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) {
+      // std::cout << "i = " << i << std::endl;
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3];
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3];
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3];
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
+#else
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getCols(); ++i) {
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3];
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3];
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3];
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
+#endif
+  }
+}
+
+
 /*!
   Construct an homogeneous matrix from a translation vector and quaternion
   rotation vector.
@@ -651,16 +827,41 @@ vpColVector vpHomogeneousMatrix::operator*(const vpColVector &v) const
                       "(%dx1) column vector",
                       v.getRows()));
   }
-  vpColVector p(rowNum);
+  vpColVector p(rowNum, 0.0);
 
-  p = 0.0;
+// #if USE_SSE
+//   __m128d rows[] = {
+//     _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2),
+//     _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2),
+//     _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2),
+//     _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2)
+
+//   };
+
+//   __m128d r1 = _mm_loadu_pd(v.data);
+//   __m128d r2 = _mm_loadu_pd(v.data + 2);
+
+//   for (unsigned int j = 0; j < 4; ++j) {
+//     __m128d m1 = _mm_mul_pd(r1, rows[j * 2]);
+// #if !defined(__FMA__)
+//     __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]);
+//     __m128d add = _mm_add_pd(m1, m2);
+// #else
+//     __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1);
+// #endif
+//     __m128d sum = _mm_hadd_pd(add, add);
+//     p[j] = _mm_cvtsd_f64(sum);
+//   }
+
+
+// #else
 
   for (unsigned int j = 0; j < val_4; ++j) {
     for (unsigned int i = 0; i < val_4; ++i) {
       p[i] += rowPtrs[i][j] * v[j];
     }
   }
-
+// #endif
   return p;
 }
 
diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp
index 067db852cf..bd6337a70f 100644
--- a/modules/core/src/math/transformation/vpRotationMatrix.cpp
+++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp
@@ -1,6 +1,6 @@
 /*
  * ViSP, open source Visual Servoing Platform software.
- * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
  *
  * This software is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,6 +40,8 @@
 #include <visp3/core/vpMath.h>
 #include <visp3/core/vpMatrix.h>
 
+#include "../matrix/private/vpSIMDUtils.h"
+
 // Rotation classes
 #include <visp3/core/vpRotationMatrix.h>
 
@@ -50,6 +52,177 @@
 #include <math.h>
 
 BEGIN_VISP_NAMESPACE
+
+/*!
+ * Rotate a set of vectors by this rotation matrix.
+ * \param input : Input matrix containing vectors to rotate. By default, each column is a 3D vector with 3 rows.
+ * When `isTransposed` is true, each row is a 3D vector with 3 columns.
+ * \param output : Output matrix containing rotated vectors. Has the same layout and size as the input matrix.
+ * \param isTransposed : If true, input matrix is transposed (each row is a 3D vector).
+**/
+void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const
+{
+  output.resize(input.getRows(), input.getCols(), false, false);
+  if (isTransposed) {
+    if (input.getCols() != 3) {
+      throw vpException(vpException::dimensionError, "Input matrix should have 3 columns");
+    }
+    double *inputData = input.data;
+    double *outputData = output.data;
+#if defined(VISP_HAVE_AVX)
+    __m256d rows[] = {
+      _mm256_setr_pd(rowPtrs[0][0], rowPtrs[0][1], rowPtrs[0][2], 0.0),
+      _mm256_setr_pd(rowPtrs[1][0], rowPtrs[1][1], rowPtrs[1][2], 0.0),
+      _mm256_setr_pd(rowPtrs[2][0], rowPtrs[2][1], rowPtrs[2][2], 0.0),
+    };
+
+    double result[4];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 0.0);
+
+      const __m256d rs[] = {
+        _mm256_mul_pd(rows[0], xyzw),
+        _mm256_mul_pd(rows[1], xyzw),
+        _mm256_mul_pd(rows[2], xyzw),
+      };
+
+      __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]);
+
+
+      _mm256_storeu_pd(result, rs_half_sum1);
+      __m128d hi = _mm256_extractf128_pd(rs[2], 1);   // [A3, A2]
+      __m128d lo = _mm256_castpd256_pd128(rs[2]);     // [A1, A0]
+
+      __m128d sum2 = _mm_add_pd(lo, hi);          // [A3+A1, A2+A0]
+      __m128d sum1 = _mm_hadd_pd(sum2, sum2);     // [total, total]
+      double total = _mm_cvtsd_f64(sum1);
+
+      outputData[0] = result[0] + result[2];
+      outputData[1] = result[1] + result[3];
+      outputData[2] = total;
+
+      inputData += 3;
+      outputData += 3;
+    }
+
+#elif VISP_HAVE_SSE3
+    __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]);
+    __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]);
+    __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]);
+
+    // Preload third column as scalars
+    double c0 = rowPtrs[0][2];
+    double c1 = rowPtrs[1][2];
+    double c2 = rowPtrs[2][2];
+
+    for (unsigned i = 0; i < input.getRows(); ++i) {
+
+      const __m128d xy = _mm_loadu_pd(inputData);
+
+      const double z = inputData[2];
+      // const __m128d zz = _mm_set_pd(z, z);
+
+      __m128d mul0 = _mm_mul_pd(row01_lo, xy);
+      __m128d mul1 = _mm_mul_pd(row01_hi, xy);
+      __m128d r01 = _mm_hadd_pd(mul0, mul1);
+      __m128d mul2 = _mm_mul_pd(row2_xy, xy);
+
+      // Adding scalars at the end without using simd is faster
+      // __m128d mulzz = _mm_mul_pd(row01_z, zz);
+      // r01 = _mm_add_pd(r01, mulzz);
+
+      // store result
+      outputData[0] = _mm_cvtsd_f64(r01) + c0 * z;
+      outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z;
+      outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z;
+
+      inputData += 3;
+      outputData += 3;
+    }
+#else
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getRows(); ++i) {
+      outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2];
+      outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2];
+      outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2];
+      inputData += 3;
+      outputData += 3;
+    }
+#endif
+  }
+  else { // 3xN matrix
+    if (input.getRows() != 3) {
+      throw vpException(vpException::dimensionError, "Expected input to have 3 rows");
+    }
+    double *inputX = input[0];
+    double *inputY = input[1];
+    double *inputZ = input[2];
+    double *outputX = output[0];
+    double *outputY = output[1];
+    double *outputZ = output[2];
+
+#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2)
+
+    vpSIMD::Register elems[9];
+    for (unsigned int i = 0; i < 9; ++i) {
+      elems[i] = vpSIMD::set1(data[i]);
+    }
+    for (int i = 0; i <= static_cast<int>(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
+      const vpSIMD::Register x4 = vpSIMD::loadu(inputX);
+      const vpSIMD::Register y4 = vpSIMD::loadu(inputY);
+      const vpSIMD::Register z4 = vpSIMD::loadu(inputZ);
+
+      vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]);
+      dp1 = vpSIMD::fma(y4, elems[1], dp1);
+      dp1 = vpSIMD::fma(z4, elems[2], dp1);
+      vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[3]);
+      dp2 = vpSIMD::fma(y4, elems[4], dp2);
+      dp2 = vpSIMD::fma(z4, elems[5], dp2);
+      vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[6]);
+      dp3 = vpSIMD::fma(y4, elems[7], dp3);
+      dp3 = vpSIMD::fma(z4, elems[8], dp3);
+
+      vpSIMD::storeu(outputX, dp1);
+      vpSIMD::storeu(outputY, dp2);
+      vpSIMD::storeu(outputZ, dp3);
+
+      inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes;
+      outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes;
+    }
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) {
+      // std::cout << "i = " << i << std::endl;
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z;
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z;
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z;
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
+#else
+    double *r0 = rowPtrs[0];
+    double *r1 = rowPtrs[1];
+    double *r2 = rowPtrs[2];
+
+    for (unsigned int i = 0; i < input.getCols(); ++i) {
+      double X = *inputX, Y = *inputY, Z = *inputZ;
+      *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z;
+      *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z;
+      *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z;
+      ++inputX; ++inputY; ++inputZ;
+      ++outputX; ++outputY; ++outputZ;
+    }
+#endif
+  }
+}
+
 const unsigned int vpRotationMatrix::constr_val_3 = 3;
 /*!
   Initialize the rotation matrix as identity.
diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp
index af819ab093..a6e11fcb0c 100644
--- a/modules/core/test/math/catchHomogeneousMatrix.cpp
+++ b/modules/core/test/math/catchHomogeneousMatrix.cpp
@@ -1,6 +1,6 @@
 /*
  * ViSP, open source Visual Servoing Platform software.
- * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
  *
  * This software is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -38,6 +38,8 @@
  */
 #include <visp3/core/vpConfig.h>
 
+#include <visp3/core/vpUniRand.h>
+
 #if defined(VISP_HAVE_CATCH2)
 #include <visp3/core/vpHomogeneousMatrix.h>
 
@@ -61,6 +63,7 @@ bool test_matrix_equal(const vpHomogeneousMatrix &M1, const vpHomogeneousMatrix
 
 TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneousMatrix]")
 {
+  std::cout << "\n== Test vpHomogeneousMatrix re-orthogonalize rotation matrix ==" << std::endl;
   CHECK_NOTHROW([]() {
     vpHomogeneousMatrix M { 0.9835,  -0.0581, 0.1716, 0.0072, -0.0489, -0.9972,
                           -0.0571, 0.0352,  0.1744, 0.0478, -0.9835, 0.9470 };
@@ -120,6 +123,7 @@ TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneou
 
 TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatrix]")
 {
+  std::cout << "\n== Test vpRotationMatrix re-orthogonalize rotation matrix ==" << std::endl;
   CHECK_NOTHROW(
       []() { vpRotationMatrix R { 0.9835, -0.0581, 0.1716, -0.0489, -0.9972, -0.0571, 0.1744, 0.0478, -0.9835 }; }());
 
@@ -163,6 +167,7 @@ TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatri
 
 TEST_CASE("ENU to NED conversion", "[enu2ned]")
 {
+  std::cout << "\n== Test ENU to NED conversion ==" << std::endl;
   vpHomogeneousMatrix enu_M_flu { 0, -1, 0, 0.2, 1, 0, 0, 1., 0, 0, 1, 0.3 };
   std::cout << "enu_M_flu:\n" << enu_M_flu << std::endl;
 
@@ -204,6 +209,7 @@ TEST_CASE("ENU to NED conversion", "[enu2ned]")
 
 TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]")
 {
+  std::cout << "\n== Test vpHomogenousMatrix * vpRotationMatrix ==" << std::endl;
   // Test rotation_matrix * homogeneous_matrix
   vpHomogeneousMatrix _1_M_2_ {
        0.9835, -0.0581,  0.1716, 0.0072,
@@ -224,6 +230,97 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]")
   CHECK(success);
 }
 
+TEST_CASE("Point projection", "project")
+{
+  std::cout << "\n== Test point projection ==" << std::endl;
+  std::map<unsigned int, unsigned int> map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} };
+
+  for (const auto &[vecsize, trials] : map_vecsize_trials) {
+    std::cout << "** Running for vector size = " << vecsize << std::endl;
+    std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
+    vpUniRand r(42);
+    for (unsigned int trial = 0; trial < trials; ++trial) {
+      vpHomogeneousMatrix M(
+        r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, 1.0),
+        r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI));
+      vpMatrix inputT(vecsize, 3);
+      for (unsigned int i = 0; i< vecsize; ++i) {
+        inputT[i][0] = r.uniform(0.0, 1.0);
+        inputT[i][1] = r.uniform(0.0, 1.0);
+        inputT[i][2] = r.uniform(0.0, 1.0);
+      }
+      vpMatrix input = inputT.t();
+      vpMatrix outputT(vecsize, 3);
+      vpMatrix output(3, vecsize);
+
+      double t1 = vpTime::measureTimeMs();
+      M.project(inputT, outputT, true);
+      double t2 = vpTime::measureTimeMs();
+      timeProjectTransposed.push_back(t2 - t1);
+
+      t1 = vpTime::measureTimeMs();
+      M.project(input, output, false);
+      t2 = vpTime::measureTimeMs();
+      timeProject.push_back(t2 - t1);
+
+      vpColVector x(4, 1);
+      vpColVector res(4);
+
+      vpMatrix outputR(vecsize, 3);
+      double t1r = vpTime::measureTimeMs();
+
+      for (unsigned int i = 0; i < inputT.getRows(); ++i) {
+        x[0] = inputT[i][0];
+        x[1] = inputT[i][1];
+        x[2] = inputT[i][2];
+
+        vpColVector res = M * x;
+        outputR[i][0] = res[0];
+        outputR[i][1] = res[1];
+        outputR[i][2] = res[2];
+      }
+      double t2r = vpTime::measureTimeMs();
+      timeNaive.push_back(t2r - t1r);
+
+      vpMatrix input4(4, vecsize);
+      for (unsigned int i = 0; i < inputT.getRows(); ++i) {
+        input4[0][i] = inputT[i][0];
+        input4[1][i] = inputT[i][1];
+        input4[2][i] = inputT[i][2];
+        input4[3][i] = 1;
+      }
+      vpMatrix output4(4, vecsize);
+      double t14 = vpTime::measureTimeMs();
+
+      vpMatrix::mult2Matrices(static_cast<vpMatrix>(M), input4, output4);
+      double t24 = vpTime::measureTimeMs();
+      timeMult.push_back(t24 - t14);
+
+      double errorT = (outputR - outputT).frobeniusNorm() / vecsize;
+      double error = (outputR - output.t()).frobeniusNorm() / vecsize;
+
+      if (errorT > 1e-10 || error > 1e-10) {
+        std::cout << "M = " << M << std::endl;
+        std::cerr << "Naive outputT = " << outputR << std::endl;
+        std::cerr << "Mult outputT = " << output4.t() << std::endl;
+        std::cerr << "Project output transposed version = " << outputT << std::endl;
+        std::cerr << "Project output = " << outputT << std::endl;
+        std::cerr << "Diff transposed = " << outputR - outputT << std::endl;
+        std::cerr << "Diff = " << outputR - output.t() << std::endl;
+
+        FAIL();
+      }
+    }
+    std::cout << "Optimized version (transposed) took: " << vpMath::getMean(timeProjectTransposed) << " +-" << vpMath::getStdev(timeProjectTransposed) <<  "ms" << std::endl;
+    std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
+
+    std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl;
+    std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
+    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl;
+
+  }
+}
+
 int main(int argc, char *argv[])
 {
   Catch::Session session;
diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp
index 5031a2efcc..ba84966fa7 100644
--- a/modules/core/test/math/catchRotation.cpp
+++ b/modules/core/test/math/catchRotation.cpp
@@ -134,6 +134,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
 {
   SECTION("Theta u initialization")
   {
+    std::cout << "\n== Test theta u initialization ==" << std::endl;
     vpThetaUVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -169,6 +170,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Rxyz initialization")
   {
+    std::cout << "\n== Test Rxyz initialization ==" << std::endl;
     vpRxyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -203,6 +205,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("rzyx initialization")
   {
+    std::cout << "\n== Test rzyx initialization ==" << std::endl;
     vpRzyxVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -237,6 +240,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("rzyz initialiation")
   {
+    std::cout << "\n== Test rzyz initialization ==" << std::endl;
     vpRzyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(3, vpMath::rad(10));
     vpColVector bench3(3, vpMath::rad(10));
@@ -271,6 +275,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Test quaternion initialization", "[quaternion]")
   {
+    std::cout << "\n== Test quaternion initialization ==" << std::endl;
     vpQuaternionVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10), vpMath::rad(10));
     std::vector<double> bench1(4, vpMath::rad(10));
     vpColVector bench3(4, vpMath::rad(10));
@@ -305,41 +310,41 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Conversions")
   {
+    std::cout << "\n== Test conversions ==" << std::endl;
     vpRotationMatrix R;
     for (int i = -10; i < 10; i++) {
       for (int j = -10; j < 10; j++) {
         vpThetaUVector tu(vpMath::rad(90 + i), vpMath::rad(170 + j), vpMath::rad(45));
         tu.buildFrom(vpRotationMatrix(tu)); // put some coherence into rotation convention
 
-        std::cout << "Initialization " << std::endl;
+        std::cout << "\n** Initialization with theta_u: " << tu.t()  << std::endl;
 
         double theta;
         vpColVector u;
         tu.extract(theta, u);
 
-        std::cout << "theta=" << vpMath::deg(theta) << std::endl;
-        std::cout << "u=" << u << std::endl;
+        std::cout << "Extracted theta: " << vpMath::deg(theta) << " deg" << std::endl;
+        std::cout << "Extracted u: " << u.t() << std::endl;
 
         std::cout << "From vpThetaUVector to vpRotationMatrix " << std::endl;
         R.buildFrom(tu);
 
-        std::cout << "Matrix R";
+        std::cout << "Matrix R: \n" << R << std::endl;
         CHECK(R.isARotationMatrix());
 
-        std::cout << R << std::endl;
-
         std::cout << "From vpRotationMatrix to vpQuaternionVector " << std::endl;
         vpQuaternionVector q(R);
+        std::cout << "Quaternion: " << q.t() << std::endl;
         CHECK(q.magnitude() == Catch::Approx(1.0).margin(1e-4));
-        std::cout << q << std::endl;
 
+        std::cout << "From vpQuaternionVector to vpRotationMatrix  " << std::endl;
         R.buildFrom(q);
+        std::cout << "Matrix R: \n" << R << std::endl;
         CHECK(R.isARotationMatrix());
-        std::cout << "From vpQuaternionVector to vpRotationMatrix  " << std::endl;
 
         std::cout << "From vpRotationMatrix to vpRxyzVector " << std::endl;
         vpRxyzVector RxyzbuildR(R);
-        std::cout << RxyzbuildR << std::endl;
+        std::cout << "Rxyz: " << RxyzbuildR.t() << std::endl;
 
         std::cout << "From vpRxyzVector to vpThetaUVector " << std::endl;
         std::cout << "  use From vpRxyzVector to vpRotationMatrix " << std::endl;
@@ -348,15 +353,14 @@ TEST_CASE("Common rotation operations", "[rotation]")
         vpThetaUVector tubuildEu;
         tubuildEu.buildFrom(R);
 
-        std::cout << std::endl;
-        std::cout << "result : should equivalent to the first one " << std::endl;
+        std::cout << "Result : should equivalent to the first one " << std::endl;
 
         double theta2;
         vpColVector u2;
 
         tubuildEu.extract(theta2, u2);
-        std::cout << "theta=" << vpMath::deg(theta2) << std::endl;
-        std::cout << "u=" << u2 << std::endl;
+        std::cout << "Extracted theta: " << vpMath::deg(theta2) << " deg" << std::endl;
+        std::cout << "Extracted u: " << u2.t() << std::endl;
 
         CHECK(vpMath::abs(theta2 - theta) < std::numeric_limits<double>::epsilon() * 1e10);
         CHECK(vpMath::abs(u[0] - u2[0]) < std::numeric_limits<double>::epsilon() * 1e10);
@@ -366,35 +370,39 @@ TEST_CASE("Common rotation operations", "[rotation]")
     }
     SECTION("Conversion from and to rzyz vector")
     {
+      std::cout << "\n== Test conversion from and to rzyz vector ==" << std::endl;
       vpRzyzVector rzyz(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45));
-      std::cout << "Initialization vpRzyzVector " << std::endl;
-      std::cout << rzyz << std::endl;
+      std::cout << "\n** Initialization with rzyz: " << rzyz.t() << std::endl;
       std::cout << "From vpRzyzVector to vpRotationMatrix  " << std::endl;
+
       R.buildFrom(rzyz);
+      std::cout << "Matrix R: \n" << R << std::endl;
       CHECK(R.isARotationMatrix());
       std::cout << "From vpRotationMatrix to vpRzyzVector " << std::endl;
       vpRzyzVector rzyz_final;
       rzyz_final.buildFrom(R);
+      std::cout << "rzyz: " << rzyz_final.t() << std::endl;
       CHECK(test("rzyz", rzyz_final, vpColVector(rzyz)));
-      std::cout << rzyz_final << std::endl;
     }
     SECTION("Conversion from and to rzyx vector")
     {
+      std::cout << "\n== Test conversion from and to rzyx vector ==" << std::endl;
       vpRzyxVector rzyx(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45));
-      std::cout << "Initialization vpRzyxVector " << std::endl;
-      std::cout << rzyx << std::endl;
+      std::cout << "\n** Initialization with rzyx: " << rzyx.t() << std::endl;
       std::cout << "From vpRzyxVector to vpRotationMatrix  " << std::endl;
       R.buildFrom(rzyx);
+      std::cout << "Matrix R: \n" << R << std::endl;
       CHECK(R.isARotationMatrix());
-      std::cout << R << std::endl;
       std::cout << "From vpRotationMatrix to vpRzyxVector " << std::endl;
       vpRzyxVector rzyx_final;
       rzyx_final.buildFrom(R);
+      std::cout << "rzyx: " << rzyx_final.t() << std::endl;
       bool ret = test("rzyx", rzyx_final, vpColVector(rzyx));
       if (ret == false) {
         // Euler angle representation is not unique
         std::cout << "Rzyx vector differ. Test rotation matrix..." << std::endl;
         vpRotationMatrix RR(rzyx_final);
+        std::cout << "Matrix R: \n" << RR << std::endl;
         if (R == RR) {
           std::cout << "Rzyx vector differ but rotation matrix is valid" << std::endl;
           ret = true;
@@ -406,6 +414,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
   }
   SECTION("Rotation matrix extraction from homogeneous matrix and multiplication")
   {
+    std::cout << "\n== Test rotation matrix extraction from homogeneous matrix and multiplication ==" << std::endl;
     // Test rotation_matrix * homogeneous_matrix
     vpHomogeneousMatrix  _1_M_2_truth;
     _1_M_2_truth[0][0] = 0.9835;
@@ -443,6 +452,7 @@ TEST_CASE("Common rotation operations", "[rotation]")
 
 TEST_CASE("Theta u multiplication", "[theta.u]")
 {
+  std::cout << "\n== Test theta u multiplication ==" << std::endl;
   const int nTrials = 100;
   const uint64_t seed = 0x123456789;
   vpUniRand rng(seed);
@@ -469,6 +479,7 @@ TEST_CASE("Theta u multiplication", "[theta.u]")
 
 TEST_CASE("Quaternion multiplication", "[quaternion]")
 {
+  std::cout << "\n== Test quaternion multiplication ==" << std::endl;
   const int nTrials = 100;
   const uint64_t seed = 0x123456789;
   vpUniRand rng(seed);
@@ -496,6 +507,7 @@ TEST_CASE("Quaternion multiplication", "[quaternion]")
 
 TEST_CASE("Default constructor", "[rotation matrix]")
 {
+  std::cout << "\n== Test default constructor ==" << std::endl;
   vpRotationMatrix I;
   SECTION("rotation matrix")
   {
@@ -559,6 +571,82 @@ TEST_CASE("Default constructor", "[rotation matrix]")
 #endif
 }
 
+TEST_CASE("Vector rotation", "project")
+{
+  std::cout << "\n== Test rotation of vectors ==" << std::endl;
+  std::map<unsigned int, unsigned int> map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} };
+
+  for (const auto &[vecsize, trials] : map_vecsize_trials) {
+    std::cout << "** Running for vector size = " << vecsize << std::endl;
+    std::vector<double> timeProjectTransposed, timeProject, timeMult, timeNaive;
+    vpUniRand r(42);
+    for (unsigned int trial = 0; trial < trials; ++trial) {
+      vpRotationMatrix M(r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI));
+      vpMatrix inputT(vecsize, 3);
+
+      for (unsigned int i = 0; i< vecsize; ++i) {
+        inputT[i][0] = r.uniform(0.0, 1.0);
+        inputT[i][1] = r.uniform(0.0, 1.0);
+        inputT[i][2] = r.uniform(0.0, 1.0);
+      }
+      vpMatrix input = inputT.t();
+      vpMatrix outputT(vecsize, 3);
+      vpMatrix output(3, vecsize);
+      double t1 = vpTime::measureTimeMs();
+      M.rotateVectors(inputT, outputT, true);
+      double t2 = vpTime::measureTimeMs();
+      timeProjectTransposed.push_back(t2 - t1);
+      t1 = vpTime::measureTimeMs();
+      M.rotateVectors(input, output, false);
+      t2 = vpTime::measureTimeMs();
+      timeProject.push_back(t2 - t1);
+
+      vpColVector x(3, 1);
+      vpColVector res(3);
+      vpMatrix outputR(vecsize, 3);
+      double t1r = vpTime::measureTimeMs();
+
+      for (unsigned int i = 0; i < inputT.getRows(); ++i) {
+        x[0] = inputT[i][0];
+        x[1] = inputT[i][1];
+        x[2] = inputT[i][2];
+
+        res = M * x;
+        outputR[i][0] = res[0];
+        outputR[i][1] = res[1];
+        outputR[i][2] = res[2];
+      }
+      double t2r = vpTime::measureTimeMs();
+      timeNaive.push_back(t2r - t1r);
+
+      vpMatrix outputMM(3, vecsize);
+      double t14 = vpTime::measureTimeMs();
+
+      vpMatrix::mult2Matrices((vpMatrix)M, input, outputMM);
+      double t24 = vpTime::measureTimeMs();
+      timeMult.push_back(t24 - t14);
+
+      double errorT = (outputR - outputT).frobeniusNorm() / vecsize;
+      double error = (outputR - output.t()).frobeniusNorm() / vecsize;
+
+      if (errorT > 1e-10) {
+        std::cerr << "Transposed version failed" << std::endl;
+        FAIL();
+      }
+
+      if (error > 1e-10) {
+        std::cerr << "Optimized 3xN version failed" << std::endl;
+        FAIL();
+      }
+    }
+    std::cout << "Optimized (transposed) version took: " << vpMath::getMean(timeProjectTransposed) << " +- " << vpMath::getStdev(timeProjectTransposed) <<  "ms" << std::endl;
+    std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +- " << vpMath::getStdev(timeProject) <<  "ms" << std::endl;
+    std::cout << "Mult  version took: " << vpMath::getMean(timeMult) << " +- " << vpMath::getStdev(timeMult) << "ms" << std::endl;
+    std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +- " << vpMath::getStdev(timeNaive) <<  "ms" << std::endl;
+    std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl;
+  }
+}
+
 int main(int argc, char *argv[])
 {
   Catch::Session session;
diff --git a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
index cd8136b900..07282c2377 100644
--- a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
+++ b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h
@@ -151,7 +151,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
 
   void extractFeatures(const vpRBFeatureTrackerInput &frame, const vpRBFeatureTrackerInput &previousFrame, const vpHomogeneousMatrix &cMo) VP_OVERRIDE;
   void trackFeatures(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { }
-  void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { }
+  void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE;
   void computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int iteration) VP_OVERRIDE;
 
   void display(const vpCameraParameters &cam, const vpImage<unsigned char> &I, const vpImage<vpRGBa> &IRGB, const vpImage<unsigned char> &depth) const VP_OVERRIDE;
@@ -179,10 +179,8 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
       for (vpMatrix *m: matrices) {
         m->resize(3, numPoints, false, false);
       }
-      m_valid.resize(numPoints);
 
       for (unsigned int i = 0; i < numPoints; ++i) {
-        m_valid[i] = true;
         for (unsigned int j = 0; j < 3; ++j) {
           m_oXt[j][i] = points[i].oX[j];
           m_oNt[j][i] = points[i].objectNormal[j];
@@ -190,89 +188,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
         }
       }
     }
-    inline void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, const vpImage<float> &depth, vpColVector &e, vpMatrix &L)
-    {
-      const vpRotationMatrix cRo = cMo.getRotationMatrix();
-      const vpTranslationVector t = cMo.getTranslationVector();
-      vpMatrix::mult2Matrices(cRo, m_oXt, m_cXt);
-      vpMatrix::mult2Matrices(cRo, m_oNt, m_cNt);
-
-      const unsigned int numPoints = m_oXt.getCols();
-      e.resize(numPoints, false);
-      L.resize(numPoints, 6, false, false);
-
-#ifdef VISP_HAVE_OPENMP
-#pragma omp parallel for
-#endif
-      for (int i = 0; i < static_cast<int>(numPoints); ++i) {
-
-        //Step 1: update and filter out points that are no longer valid
-        {
-
-          for (unsigned int j = 0; j < 3; ++j) {
-            m_cXt[j][i] += t[j];
-          }
-          // Plane points away from the camera: this surface is no longer visible due to rotation
-          if (m_cNt[2][i] >= 0.0) {
-            m_valid[i] = false;
-            continue;
-          }
-          double x, y, u, v;
-          x = m_cXt[0][i] / m_cXt[2][i];
-          y = m_cXt[1][i] / m_cXt[2][i];
-
-          vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v);
-          // Point is no longer in image: depth value cannot be sampled
-          if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) {
-            m_valid[i] = false;
-            continue;
-          }
-          const double Z = depth[static_cast<unsigned int>(v)][static_cast<unsigned int>(u)];
-          // Z value in the depth image from the camera is invalid
-          if (Z <= 0.0) {
-            m_valid[i] = false;
-            continue;
-          }
-
-          m_valid[i] = true;
-          m_observations[0][i] = x * Z;
-          m_observations[1][i] = y * Z;
-          m_observations[2][i] = Z;
-        }
-        // Step 2: update Jacobian and error for valid points
-        {
-          const double X = m_cXt[0][i], Y = m_cXt[1][i], Z = m_cXt[2][i];
-          const double nX = m_cNt[0][i], nY = m_cNt[1][i], nZ = m_cNt[2][i];
+    void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo,
+       const vpImage<float> &depth, vpColVector &e, vpMatrix &L);
 
-          const double D = -((nX * X) + (nY  * Y) + (nZ * Z));
-          double projNormal = nX * m_observations[0][i] + nY  * m_observations[1][i] + nZ * m_observations[2][i];
-
-          e[i] = D + projNormal;
-
-          L[i][0] = nX;
-          L[i][1] = nY;
-          L[i][2] = nZ;
-          L[i][3] = nZ * Y - nY * Z;
-          L[i][4] = nX * Z - nZ * X;
-          L[i][5] = nY * X - nX * Y;
-
-        }
-      }
-
-      // Disable invalid points
-      for (unsigned int i = 0; i < numPoints; ++i) {
-        if (!m_valid[i]) {
-          e[i] = 0.0;
-
-          L[i][0] = 0;
-          L[i][1] = 0;
-          L[i][2] = 0;
-          L[i][3] = 0;
-          L[i][4] = 0;
-          L[i][5] = 0;
-        }
-      }
-    }
 
     const vpMatrix &getPointsObject() const { return m_oXt; }
     const vpMatrix &getNormalsObject() const { return m_oNt; }
@@ -285,7 +203,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
     vpMatrix m_oNt; // a 3xN matrix containing the 3D normals on the object
     vpMatrix m_cXt;
     vpMatrix m_cNt;
-    std::vector<bool> m_valid;
+    std::vector<unsigned int> m_invalid;
 
   };
 
@@ -304,7 +222,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
       {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT, "weight"},
       {vpRBDenseDepthTracker::vpDisplayType::DT_ERROR, "error"},
       {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT_AND_ERROR, "weightAndError"}
-    });
+});
 
   virtual void loadJsonConfiguration(const nlohmann::json &j) VP_OVERRIDE
   {
@@ -330,6 +248,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker
   bool m_useMask;
   float m_minMaskConfidence;
   vpDisplayType m_displayType;
+
+  vpMatrix m_Lt;
+
 };
 
 END_VISP_NAMESPACE
diff --git a/modules/tracker/rbt/src/core/private/vpSIMDUtils.h b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h
new file mode 100644
index 0000000000..7d81d01a77
--- /dev/null
+++ b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h
@@ -0,0 +1,231 @@
+/*
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2025 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See https://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * SIMD utilities.
+ */
+
+/*!
+  \file vpSIMDUtils.h
+  \brief Header that defines and includes useful SIMD routines and macros
+*/
+
+#ifndef VP_SIMD_UTILS_H
+#define VP_SIMD_UTILS_H
+#include <visp3/core/vpConfig.h>
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#define VISP_HAVE_SSE2 1
+#endif
+
+#if defined __AVX2__
+#define VISP_HAVE_AVX2 1
+#endif
+
+#if defined __AVX__
+#define VISP_HAVE_AVX 1
+#endif
+
+// https://stackoverflow.com/a/40765925
+#if !defined(__FMA__) && defined(__AVX2__)
+#define __FMA__ 1
+#endif
+
+
+#if defined(__FMA__)
+#define VISP_HAVE_FMA
+#endif
+
+#if defined _WIN32 && defined(_M_ARM64)
+#define _ARM64_DISTINCT_NEON_TYPES
+#include <Intrin.h>
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__)
+#include <arm_neon.h>
+#define VISP_HAVE_NEON 1
+#else
+#define VISP_HAVE_NEON 0
+#endif
+
+#if VISP_HAVE_SSE2 && USE_SIMD_CODE
+#define USE_SSE 1
+#else
+#define USE_SSE 0
+#endif
+
+#if VISP_HAVE_NEON && USE_SIMD_CODE
+#define USE_NEON 1
+#else
+#define USE_NEON 0
+#endif
+
+namespace vpSIMD
+{
+#if defined(VISP_HAVE_AVX2)
+using Register = __m512d;
+
+inline constexpr int numLanes = 8;
+inline const Register add(const Register a, const Register b)
+{
+  return _mm512_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm512_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm512_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm512_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm512_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm512_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm512_storeu_pd(data, a);
+}
+
+#elif defined(VISP_HAVE_AVX)
+using Register = __m256d;
+inline const int numLanes = 4;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm256_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm256_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm256_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm256_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm256_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm256_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm256_storeu_pd(data, a);
+}
+
+#elif VISP_HAVE_SSE2
+using Register = __m128d;
+inline const int numLanes = 2;
+
+inline Register add(const Register a, const Register b)
+{
+  return _mm_add_pd(a, b);
+}
+
+inline Register sub(const Register a, const Register b)
+{
+  return _mm_sub_pd(a, b);
+}
+
+inline Register mul(const Register a, const Register b)
+{
+  return _mm_mul_pd(a, b);
+}
+
+inline Register fma(const Register a, const Register b, const Register c)
+{
+#if defined(VISP_HAVE_FMA)
+  return _mm_fmadd_pd(a, b, c);
+#else
+  return add(mul(a, b), c);
+#endif
+}
+
+inline Register loadu(const double *const data)
+{
+  return _mm_loadu_pd(data);
+}
+
+inline Register set1(double v)
+{
+  return _mm_set1_pd(v);
+}
+
+inline void storeu(double *data, const Register a)
+{
+  _mm_storeu_pd(data, a);
+}
+
+#endif
+
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // VP_SIMD_UTILS_H
diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
index 2a69b7bfa7..384a3d6119 100644
--- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
+++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp
@@ -32,6 +32,8 @@
 #include <visp3/core/vpMeterPixelConversion.h>
 #include <visp3/core/vpDisplay.h>
 
+#include "../core/private/vpSIMDUtils.h"
+
 #ifdef VISP_HAVE_OPENMP
 #include <omp.h>
 #endif
@@ -172,6 +174,17 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame
   }
 }
 
+void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/)
+{
+  m_LTL.resize(6, 6, false, false);
+  m_LTR.resize(6, false);
+  m_L.resize(m_numFeatures, 6, false, false);
+  m_Lt.resize(6, m_numFeatures, false, false);
+  m_error.resize(m_numFeatures, false);
+  m_weighted_error.resize(m_numFeatures, false);
+  m_weights.resize(m_numFeatures, false);
+}
+
 void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int /*iteration*/)
 {
   if (m_numFeatures == 0) {
@@ -185,13 +198,15 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
     return;
   }
 
-  m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_L);
+  m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt);
+  m_Lt.transpose(m_L);
 
   m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01);
   m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights);
 
-
-
+// #if defined(VISP_HAVE_OPENMP)
+// #pragma omp parallel for
+// #endif
   for (unsigned int i = 0; i < m_depthPoints.size(); ++i) {
     m_weighted_error[i] = m_error[i] * m_weights[i];
     m_covWeightDiag[i] = m_weights[i] * m_weights[i];
@@ -199,12 +214,145 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame,
       m_L[i][dof] *= m_weights[i];
     }
   }
-
-  m_LTL = m_L.AtA();
+  m_LTL.resize(6, 6);
+  m_L.AtA(m_LTL);
   computeJTR(m_L, m_weighted_error, m_LTR);
   m_vvsConverged = false;
 }
 
+void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end)
+{
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp parallel for
+#endif
+  for (unsigned int i = start; i < end; ++i) {
+    const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i];
+    const double nX = cNt[0][i], nY = cNt[1][i], nZ = cNt[2][i];
+
+    const double D = -((nX * X) + (nY  * Y) + (nZ * Z));
+    const double projNormal = nX * obsT[0][i] + nY  * obsT[1][i] + nZ * obsT[2][i];
+
+    e[i] = D + projNormal;
+
+    Lt[0][i] = nX;
+    Lt[1][i] = nY;
+    Lt[2][i] = nZ;
+    Lt[3][i] = nZ * Y - nY * Z;
+    Lt[4][i] = nX * Z - nZ * X;
+    Lt[5][i] = nY * X - nX * Y;
+  }
+}
+
+#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
+void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const  vpMatrix &cNt, const  vpMatrix &obsT, vpColVector &e, vpMatrix &Lt)
+{
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp parallel for schedule(static, 1024)
+#endif
+  for (int i = 0; i <= static_cast<int>(cXt.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) {
+    const vpSIMD::Register X = vpSIMD::loadu(cXt[0] + i), Y = vpSIMD::loadu(cXt[1] + i), Z = vpSIMD::loadu(cXt[2] + i);
+    const vpSIMD::Register nX = vpSIMD::loadu(cNt[0] + i), nY = vpSIMD::loadu(cNt[1] + i), nZ = vpSIMD::loadu(cNt[2] + i);
+    const vpSIMD::Register obsX = vpSIMD::loadu(obsT[0] + i), obsY = vpSIMD::loadu(obsT[1] + i), obsZ = vpSIMD::loadu(obsT[2] + i);
+
+    vpSIMD::Register D = vpSIMD::mul(nX, X);
+    vpSIMD::Register projNormal = vpSIMD::mul(obsX, X);
+    D = vpSIMD::fma(nY, Y, D);
+    D = vpSIMD::fma(nZ, Z, D);
+
+    projNormal = vpSIMD::fma(nY, obsY, projNormal);
+    projNormal = vpSIMD::fma(nZ, obsZ, projNormal);
+
+    vpSIMD::storeu(e.data + i, vpSIMD::sub(projNormal, D));
+
+    vpSIMD::storeu(Lt[0] + i, nX);
+    vpSIMD::storeu(Lt[1] + i, nY);
+    vpSIMD::storeu(Lt[2] + i, nZ);
+
+    vpSIMD::storeu(Lt[3] + i, vpSIMD::sub(vpSIMD::mul(nZ, Y), vpSIMD::mul(nY, Z)));
+    vpSIMD::storeu(Lt[4] + i, vpSIMD::sub(vpSIMD::mul(nX, Z), vpSIMD::mul(nZ, X)));
+    vpSIMD::storeu(Lt[5] + i, vpSIMD::sub(vpSIMD::mul(nY, X), vpSIMD::mul(nX, Y)));
+  }
+  errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes, cXt.getCols());
+}
+#endif
+
+void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix(
+  const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo,
+  const vpImage<float> &depth, vpColVector &e, vpMatrix &Lt)
+{
+  const vpRotationMatrix cRo = cMo.getRotationMatrix();
+  cMo.project(m_oXt, m_cXt, false);
+  cRo.rotateVectors(m_oNt, m_cNt, false);
+  m_invalid.clear();
+
+  const unsigned int numPoints = m_oXt.getCols();
+  e.resize(numPoints, false);
+  Lt.resize(6, numPoints, false, false);
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp parallel
+#endif
+  {
+    std::vector<unsigned int> localInvalid;
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp for
+#endif
+
+    for (int i = 0; i < static_cast<int>(numPoints); ++i) {
+      //Step 1: update and filter out points that are no longer valid
+      {
+        // Plane points away from the camera: this surface is no longer visible due to rotation
+        if (m_cNt[2][i] >= 0.0) {
+          localInvalid.push_back(i);
+          continue;
+        }
+        double x, y, u, v;
+        x = m_cXt[0][i] / m_cXt[2][i];
+        y = m_cXt[1][i] / m_cXt[2][i];
+        vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v);
+        // Point is no longer in image: depth value cannot be sampled
+        if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) {
+          localInvalid.push_back(i);
+          continue;
+        }
+        const double Z = depth[static_cast<unsigned int>(v)][static_cast<unsigned int>(u)];
+        // Z value in the depth image from the camera is invalid
+        if (Z <= 0.0) {
+          localInvalid.push_back(i);
+          continue;
+        }
+        m_observations[0][i] = x * Z;
+        m_observations[1][i] = y * Z;
+        m_observations[2][i] = Z;
+      }
+    }
+#if defined(VISP_HAVE_OPENMP)
+#pragma omp critical
+    {
+      m_invalid.insert(m_invalid.end(), localInvalid.begin(), localInvalid.end());
+    }
+#else
+    m_invalid = std::move(localInvalid);
+#endif
+  }
+
+#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2)
+  errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt);
+#else
+  errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints);
+#endif
+  // Disable invalid points
+  for (unsigned int i : m_invalid) {
+    e[i] = 0.0;
+
+    Lt[0][i] = 0;
+    Lt[1][i] = 0;
+    Lt[2][i] = 0;
+    Lt[3][i] = 0;
+    Lt[4][i] = 0;
+    Lt[5][i] = 0;
+  }
+}
+
 void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage<unsigned char> &/*I*/,
                                     const vpImage<vpRGBa> &/*IRGB*/, const vpImage<unsigned char> &depth) const
 {