From 711fe778134c49c2f19e3862b628fc8fe6d3ef1d Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Wed, 26 Nov 2025 18:48:48 +0100 Subject: [PATCH 01/24] SIMD homogeneous matrix multiplicaiton --- .../include/visp3/core/vpHomogeneousMatrix.h | 2 + .../transformation/vpHomogeneousMatrix.cpp | 153 +++++++++++++++++- .../core/test/math/catchHomogeneousMatrix.cpp | 63 ++++++++ 3 files changed, 215 insertions(+), 3 deletions(-) diff --git a/modules/core/include/visp3/core/vpHomogeneousMatrix.h b/modules/core/include/visp3/core/vpHomogeneousMatrix.h index 76950d9a82..d5a7b696ee 100644 --- a/modules/core/include/visp3/core/vpHomogeneousMatrix.h +++ b/modules/core/include/visp3/core/vpHomogeneousMatrix.h @@ -381,6 +381,8 @@ class VISP_EXPORT vpHomogeneousMatrix : public vpArray2D vpHomogeneousMatrix &operator<<(double val); vpHomogeneousMatrix &operator,(double val); + void project(const vpMatrix &inputPoints, vpMatrix &outputPoints) const; + void orthogonalizeRotation(); void print() const; diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 8e17bea6a5..98222aeb0d 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -43,8 +43,130 @@ #include #include + + + +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#include +#include + +#define VISP_HAVE_SSE2 1 +#endif + +// https://stackoverflow.com/a/40765925 +#if !defined(__FMA__) && defined(__AVX2__) +#define __FMA__ 1 +#endif + +#if defined _WIN32 && defined(_M_ARM64) +#define _ARM64_DISTINCT_NEON_TYPES +#include +#include +#define VISP_HAVE_NEON 1 +#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) +#include +#define VISP_HAVE_NEON 1 +#else +#define VISP_HAVE_NEON 0 +#endif + +#define USE_SIMD_CODE 1 + +#if VISP_HAVE_SSE2 && USE_SIMD_CODE +#define USE_SSE 1 +#else +#define USE_SSE 0 +#endif + +#if VISP_HAVE_NEON && USE_SIMD_CODE +#define USE_NEON 1 +#else +#define USE_NEON 0 +#endif + + + + BEGIN_VISP_NAMESPACE const unsigned int vpHomogeneousMatrix::constr_value_4 = 4; + +#if defined(__AVX__) +inline +double hsum_double_avx(__m256d v) +{ + __m128d vlow = _mm256_castpd256_pd128(v); + __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128 + vlow = _mm_add_pd(vlow, vhigh); // reduce down to 128 + __m128d high64 = _mm_unpackhi_pd(vlow, vlow); + return _mm_cvtsd_f64(_mm_add_sd(vlow, high64)); // reduce to scalar +} +#endif + +void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output) const +{ + output.resize(input.getRows(), input.getCols(), false); + if (input.getCols() != 3) { + throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); + } + double *inputData = input.data; + double *outputData = output.data; +#if (USE_SSE) + __m128d rows[] = { + _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), + _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), + _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2) + }; + for (unsigned int i = 0; i < input.getRows(); ++i) { + + __m128d r1 = _mm_set_pd(inputData[1], inputData[0]); + __m128d r2 = _mm_set_pd(1.0, inputData[2]); + + for (unsigned int j = 0; j < 3; ++j) { + __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); +#if !defined(__FMA__) + __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); + __m128d add = _mm_add_pd(m1, m2); +#else + __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); +#endif + __m128d sum = _mm_hadd_pd(add, add); + outputData[j] = _mm_cvtsd_f64(sum); + } + inputData += 3; + outputData += 3; + } + +#elif defined(__AVX__) + __m256d rows[] = { + _mm256_loadu_pd(rowPtrs[0]), + _mm256_loadu_pd(rowPtrs[1]), + _mm256_loadu_pd(rowPtrs[2]) + }; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + __m256d r = _mm256_set_pd(1.0, inputData[2], inputData[1], inputData[0]); + for (unsigned int j = 0; j < 3; ++j) { + __m256d mulres = _mm256_mul_pd(rows[j], r); + output[i][j] = hsum_double_avx(mulres); + } + inputData += 3; + } + +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3]; + output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3]; + output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3]; + inputData += 3; + } +#endif +} + + /*! Construct an homogeneous matrix from a translation vector and quaternion rotation vector. @@ -651,16 +773,41 @@ vpColVector vpHomogeneousMatrix::operator*(const vpColVector &v) const "(%dx1) column vector", v.getRows())); } - vpColVector p(rowNum); + vpColVector p(rowNum, 0.0); + +#if USE_SSE + __m128d rows[] = { + _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), + _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), + _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2), + _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2) + + }; + + __m128d r1 = _mm_loadu_pd(v.data); + __m128d r2 = _mm_loadu_pd(v.data + 2); + + for (unsigned int j = 0; j < 4; ++j) { + __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); +#if !defined(__FMA__) + __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); + __m128d add = _mm_add_pd(m1, m2); +#else + __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); +#endif + __m128d sum = _mm_hadd_pd(add, add); + p[j] = _mm_cvtsd_f64(sum); + } + - p = 0.0; +#else for (unsigned int j = 0; j < val_4; ++j) { for (unsigned int i = 0; i < val_4; ++i) { p[i] += rowPtrs[i][j] * v[j]; } } - +#endif return p; } diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index af819ab093..9a41b7c0f1 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -38,6 +38,8 @@ */ #include +#include + #if defined(VISP_HAVE_CATCH2) #include @@ -224,6 +226,67 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]") CHECK(success); } +TEST_CASE("Point projection", "project") +{ + std::vector NS = { 1, 10, 100, 1000, 10000 }; + for (unsigned int N : NS) { + std::cout << "Running for N = " << N << std::endl; + std::vector timeProject, timeMult, timeNaive; + vpUniRand r(42); + for (unsigned int trial = 0; trial < 100000; ++trial) { + vpHomogeneousMatrix M; + vpMatrix input(N, 3); + for (unsigned int i = 0; i< N; ++i) { + input[i][0] = r.uniform(0.0, 1.0); + input[i][1] = r.uniform(0.0, 1.0); + input[i][2] = r.uniform(0.0, 1.0); + } + vpMatrix output(N, 3); + + double t1 = vpTime::measureTimeMs(); + M.project(input, output); + double t2 = vpTime::measureTimeMs(); + timeProject.push_back(t2 - t1); + + + vpColVector x(4, 1); + + vpMatrix outputR(N, 3); + double t1r = vpTime::measureTimeMs(); + + for (unsigned int i = 0; i < input.getRows(); ++i) { + x[0] = input[i][0]; + x[1] = input[i][1]; + x[2] = input[i][2]; + + vpColVector res = M * x; + outputR[i][0] = res[0]; + outputR[i][1] = res[1]; + outputR[i][2] = res[2]; + } + double t2r = vpTime::measureTimeMs(); + timeNaive.push_back(t2r - t1r); + + + vpMatrix input4(4, N); + vpMatrix output4(4, N); + double t14 = vpTime::measureTimeMs(); + + vpMatrix::mult2Matrices((vpMatrix)M, input4, output4); + double t24 = vpTime::measureTimeMs(); + timeMult.push_back(t24 - t14); + + CHECK((outputR == output)); + } + std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) << "ms" << std::endl; + std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl; + std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) << "ms" << std::endl; + std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl; + + } + +} + int main(int argc, char *argv[]) { Catch::Session session; From 196660c49a471fe05c9a198b6fa10db2055a6d9b Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Thu, 27 Nov 2025 17:07:36 +0100 Subject: [PATCH 02/24] implement first version of simd for rotaiton matrix --- .../include/visp3/core/vpRotationMatrix.h | 2 + .../transformation/vpHomogeneousMatrix.cpp | 2 +- .../math/transformation/vpRotationMatrix.cpp | 109 ++++++++++++++++++ .../core/test/math/catchHomogeneousMatrix.cpp | 1 + modules/core/test/math/catchRotation.cpp | 68 +++++++++++ 5 files changed, 181 insertions(+), 1 deletion(-) diff --git a/modules/core/include/visp3/core/vpRotationMatrix.h b/modules/core/include/visp3/core/vpRotationMatrix.h index fbbf0b6f0e..96dc62969d 100644 --- a/modules/core/include/visp3/core/vpRotationMatrix.h +++ b/modules/core/include/visp3/core/vpRotationMatrix.h @@ -190,6 +190,8 @@ class VISP_EXPORT vpRotationMatrix : public vpArray2D void printVector(); + void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs) const; + /*! This function is not applicable to a rotation matrix that is always a 3-by-3 matrix. diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 98222aeb0d..354010c19c 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -104,7 +104,7 @@ double hsum_double_avx(__m256d v) void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output) const { - output.resize(input.getRows(), input.getCols(), false); + output.resize(input.getRows(), input.getCols(), false, false); if (input.getCols() != 3) { throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); } diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index 067db852cf..dd66958167 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -49,7 +49,116 @@ // Debug trace #include + +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#include +#include +#include + + +#define VISP_HAVE_SSE2 1 +#endif + +// https://stackoverflow.com/a/40765925 +#if !defined(__FMA__) && defined(__AVX2__) +#define __FMA__ 1 +#endif + +#if defined _WIN32 && defined(_M_ARM64) +#define _ARM64_DISTINCT_NEON_TYPES +#include +#include +#define VISP_HAVE_NEON 1 +#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) +#include +#define VISP_HAVE_NEON 1 +#else +#define VISP_HAVE_NEON 0 +#endif + +#define USE_SIMD_CODE 1 + +#if VISP_HAVE_SSE2 && USE_SIMD_CODE +#define USE_SSE 1 +#else +#define USE_SSE 0 +#endif + +#if VISP_HAVE_NEON && USE_SIMD_CODE +#define USE_NEON 1 +#else +#define USE_NEON 0 +#endif + + BEGIN_VISP_NAMESPACE + + +void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) const +{ + + + + output.resize(input.getRows(), input.getCols(), false, false); + if (input.getCols() != 3) { + throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); + } + double *inputData = input.data; + double *outputData = output.data; + +#if USE_SSE + __m128d rowStarts[] = { + _mm_loadu_pd(rowPtrs[0]), + _mm_loadu_pd(rowPtrs[1]), + _mm_loadu_pd(rowPtrs[2]) + }; + + __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]); + __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]); + + const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1 + const int dotProdFirstElemMask = 0x11; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + __m128d xy = _mm_loadu_pd(inputData); + __m128d zz = _mm_set_pd(inputData[2], inputData[2]); + // Perform dot products with the two first elements of each row + __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask); + __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask); + __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask); + + __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2); + + // Dot product between last columns of the rows and Z + __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols + __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one + + __m128d r3z = _mm_add_pd(r3, lastColz); + + _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy)); + + outputData[2] = _mm_cvtsd_f64(r3z); + inputData += 3; + outputData += 3; + } + + +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2]; + outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2]; + outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2]; + inputData += 3; + outputData += 3; + } +#endif +} + + const unsigned int vpRotationMatrix::constr_val_3 = 3; /*! Initialize the rotation matrix as identity. diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index 9a41b7c0f1..e3b6c00f20 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -250,6 +250,7 @@ TEST_CASE("Point projection", "project") vpColVector x(4, 1); + vpColVector res(4); vpMatrix outputR(N, 3); double t1r = vpTime::measureTimeMs(); diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 5031a2efcc..51455e8059 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -559,6 +559,74 @@ TEST_CASE("Default constructor", "[rotation matrix]") #endif } + + +TEST_CASE("Vector rotation", "project") +{ + std::vector NS = { 1, 10, 100, 1000, 10000 }; + for (unsigned int N : NS) { + std::cout << "Running for N = " << N << std::endl; + std::vector timeProject, timeMult, timeNaive; + vpUniRand r(42); + for (unsigned int trial = 0; trial < 100000; ++trial) { + vpRotationMatrix M; + vpMatrix input(N, 3); + for (unsigned int i = 0; i< N; ++i) { + input[i][0] = r.uniform(0.0, 1.0); + input[i][1] = r.uniform(0.0, 1.0); + input[i][2] = r.uniform(0.0, 1.0); + } + vpMatrix output(N, 3); + + double t1 = vpTime::measureTimeMs(); + M.rotateVectors(input, output); + double t2 = vpTime::measureTimeMs(); + timeProject.push_back(t2 - t1); + + + vpColVector x(3, 1); + vpColVector res(3); + vpMatrix outputR(N, 3); + double t1r = vpTime::measureTimeMs(); + + for (unsigned int i = 0; i < input.getRows(); ++i) { + x[0] = input[i][0]; + x[1] = input[i][1]; + x[2] = input[i][2]; + + res = M * x; + outputR[i][0] = res[0]; + outputR[i][1] = res[1]; + outputR[i][2] = res[2]; + } + double t2r = vpTime::measureTimeMs(); + timeNaive.push_back(t2r - t1r); + + + vpMatrix input4(3, N); + vpMatrix output4(3, N); + double t14 = vpTime::measureTimeMs(); + + vpMatrix::mult2Matrices((vpMatrix)M, input4, output4); + double t24 = vpTime::measureTimeMs(); + timeMult.push_back(t24 - t14); + + if (outputR != output) { + std::cerr << "Output diff = " << (output - outputR)<< std::endl; + + + } + + CHECK((outputR == output)); + } + std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) << "ms" << std::endl; + std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl; + std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) << "ms" << std::endl; + std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl; + + } +} + int main(int argc, char *argv[]) { Catch::Session session; From 729110a80b90448bbb75a4a2f4339e65fabedbd0 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Tue, 2 Dec 2025 13:39:57 +0100 Subject: [PATCH 03/24] AVX implem for rotation matmul --- .../math/transformation/vpRotationMatrix.cpp | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index dd66958167..e38ded51f4 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -55,7 +55,6 @@ #include #include - #define VISP_HAVE_SSE2 1 #endif @@ -96,9 +95,6 @@ BEGIN_VISP_NAMESPACE void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) const { - - - output.resize(input.getRows(), input.getCols(), false, false); if (input.getCols() != 3) { throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); @@ -106,7 +102,42 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) co double *inputData = input.data; double *outputData = output.data; -#if USE_SSE +#if defined(__AVX__) + __m256d rows[] = { + _mm256_set_pd(0.0, rowPtrs[2][0], rowPtrs[1][0], rowPtrs[0][0]), + _mm256_set_pd(0.0, rowPtrs[2][1], rowPtrs[1][1], rowPtrs[0][1]), + _mm256_set_pd(0.0, rowPtrs[2][2], rowPtrs[1][2], rowPtrs[0][2]), + }; + + double result[4]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + const __m256d xyz = _mm256_set_pd(0.0, inputData[2], inputData[1], inputData[0]); + +#if defined(__FMA__) + __m256d dp = _mm256_mul_pd(rows[0], xyz); + dp = _mm256_fmadd_pd(rows[1], xyz, dp); + dp = _mm256_fmadd_pd(rows[2], xyz, dp); +#else + __m256d col1 = _mm256_mul_pd(rows[0], xyz); + __m256d col2 = _mm256_mul_pd(rows[1], xyz); + __m256d col3 = _mm256_mul_pd(rows[2], xyz); + + __m256d dp = _mm256_add_pd(col1, col2); + dp = _mm256_add_pd(dp, col3); +#endif + + _mm256_storeu_pd(result, dp); + + outputData[0] = result[0]; + outputData[1] = result[1]; + outputData[2] = result[2]; + + inputData += 3; + outputData += 3; + } + +#elif USE_SSE __m128d rowStarts[] = { _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[1]), From e230a7218f714e84d2b84906756650f70622428b Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Tue, 2 Dec 2025 15:45:33 +0100 Subject: [PATCH 04/24] Fix intrinsics usage, performance improvement --- .../math/transformation/vpRotationMatrix.cpp | 83 +++++++++++++------ 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index e38ded51f4..cd8ee56d12 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -102,7 +102,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) co double *inputData = input.data; double *outputData = output.data; -#if defined(__AVX__) +#if defined(__AVX__) && 0 == 1 __m256d rows[] = { _mm256_set_pd(0.0, rowPtrs[2][0], rowPtrs[1][0], rowPtrs[0][0]), _mm256_set_pd(0.0, rowPtrs[2][1], rowPtrs[1][1], rowPtrs[0][1]), @@ -138,40 +138,75 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) co } #elif USE_SSE - __m128d rowStarts[] = { - _mm_loadu_pd(rowPtrs[0]), - _mm_loadu_pd(rowPtrs[1]), - _mm_loadu_pd(rowPtrs[2]) - }; - __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]); - __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]); - const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1 - const int dotProdFirstElemMask = 0x11; + __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]); + __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]); + __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]); - for (unsigned int i = 0; i < input.getRows(); ++i) { - __m128d xy = _mm_loadu_pd(inputData); - __m128d zz = _mm_set_pd(inputData[2], inputData[2]); - // Perform dot products with the two first elements of each row - __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask); - __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask); - __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask); + // Preload third column as scalars + double c0 = rowPtrs[0][2]; + double c1 = rowPtrs[1][2]; + double c2 = rowPtrs[2][2]; + + for (unsigned i = 0; i < input.getRows(); ++i) { - __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2); + const __m128d xy = _mm_loadu_pd(inputData); - // Dot product between last columns of the rows and Z - __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols - __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one + const double z = inputData[2]; + // const __m128d zz = _mm_set_pd(z, z); - __m128d r3z = _mm_add_pd(r3, lastColz); + __m128d mul0 = _mm_mul_pd(row01_lo, xy); + __m128d mul1 = _mm_mul_pd(row01_hi, xy); + __m128d r01 = _mm_hadd_pd(mul0, mul1); + __m128d mul2 = _mm_mul_pd(row2_xy, xy); - _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy)); + // Adding scalars at the end without using simd is faster + // __m128d mulzz = _mm_mul_pd(row01_z, zz); + // r01 = _mm_add_pd(r01, mulzz); + + // store result + outputData[0] = _mm_cvtsd_f64(r01) + c0 * z; + outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z; + outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z; - outputData[2] = _mm_cvtsd_f64(r3z); inputData += 3; outputData += 3; } + // __m128d rowStarts[] = { + // _mm_loadu_pd(rowPtrs[0]), + // _mm_loadu_pd(rowPtrs[1]), + // _mm_loadu_pd(rowPtrs[2]) + // }; + + // __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]); + // __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]); + + // const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1 + // const int dotProdFirstElemMask = 0x11; + + // for (unsigned int i = 0; i < input.getRows(); ++i) { + // __m128d xy = _mm_loadu_pd(inputData); + // __m128d zz = _mm_set_pd(inputData[2], inputData[2]); + // // Perform dot products with the two first elements of each row + // __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask); + // __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask); + // __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask); + + // __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2); + + // // Dot product between last columns of the rows and Z + // __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols + // __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one + + // __m128d r3z = _mm_add_pd(r3, lastColz); + + // _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy)); + + // outputData[2] = _mm_cvtsd_f64(r3z); + // inputData += 3; + // outputData += 3; + // } #else From 1ff0b576ebca50bc7f984f598aa6d22a81b653b1 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Wed, 3 Dec 2025 19:14:12 +0100 Subject: [PATCH 05/24] Simd version of the rbt dense depth, AVX matmul version for 3xN input matrices --- .../include/visp3/core/vpHomogeneousMatrix.h | 2 +- .../include/visp3/core/vpRotationMatrix.h | 2 +- .../transformation/vpHomogeneousMatrix.cpp | 350 ++++++++++++------ .../math/transformation/vpRotationMatrix.cpp | 293 ++++++++------- .../include/visp3/rbt/vpRBDenseDepthTracker.h | 93 +---- .../src/features/vpRBDenseDepthTracker.cpp | 268 ++++++++++++-- 6 files changed, 635 insertions(+), 373 deletions(-) diff --git a/modules/core/include/visp3/core/vpHomogeneousMatrix.h b/modules/core/include/visp3/core/vpHomogeneousMatrix.h index d5a7b696ee..68f33ed3b4 100644 --- a/modules/core/include/visp3/core/vpHomogeneousMatrix.h +++ b/modules/core/include/visp3/core/vpHomogeneousMatrix.h @@ -381,7 +381,7 @@ class VISP_EXPORT vpHomogeneousMatrix : public vpArray2D vpHomogeneousMatrix &operator<<(double val); vpHomogeneousMatrix &operator,(double val); - void project(const vpMatrix &inputPoints, vpMatrix &outputPoints) const; + void project(const vpMatrix &inputPoints, vpMatrix &outputPoints, bool isTransposed) const; void orthogonalizeRotation(); diff --git a/modules/core/include/visp3/core/vpRotationMatrix.h b/modules/core/include/visp3/core/vpRotationMatrix.h index 96dc62969d..38ec6f2a05 100644 --- a/modules/core/include/visp3/core/vpRotationMatrix.h +++ b/modules/core/include/visp3/core/vpRotationMatrix.h @@ -190,7 +190,7 @@ class VISP_EXPORT vpRotationMatrix : public vpArray2D void printVector(); - void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs) const; + void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs, bool isTransposed) const; /*! This function is not applicable to a rotation matrix that is always a diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 354010c19c..4867583025 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -42,128 +42,242 @@ #include #include #include +#include +BEGIN_VISP_NAMESPACE +const unsigned int vpHomogeneousMatrix::constr_value_4 = 4; +void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool transposed) const +{ + output.resize(input.getRows(), input.getCols(), false, false); + if (transposed) { -#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) -#include -#include + if (input.getCols() != 3) { + throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); + } + double *inputData = input.data; + double *outputData = output.data; +#if defined(VISP_HAVE_AVX) + __m256d rows[] = { + _mm256_loadu_pd(rowPtrs[0]), + _mm256_loadu_pd(rowPtrs[1]), + _mm256_loadu_pd(rowPtrs[2]), + _mm256_loadu_pd(rowPtrs[3]), -#define VISP_HAVE_SSE2 1 -#endif + }; -// https://stackoverflow.com/a/40765925 -#if !defined(__FMA__) && defined(__AVX2__) -#define __FMA__ 1 -#endif -#if defined _WIN32 && defined(_M_ARM64) -#define _ARM64_DISTINCT_NEON_TYPES -#include -#include -#define VISP_HAVE_NEON 1 -#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) -#include -#define VISP_HAVE_NEON 1 -#else -#define VISP_HAVE_NEON 0 -#endif + double result[4]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 1.0); + + const __m256d rs[] = { + _mm256_mul_pd(rows[0], xyzw), + _mm256_mul_pd(rows[1], xyzw), + _mm256_mul_pd(rows[2], xyzw), + }; + + __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]); + + + _mm256_storeu_pd(result, rs_half_sum1); + __m128d hi = _mm256_extractf128_pd(rs[2], 1); // [A3, A2] + __m128d lo = _mm256_castpd256_pd128(rs[2]); // [A1, A0] -#define USE_SIMD_CODE 1 + __m128d sum2 = _mm_add_pd(lo, hi); // [A3+A1, A2+A0] + __m128d sum1 = _mm_hadd_pd(sum2, sum2); // [total, total] + double total = _mm_cvtsd_f64(sum1); -#if VISP_HAVE_SSE2 && USE_SIMD_CODE -#define USE_SSE 1 + + outputData[0] = result[0] + result[2]; + outputData[1] = result[1] + result[3]; + outputData[2] = total; + + inputData += 3; + outputData += 3; + } + +#elif (VISP_HAVE_SSE2) + __m128d rows[] = { + _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), + _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), + _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2) + }; + for (unsigned int i = 0; i < input.getRows(); ++i) { + + __m128d r1 = _mm_set_pd(inputData[1], inputData[0]); + __m128d r2 = _mm_set_pd(1.0, inputData[2]); + + for (unsigned int j = 0; j < 3; ++j) { + __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); +#if !defined(VISP_HAVE_FMA) + __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); + __m128d add = _mm_add_pd(m1, m2); #else -#define USE_SSE 0 + __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); #endif - -#if VISP_HAVE_NEON && USE_SIMD_CODE -#define USE_NEON 1 + __m128d sum = _mm_hadd_pd(add, add); + outputData[j] = _mm_cvtsd_f64(sum); + } + inputData += 3; + outputData += 3; + } #else -#define USE_NEON 0 + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3]; + output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3]; + output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3]; + inputData += 3; + } +#endif + } + else { + if (input.getRows() != 3) { + throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); + } + double *inputX = input[0]; + double *inputY = input[1]; + double *inputZ = input[2]; + double *outputX = output[0]; + double *outputY = output[1]; + double *outputZ = output[2]; + +#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) + +#if defined(VISP_HAVE_AVX2) + + using Register = __m512d; + constexpr int numLanes = 8; + constexpr auto &set1 = _mm512_set1_pd; + constexpr auto &loadu = _mm512_loadu_pd; + constexpr auto &mul = _mm512_mul_pd; + constexpr auto &add = _mm512_add_pd; + constexpr auto &storeu = _mm512_storeu_pd; +#if defined(VISP_HAVE_FMA) + constexpr auto &fma = _mm512_fmadd_pd; +#endif +#elif defined(VISP_HAVE_AVX) + using Register = __m256d; + constexpr int numLanes = 4; + + constexpr auto &set1 = _mm256_set1_pd; + constexpr auto &loadu = _mm256_loadu_pd; + constexpr auto &mul = _mm256_mul_pd; + constexpr auto &add = _mm256_add_pd; + constexpr auto &storeu = _mm256_storeu_pd; +#if defined(VISP_HAVE_FMA) + constexpr auto &fma = _mm256_fmadd_pd; +#endif +#elif VISP_HAVE_SSE2 + using Register = __m128d; + constexpr int numLanes = 2; + + constexpr auto &set1 = _mm_set1_pd; + constexpr auto &loadu = _mm_loadu_pd; + constexpr auto &mul = _mm_mul_pd; + constexpr auto &add = _mm_add_pd; + constexpr auto &storeu = _mm_storeu_pd; +#if defined(VISP_HAVE_FMA) + constexpr auto &fma = _mm_fmadd_pd; +#endif #endif + Register elems[12]; + for (unsigned int i = 0; i < 12; ++i) { + elems[i] = set1(data[i]); + } + for (int i = 0; i <= static_cast(input.getCols()) - numLanes; i += numLanes) { + const Register x4 = loadu(inputX); + const Register y4 = loadu(inputY); + const Register z4 = loadu(inputZ); +#if defined(VISP_HAVE_FMA) + Register dp1 = mul(x4, elems[0]); + dp1 = fma(y4, elems[1], dp1); + dp1 = fma(z4, elems[2], dp1); + dp1 = add(elems[3], dp1); -BEGIN_VISP_NAMESPACE -const unsigned int vpHomogeneousMatrix::constr_value_4 = 4; + Register dp2 = mul(x4, elems[4]); + dp2 = fma(y4, elems[5], dp2); + dp2 = fma(z4, elems[6], dp2); + dp2 = add(elems[7], dp2); + + Register dp3 = mul(x4, elems[8]); + dp3 = fma(y4, elems[9], dp3); + dp3 = fma(z4, elems[10], dp3); + dp3 = add(elems[11], dp3); -#if defined(__AVX__) -inline -double hsum_double_avx(__m256d v) -{ - __m128d vlow = _mm256_castpd256_pd128(v); - __m128d vhigh = _mm256_extractf128_pd(v, 1); // high 128 - vlow = _mm_add_pd(vlow, vhigh); // reduce down to 128 - __m128d high64 = _mm_unpackhi_pd(vlow, vlow); - return _mm_cvtsd_f64(_mm_add_sd(vlow, high64)); // reduce to scalar -} -#endif -void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output) const -{ - output.resize(input.getRows(), input.getCols(), false, false); - if (input.getCols() != 3) { - throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); - } - double *inputData = input.data; - double *outputData = output.data; -#if (USE_SSE) - __m128d rows[] = { - _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), - _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), - _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2) - }; - for (unsigned int i = 0; i < input.getRows(); ++i) { - - __m128d r1 = _mm_set_pd(inputData[1], inputData[0]); - __m128d r2 = _mm_set_pd(1.0, inputData[2]); - - for (unsigned int j = 0; j < 3; ++j) { - __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); -#if !defined(__FMA__) - __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); - __m128d add = _mm_add_pd(m1, m2); #else - __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); + const Register muls[] = { + mul(x4, elems[0]), + mul(y4, elems[1]), + mul(z4, elems[2]), + + mul(x4, elems[4]), + mul(y4, elems[5]), + mul(z4, elems[6]), + + mul(x4, elems[8]), + mul(y4, elems[9]), + mul(z4, elems[10]), + }; + + Register dp1 = add(add(muls[0], muls[1]), muls[2]); + Register dp2 = add(add(muls[3], muls[4]), muls[5]); + Register dp3 = add(add(muls[6], muls[7]), muls[8]); + + dp1 = add(dp1, elems[3]); + dp2 = add(dp2, elems[7]); + dp3 = add(dp3, elems[11]); #endif - __m128d sum = _mm_hadd_pd(add, add); - outputData[j] = _mm_cvtsd_f64(sum); - } - inputData += 3; - outputData += 3; - } -#elif defined(__AVX__) - __m256d rows[] = { - _mm256_loadu_pd(rowPtrs[0]), - _mm256_loadu_pd(rowPtrs[1]), - _mm256_loadu_pd(rowPtrs[2]) - }; - - for (unsigned int i = 0; i < input.getRows(); ++i) { - __m256d r = _mm256_set_pd(1.0, inputData[2], inputData[1], inputData[0]); - for (unsigned int j = 0; j < 3; ++j) { - __m256d mulres = _mm256_mul_pd(rows[j], r); - output[i][j] = hsum_double_avx(mulres); + storeu(outputX, dp1); + storeu(outputY, dp2); + storeu(outputZ, dp3); + + + inputX += numLanes; inputY += numLanes; inputZ += numLanes; + outputX += numLanes; outputY += numLanes; outputZ += numLanes; + + } + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = (input.getCols() / numLanes) * numLanes; i < input.getCols(); ++i) { + // std::cout << "i = " << i << std::endl; + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3]; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3]; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3]; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; } - inputData += 3; - } #else - double *r0 = rowPtrs[0]; - double *r1 = rowPtrs[1]; - double *r2 = rowPtrs[2]; - - for (unsigned int i = 0; i < input.getRows(); ++i) { - output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3]; - output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3]; - output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3]; - inputData += 3; - } + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3]; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3]; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3]; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } #endif + + } } @@ -775,39 +889,39 @@ vpColVector vpHomogeneousMatrix::operator*(const vpColVector &v) const } vpColVector p(rowNum, 0.0); -#if USE_SSE - __m128d rows[] = { - _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), - _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), - _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2), - _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2) +// #if USE_SSE +// __m128d rows[] = { +// _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), +// _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), +// _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2), +// _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2) - }; +// }; - __m128d r1 = _mm_loadu_pd(v.data); - __m128d r2 = _mm_loadu_pd(v.data + 2); +// __m128d r1 = _mm_loadu_pd(v.data); +// __m128d r2 = _mm_loadu_pd(v.data + 2); - for (unsigned int j = 0; j < 4; ++j) { - __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); -#if !defined(__FMA__) - __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); - __m128d add = _mm_add_pd(m1, m2); -#else - __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); -#endif - __m128d sum = _mm_hadd_pd(add, add); - p[j] = _mm_cvtsd_f64(sum); - } +// for (unsigned int j = 0; j < 4; ++j) { +// __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); +// #if !defined(__FMA__) +// __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); +// __m128d add = _mm_add_pd(m1, m2); +// #else +// __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); +// #endif +// __m128d sum = _mm_hadd_pd(add, add); +// p[j] = _mm_cvtsd_f64(sum); +// } -#else +// #else for (unsigned int j = 0; j < val_4; ++j) { for (unsigned int i = 0; i < val_4; ++i) { p[i] += rowPtrs[i][j] * v[j]; } } -#endif +// #endif return p; } diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index cd8ee56d12..fa5e78e9c3 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -40,6 +40,8 @@ #include #include +#include + // Rotation classes #include @@ -49,179 +51,208 @@ // Debug trace #include +BEGIN_VISP_NAMESPACE -#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) -#include -#include -#include -#define VISP_HAVE_SSE2 1 -#endif +void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const +{ + output.resize(input.getRows(), input.getCols(), false, false); + if (isTransposed) { + if (input.getCols() != 3) { + throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); + } + double *inputData = input.data; + double *outputData = output.data; +#if defined(VISP_HAVE_AVX) + __m256d rows[] = { + _mm256_setr_pd(rowPtrs[0][0], rowPtrs[0][1], rowPtrs[0][2], 0.0), + _mm256_setr_pd(rowPtrs[1][0], rowPtrs[1][1], rowPtrs[1][2], 0.0), + _mm256_setr_pd(rowPtrs[2][0], rowPtrs[2][1], rowPtrs[2][2], 0.0), + }; -// https://stackoverflow.com/a/40765925 -#if !defined(__FMA__) && defined(__AVX2__) -#define __FMA__ 1 -#endif + double result[4]; -#if defined _WIN32 && defined(_M_ARM64) -#define _ARM64_DISTINCT_NEON_TYPES -#include -#include -#define VISP_HAVE_NEON 1 -#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) -#include -#define VISP_HAVE_NEON 1 -#else -#define VISP_HAVE_NEON 0 -#endif + for (unsigned int i = 0; i < input.getRows(); ++i) { + const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 0.0); -#define USE_SIMD_CODE 1 + const __m256d rs[] = { + _mm256_mul_pd(rows[0], xyzw), + _mm256_mul_pd(rows[1], xyzw), + _mm256_mul_pd(rows[2], xyzw), + }; -#if VISP_HAVE_SSE2 && USE_SIMD_CODE -#define USE_SSE 1 -#else -#define USE_SSE 0 -#endif + __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]); -#if VISP_HAVE_NEON && USE_SIMD_CODE -#define USE_NEON 1 -#else -#define USE_NEON 0 -#endif + _mm256_storeu_pd(result, rs_half_sum1); + __m128d hi = _mm256_extractf128_pd(rs[2], 1); // [A3, A2] + __m128d lo = _mm256_castpd256_pd128(rs[2]); // [A1, A0] -BEGIN_VISP_NAMESPACE + __m128d sum2 = _mm_add_pd(lo, hi); // [A3+A1, A2+A0] + __m128d sum1 = _mm_hadd_pd(sum2, sum2); // [total, total] + double total = _mm_cvtsd_f64(sum1); -void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output) const -{ - output.resize(input.getRows(), input.getCols(), false, false); - if (input.getCols() != 3) { - throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); - } - double *inputData = input.data; - double *outputData = output.data; + outputData[0] = result[0] + result[2]; + outputData[1] = result[1] + result[3]; + outputData[2] = total; -#if defined(__AVX__) && 0 == 1 - __m256d rows[] = { - _mm256_set_pd(0.0, rowPtrs[2][0], rowPtrs[1][0], rowPtrs[0][0]), - _mm256_set_pd(0.0, rowPtrs[2][1], rowPtrs[1][1], rowPtrs[0][1]), - _mm256_set_pd(0.0, rowPtrs[2][2], rowPtrs[1][2], rowPtrs[0][2]), - }; + inputData += 3; + outputData += 3; + } - double result[4]; +#elif VISP_HAVE_SSE2 - for (unsigned int i = 0; i < input.getRows(); ++i) { - const __m256d xyz = _mm256_set_pd(0.0, inputData[2], inputData[1], inputData[0]); -#if defined(__FMA__) - __m256d dp = _mm256_mul_pd(rows[0], xyz); - dp = _mm256_fmadd_pd(rows[1], xyz, dp); - dp = _mm256_fmadd_pd(rows[2], xyz, dp); -#else - __m256d col1 = _mm256_mul_pd(rows[0], xyz); - __m256d col2 = _mm256_mul_pd(rows[1], xyz); - __m256d col3 = _mm256_mul_pd(rows[2], xyz); + __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]); + __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]); + __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]); - __m256d dp = _mm256_add_pd(col1, col2); - dp = _mm256_add_pd(dp, col3); -#endif + // Preload third column as scalars + double c0 = rowPtrs[0][2]; + double c1 = rowPtrs[1][2]; + double c2 = rowPtrs[2][2]; - _mm256_storeu_pd(result, dp); - outputData[0] = result[0]; - outputData[1] = result[1]; - outputData[2] = result[2]; + for (unsigned i = 0; i < input.getRows(); ++i) { - inputData += 3; - outputData += 3; - } + const __m128d xy = _mm_loadu_pd(inputData); -#elif USE_SSE + const double z = inputData[2]; + // const __m128d zz = _mm_set_pd(z, z); + __m128d mul0 = _mm_mul_pd(row01_lo, xy); + __m128d mul1 = _mm_mul_pd(row01_hi, xy); + __m128d r01 = _mm_hadd_pd(mul0, mul1); + __m128d mul2 = _mm_mul_pd(row2_xy, xy); - __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]); - __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]); - __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]); + // Adding scalars at the end without using simd is faster + // __m128d mulzz = _mm_mul_pd(row01_z, zz); + // r01 = _mm_add_pd(r01, mulzz); - // Preload third column as scalars - double c0 = rowPtrs[0][2]; - double c1 = rowPtrs[1][2]; - double c2 = rowPtrs[2][2]; + // store result + outputData[0] = _mm_cvtsd_f64(r01) + c0 * z; + outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z; + outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z; - for (unsigned i = 0; i < input.getRows(); ++i) { + inputData += 3; + outputData += 3; + } - const __m128d xy = _mm_loadu_pd(inputData); +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2]; + outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2]; + outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2]; + inputData += 3; + outputData += 3; + } +#endif + } - const double z = inputData[2]; - // const __m128d zz = _mm_set_pd(z, z); - __m128d mul0 = _mm_mul_pd(row01_lo, xy); - __m128d mul1 = _mm_mul_pd(row01_hi, xy); - __m128d r01 = _mm_hadd_pd(mul0, mul1); - __m128d mul2 = _mm_mul_pd(row2_xy, xy); - // Adding scalars at the end without using simd is faster - // __m128d mulzz = _mm_mul_pd(row01_z, zz); - // r01 = _mm_add_pd(r01, mulzz); + else { // 3xN matrix + if (input.getRows() != 3) { + throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); + } + double *inputX = input[0]; + double *inputY = input[1]; + double *inputZ = input[2]; + double *outputX = output[0]; + double *outputY = output[1]; + double *outputZ = output[2]; - // store result - outputData[0] = _mm_cvtsd_f64(r01) + c0 * z; - outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z; - outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z; - inputData += 3; - outputData += 3; - } - // __m128d rowStarts[] = { - // _mm_loadu_pd(rowPtrs[0]), - // _mm_loadu_pd(rowPtrs[1]), - // _mm_loadu_pd(rowPtrs[2]) - // }; +#if defined(VISP_HAVE_AVX) - // __m128d lastColStart = _mm_set_pd(rowPtrs[1][2], rowPtrs[0][2]); - // __m128d lastColEnd = _mm_set_pd(rowPtrs[2][2], rowPtrs[2][2]); + __m256d elems[9]; + for (unsigned int i = 0; i < 9; ++i) { + elems[i] = _mm256_set1_pd(data[i]); + } + for (int i = 0; i <= static_cast(input.getCols()) - 4; i += 4) { + const __m256d x4 = _mm256_loadu_pd(inputX); + const __m256d y4 = _mm256_loadu_pd(inputY); + const __m256d z4 = _mm256_loadu_pd(inputZ); + +#if defined(VISP_HAVE_FMA) + __m256d dp1 = _mm256_mul_pd(x4, elems[0]); + dp1 = _mm256_fmadd_pd(y4, elems[1], dp1); + dp1 = _mm256_fmadd_pd(z4, elems[2], dp1); + __m256d dp2 = _mm256_mul_pd(x4, elems[3]); + dp2 = _mm256_fmadd_pd(y4, elems[4], dp2); + dp2 = _mm256_fmadd_pd(z4, elems[5], dp2); + __m256d dp3 = _mm256_mul_pd(x4, elems[6]); + dp3 = _mm256_fmadd_pd(y4, elems[7], dp3); + dp3 = _mm256_fmadd_pd(z4, elems[8], dp3); - // const int dotProdFullMask = 0x33; // 0x3: use both input floats, 0x2: store into lane 1 - // const int dotProdFirstElemMask = 0x11; +#else + const __m256d muls = { + _mm256_mul_pd(x4, elems[0]), + _mm256_mul_pd(y4, elems[1]), + _mm256_mul_pd(z4, elems[2]), + + _mm256_mul_pd(x4, elems[3]), + _mm256_mul_pd(y4, elems[4]), + _mm256_mul_pd(z4, elems[5]), + + _mm256_mul_pd(x4, elems[6]), + _mm256_mul_pd(y4, elems[7]), + _mm256_mul_pd(z4, elems[8]), + } - // for (unsigned int i = 0; i < input.getRows(); ++i) { - // __m128d xy = _mm_loadu_pd(inputData); - // __m128d zz = _mm_set_pd(inputData[2], inputData[2]); - // // Perform dot products with the two first elements of each row - // __m128d r1 = _mm_dp_pd(xy, rowStarts[0], dotProdFullMask); - // __m128d r2 = _mm_dp_pd(xy, rowStarts[1], dotProdFullMask); - // __m128d r3 = _mm_dp_pd(xy, rowStarts[2], dotProdFullMask); + __m256d dp1 = _mm256_add_pd(_mm256_add_pd(muls[0], muls[1]), muls[2]); + __m256d dp2 = _mm256_add_pd(_mm256_add_pd(muls[3], muls[4]), muls[5]); + __m256d dp3 = _mm256_add_pd(_mm256_add_pd(muls[6], muls[7]), muls[8]); - // __m128d r12xy = _mm_shuffle_pd(r1, r2, 0x2); - // // Dot product between last columns of the rows and Z - // __m128d lastColxy = _mm_dp_pd(zz, lastColStart, dotProdFullMask); // First two cols - // __m128d lastColz = _mm_dp_pd(zz, lastColEnd, dotProdFirstElemMask); // Last one +#endif - // __m128d r3z = _mm_add_pd(r3, lastColz); + _mm256_storeu_pd(outputX, dp1); + _mm256_storeu_pd(outputY, dp2); + _mm256_storeu_pd(outputZ, dp3); - // _mm_storeu_pd(outputData, _mm_add_pd(r12xy, lastColxy)); - // outputData[2] = _mm_cvtsd_f64(r3z); - // inputData += 3; - // outputData += 3; - // } + inputX += 4; inputY += 4; inputZ += 4; + outputX += 4; outputY += 4; outputZ += 4; + } + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = (input.getCols() / 4) * 4; i < input.getCols(); ++i) { + // std::cout << "i = " << i << std::endl; + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } #else - double *r0 = rowPtrs[0]; - double *r1 = rowPtrs[1]; - double *r2 = rowPtrs[2]; - - for (unsigned int i = 0; i < input.getRows(); ++i) { - outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2]; - outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2]; - outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2]; - inputData += 3; - outputData += 3; - } + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } #endif + } + + + } diff --git a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h index cd8136b900..9539dc6aa1 100644 --- a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h +++ b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h @@ -179,10 +179,8 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker for (vpMatrix *m: matrices) { m->resize(3, numPoints, false, false); } - m_valid.resize(numPoints); for (unsigned int i = 0; i < numPoints; ++i) { - m_valid[i] = true; for (unsigned int j = 0; j < 3; ++j) { m_oXt[j][i] = points[i].oX[j]; m_oNt[j][i] = points[i].objectNormal[j]; @@ -190,89 +188,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker } } } - inline void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, const vpImage &depth, vpColVector &e, vpMatrix &L) - { - const vpRotationMatrix cRo = cMo.getRotationMatrix(); - const vpTranslationVector t = cMo.getTranslationVector(); - vpMatrix::mult2Matrices(cRo, m_oXt, m_cXt); - vpMatrix::mult2Matrices(cRo, m_oNt, m_cNt); - - const unsigned int numPoints = m_oXt.getCols(); - e.resize(numPoints, false); - L.resize(numPoints, 6, false, false); - -#ifdef VISP_HAVE_OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < static_cast(numPoints); ++i) { - - //Step 1: update and filter out points that are no longer valid - { - - for (unsigned int j = 0; j < 3; ++j) { - m_cXt[j][i] += t[j]; - } - // Plane points away from the camera: this surface is no longer visible due to rotation - if (m_cNt[2][i] >= 0.0) { - m_valid[i] = false; - continue; - } - double x, y, u, v; - x = m_cXt[0][i] / m_cXt[2][i]; - y = m_cXt[1][i] / m_cXt[2][i]; - - vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v); - // Point is no longer in image: depth value cannot be sampled - if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) { - m_valid[i] = false; - continue; - } - const double Z = depth[static_cast(v)][static_cast(u)]; - // Z value in the depth image from the camera is invalid - if (Z <= 0.0) { - m_valid[i] = false; - continue; - } - - m_valid[i] = true; - m_observations[0][i] = x * Z; - m_observations[1][i] = y * Z; - m_observations[2][i] = Z; - } - // Step 2: update Jacobian and error for valid points - { - const double X = m_cXt[0][i], Y = m_cXt[1][i], Z = m_cXt[2][i]; - const double nX = m_cNt[0][i], nY = m_cNt[1][i], nZ = m_cNt[2][i]; + void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, + const vpImage &depth, vpColVector &e, vpMatrix &L); - const double D = -((nX * X) + (nY * Y) + (nZ * Z)); - double projNormal = nX * m_observations[0][i] + nY * m_observations[1][i] + nZ * m_observations[2][i]; - - e[i] = D + projNormal; - - L[i][0] = nX; - L[i][1] = nY; - L[i][2] = nZ; - L[i][3] = nZ * Y - nY * Z; - L[i][4] = nX * Z - nZ * X; - L[i][5] = nY * X - nX * Y; - - } - } - - // Disable invalid points - for (unsigned int i = 0; i < numPoints; ++i) { - if (!m_valid[i]) { - e[i] = 0.0; - - L[i][0] = 0; - L[i][1] = 0; - L[i][2] = 0; - L[i][3] = 0; - L[i][4] = 0; - L[i][5] = 0; - } - } - } const vpMatrix &getPointsObject() const { return m_oXt; } const vpMatrix &getNormalsObject() const { return m_oNt; } @@ -285,7 +203,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker vpMatrix m_oNt; // a 3xN matrix containing the 3D normals on the object vpMatrix m_cXt; vpMatrix m_cNt; - std::vector m_valid; + std::vector m_invalid; }; @@ -304,7 +222,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT, "weight"}, {vpRBDenseDepthTracker::vpDisplayType::DT_ERROR, "error"}, {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT_AND_ERROR, "weightAndError"} - }); +}); virtual void loadJsonConfiguration(const nlohmann::json &j) VP_OVERRIDE { @@ -330,6 +248,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker bool m_useMask; float m_minMaskConfidence; vpDisplayType m_displayType; + + vpMatrix m_Lt; + }; END_VISP_NAMESPACE diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 2a69b7bfa7..bd17100062 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #ifdef VISP_HAVE_OPENMP @@ -185,7 +186,12 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, return; } - m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_L); + double t1 = vpTime::measureTimeMs(); + m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt); + m_L = m_Lt.t(); + + double t2 = vpTime::measureTimeMs(); + std::cout << "Update took: " << (t2 - t1) << std::endl; m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); @@ -199,53 +205,243 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, m_L[i][dof] *= m_weights[i]; } } - m_LTL = m_L.AtA(); computeJTR(m_L, m_weighted_error, m_LTR); m_vvsConverged = false; } -void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage &/*I*/, - const vpImage &/*IRGB*/, const vpImage &depth) const +void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end) { - switch (m_displayType) { - case DT_SIMPLE: - { - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(0, 255, 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); - } - break; +#if defined(VISP_HAVE_OPENMP) +#pragma omp parallel for +#endif + for (unsigned int i = start; i < end; ++i) { + const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i]; + const double nX = cNt[0][i], nY = cNt[1][i], nZ = cNt[2][i]; + + const double D = -((nX * X) + (nY * Y) + (nZ * Z)); + const double projNormal = nX * obsT[0][i] + nY * obsT[1][i] + nZ * obsT[2][i]; + + e[i] = D + projNormal; + + Lt[0][i] = nX; + Lt[1][i] = nY; + Lt[2][i] = nZ; + Lt[3][i] = nZ * Y - nY * Z; + Lt[4][i] = nX * Z - nZ * X; + Lt[5][i] = nY * X - nX * Y; } - case DT_WEIGHT: - { - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(0, static_cast(m_weights[i] * 255), 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); - } - break; + +} + + +#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) +void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt) +{ + +#if defined(VISP_HAVE_AVX2) + + using Register = __m512d; + constexpr int numLanes = 8; + constexpr auto &loadu = _mm512_loadu_pd; + constexpr auto &mul = _mm512_mul_pd; + constexpr auto &sub = _mm512_sub_pd; + constexpr auto &storeu = _mm512_storeu_pd; +#if defined(VISP_HAVE_FMA) + constexpr auto &fma = _mm512_fmadd_pd; +#else + constexpr auto &add = _mm512_add_pd; +#endif +#elif defined(VISP_HAVE_AVX) + using Register = __m256d; + constexpr int numLanes = 4; + + constexpr auto &loadu = _mm256_loadu_pd; + constexpr auto &mul = _mm256_mul_pd; + constexpr auto &sub = _mm256_sub_pd; + + constexpr auto &storeu = _mm256_storeu_pd; +#if defined(VISP_HAVE_FMA) + constexpr auto &fma = _mm256_fmadd_pd; +#else + constexpr auto &add = _mm256_add_pd; +#endif +#elif VISP_HAVE_SSE2 + using Register = __m128d; + constexpr int numLanes = 2; + + constexpr auto &loadu = _mm_loadu_pd; + constexpr auto &mul = _mm_mul_pd; + constexpr auto &sub = _mm_sub_pd; + + constexpr auto &storeu = _mm_storeu_pd; +#if defined(VISP_HAVE_FMA) + constexpr auto &fma = _mm_fmadd_pd; +#else + constexpr auto &add = _mm_add_pd; +#endif +#endif + + for (int i = 0; i <= static_cast(cXt.getCols()) - numLanes; i += numLanes) { + const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i); + const Register nX = loadu(cNt[0] + i), nY = loadu(cNt[1] + i), nZ = loadu(cNt[2] + i); + const Register obsX = loadu(obsT[0] + i), obsY = loadu(obsT[1] + i), obsZ = loadu(obsT[2] + i); + + Register D = mul(nX, X); + Register projNormal = mul(obsX, X); +#if defined(VISP_HAVE_FMA) + D = fma(nY, Y, D); + D = fma(nZ, Z, D); + + projNormal = fma(nY, obsY, projNormal); + projNormal = fma(nZ, obsZ, projNormal); +#else + D = add(mul(nY, Y), D); + D = add(mul(nZ, Z), D); + + projNormal = add(mul(nY, obsY), projNormal); + projNormal = add(mul(nZ, obsZ), projNormal); +#endif + storeu(e.data + i, sub(projNormal, D)); + + storeu(Lt[0] + i, nX); + storeu(Lt[1] + i, nY); + storeu(Lt[2] + i, nZ); + + storeu(Lt[3] + i, sub(mul(nZ, Y), mul(nY, Z))); + storeu(Lt[4] + i, sub(mul(nX, Z), mul(nZ, X))); + storeu(Lt[5] + i, sub(mul(nY, X), mul(nX, Y))); } - case DT_ERROR: + errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / numLanes) * numLanes, cXt.getCols()); +} +#endif + + + +void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( + const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, + const vpImage &depth, vpColVector &e, vpMatrix &Lt) +{ + const vpRotationMatrix cRo = cMo.getRotationMatrix(); + cMo.project(m_oXt, m_cXt, false); + cRo.rotateVectors(m_oNt, m_cNt, false); + + m_invalid.clear(); + + const unsigned int numPoints = m_oXt.getCols(); + e.resize(numPoints, false); + Lt.resize(6, numPoints, false, false); + +#if defined(VISP_HAVE_OPENMP) +#pragma omp parallel { - double maxError = m_error.getMaxValue(); - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(static_cast((m_error[i] / maxError) * 255), 0, 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); +#endif + std::vector localInvalid; +#if defined(VISP_HAVE_OPENMP) +#pragma omp for +#endif + + for (int i = 0; i < static_cast(numPoints); ++i) { + //Step 1: update and filter out points that are no longer valid + { + // Plane points away from the camera: this surface is no longer visible due to rotation + if (m_cNt[2][i] >= 0.0) { + localInvalid.push_back(i); + continue; + } + double x, y, u, v; + x = m_cXt[0][i] / m_cXt[2][i]; + y = m_cXt[1][i] / m_cXt[2][i]; + + vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v); + // Point is no longer in image: depth value cannot be sampled + if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) { + localInvalid.push_back(i); + continue; + } + const double Z = depth[static_cast(v)][static_cast(u)]; + // Z value in the depth image from the camera is invalid + if (Z <= 0.0) { + localInvalid.push_back(i); + continue; + } + + m_observations[0][i] = x * Z; + m_observations[1][i] = y * Z; + m_observations[2][i] = Z; + } + } +#if defined(VISP_HAVE_OPENMP) +#pragma omp critical + { + m_invalid.insert(m_invalid.end(), localInvalid.begin(), localInvalid.end()); } - break; } - case DT_WEIGHT_AND_ERROR: +#else + m_invalid = std::move(localInvalid); +#endif + + +#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) + errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt); +#else + errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints); +#endif + + // Disable invalid points + for (unsigned int i : m_invalid) { + e[i] = 0.0; + + Lt[0][i] = 0; + Lt[1][i] = 0; + Lt[2][i] = 0; + Lt[3][i] = 0; + Lt[4][i] = 0; + Lt[5][i] = 0; + } +} + + void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage &/*I*/, + const vpImage &/*IRGB*/, const vpImage &depth) const { - double maxError = m_error.getMaxValue(); - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(static_cast((m_error[i] / maxError) * 255.0), static_cast(m_weights[i] * 255), 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); + switch (m_displayType) { + case DT_SIMPLE: + { + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(0, 255, 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); + } + break; + } + case DT_WEIGHT: + { + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(0, static_cast(m_weights[i] * 255), 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); + } + break; + } + case DT_ERROR: + { + double maxError = m_error.getMaxValue(); + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(static_cast((m_error[i] / maxError) * 255), 0, 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); + } + break; + } + case DT_WEIGHT_AND_ERROR: + { + double maxError = m_error.getMaxValue(); + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(static_cast((m_error[i] / maxError) * 255.0), static_cast(m_weights[i] * 255), 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); + } + break; + } + default: + throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented"); } - break; - } - default: - throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented"); } -} -END_VISP_NAMESPACE + END_VISP_NAMESPACE From 690c718bf77a06e905f232ff81325c61e528f030 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Wed, 3 Dec 2025 19:14:57 +0100 Subject: [PATCH 06/24] Export SIMD intrinsics utils in a separate header --- modules/core/include/visp3/core/vpSIMDUtils.h | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/core/include/visp3/core/vpSIMDUtils.h diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h new file mode 100644 index 0000000000..0342fbf06b --- /dev/null +++ b/modules/core/include/visp3/core/vpSIMDUtils.h @@ -0,0 +1,89 @@ +/* + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2024 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See https://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Rotation matrix. + */ + +/*! + \file vpSIMDUtils.h + \brief Header that defines and includes useful SIMD routines and macros +*/ +#include + +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#include +#include +#include + +#define VISP_HAVE_SSE2 1 +#endif + +#if defined __AVX2__ +#define VISP_HAVE_AVX2 1 +#endif + +#if defined __AVX__ +#define VISP_HAVE_AVX 1 +#endif + +// https://stackoverflow.com/a/40765925 +#if !defined(__FMA__) && defined(__AVX2__) +#define __FMA__ 1 +#endif + + +#if defined(__FMA__) +#define VISP_HAVE_FMA +#endif + +#if defined _WIN32 && defined(_M_ARM64) +#define _ARM64_DISTINCT_NEON_TYPES +#include +#include +#define VISP_HAVE_NEON 1 +#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) +#include +#define VISP_HAVE_NEON 1 +#else +#define VISP_HAVE_NEON 0 +#endif + + +#if VISP_HAVE_SSE2 && USE_SIMD_CODE +#define USE_SSE 1 +#else +#define USE_SSE 0 +#endif + +#if VISP_HAVE_NEON && USE_SIMD_CODE +#define USE_NEON 1 +#else +#define USE_NEON 0 +#endif From 4c88d39b0cf4ab4c124cfacbb8fa68b862047ae3 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Thu, 4 Dec 2025 17:56:04 +0100 Subject: [PATCH 07/24] Fix test, improve vpRBDenseDepth --- modules/core/include/visp3/core/vpSIMDUtils.h | 145 ++++++++++++++++++ .../transformation/vpHomogeneousMatrix.cpp | 76 +-------- .../core/test/math/catchHomogeneousMatrix.cpp | 62 ++++++-- modules/core/test/math/catchRotation.cpp | 61 +++++--- .../src/features/vpRBDenseDepthTracker.cpp | 82 ++++------ 5 files changed, 262 insertions(+), 164 deletions(-) diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h index 0342fbf06b..397def00a8 100644 --- a/modules/core/include/visp3/core/vpSIMDUtils.h +++ b/modules/core/include/visp3/core/vpSIMDUtils.h @@ -35,6 +35,9 @@ \file vpSIMDUtils.h \brief Header that defines and includes useful SIMD routines and macros */ + +#ifndef VP_SIMD_UTILS_H +#define VP_SIMD_UTILS_H #include #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) @@ -87,3 +90,145 @@ #else #define USE_NEON 0 #endif + +namespace vpSIMD +{ +#if defined(VISP_HAVE_AVX2) +using Register = __m512d; + +inline const int numLanes = 8; +inline Register add(const Register a, const Register b) +{ + return _mm512_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm512_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm512_mul_pd(a, b); +} + + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm512_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm512_loadu_pd(data); +} + + +inline Register set1(double v) +{ + return _mm512_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm512_storeu_pd(data, a); +} + +#elif defined(VISP_HAVE_AVX) +using Register = __m256d; +inline const int numLanes = 4; + +inline Register add(const Register a, const Register b) +{ + return _mm256_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm256_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm256_mul_pd(a, b); +} + + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm256_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm256_loadu_pd(data); +} + + +inline Register set1(double v) +{ + return _mm256_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm256_storeu_pd(data, a); +} + +#elif VISP_HAVE_SSE2 +using Register = __m128d; +inline const int numLanes = 2; + +inline Register add(const Register a, const Register b) +{ + return _mm_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm_mul_pd(a, b); +} + + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm_storeu_pd(data, a); +} + +#endif + +} + +#endif // VP_SIMD_UTILS_H diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 4867583025..9501e533b2 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -150,43 +150,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) -#if defined(VISP_HAVE_AVX2) - - using Register = __m512d; - constexpr int numLanes = 8; - constexpr auto &set1 = _mm512_set1_pd; - constexpr auto &loadu = _mm512_loadu_pd; - constexpr auto &mul = _mm512_mul_pd; - constexpr auto &add = _mm512_add_pd; - constexpr auto &storeu = _mm512_storeu_pd; -#if defined(VISP_HAVE_FMA) - constexpr auto &fma = _mm512_fmadd_pd; -#endif -#elif defined(VISP_HAVE_AVX) - using Register = __m256d; - constexpr int numLanes = 4; - - constexpr auto &set1 = _mm256_set1_pd; - constexpr auto &loadu = _mm256_loadu_pd; - constexpr auto &mul = _mm256_mul_pd; - constexpr auto &add = _mm256_add_pd; - constexpr auto &storeu = _mm256_storeu_pd; -#if defined(VISP_HAVE_FMA) - constexpr auto &fma = _mm256_fmadd_pd; -#endif -#elif VISP_HAVE_SSE2 - using Register = __m128d; - constexpr int numLanes = 2; - - constexpr auto &set1 = _mm_set1_pd; - constexpr auto &loadu = _mm_loadu_pd; - constexpr auto &mul = _mm_mul_pd; - constexpr auto &add = _mm_add_pd; - constexpr auto &storeu = _mm_storeu_pd; -#if defined(VISP_HAVE_FMA) - constexpr auto &fma = _mm_fmadd_pd; -#endif -#endif + using namespace vpSIMD; Register elems[12]; for (unsigned int i = 0; i < 12; ++i) { @@ -198,52 +162,26 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool const Register y4 = loadu(inputY); const Register z4 = loadu(inputZ); -#if defined(VISP_HAVE_FMA) Register dp1 = mul(x4, elems[0]); - dp1 = fma(y4, elems[1], dp1); - dp1 = fma(z4, elems[2], dp1); + dp1 = vpSIMD::fma(y4, elems[1], dp1); + dp1 = vpSIMD::fma(z4, elems[2], dp1); dp1 = add(elems[3], dp1); Register dp2 = mul(x4, elems[4]); - dp2 = fma(y4, elems[5], dp2); - dp2 = fma(z4, elems[6], dp2); + dp2 = vpSIMD::fma(y4, elems[5], dp2); + dp2 = vpSIMD::fma(z4, elems[6], dp2); dp2 = add(elems[7], dp2); Register dp3 = mul(x4, elems[8]); - dp3 = fma(y4, elems[9], dp3); - dp3 = fma(z4, elems[10], dp3); + dp3 = vpSIMD::fma(y4, elems[9], dp3); + dp3 = vpSIMD::fma(z4, elems[10], dp3); dp3 = add(elems[11], dp3); -#else - const Register muls[] = { - mul(x4, elems[0]), - mul(y4, elems[1]), - mul(z4, elems[2]), - - mul(x4, elems[4]), - mul(y4, elems[5]), - mul(z4, elems[6]), - - mul(x4, elems[8]), - mul(y4, elems[9]), - mul(z4, elems[10]), - }; - - Register dp1 = add(add(muls[0], muls[1]), muls[2]); - Register dp2 = add(add(muls[3], muls[4]), muls[5]); - Register dp3 = add(add(muls[6], muls[7]), muls[8]); - - dp1 = add(dp1, elems[3]); - dp2 = add(dp2, elems[7]); - dp3 = add(dp3, elems[11]); -#endif - storeu(outputX, dp1); storeu(outputY, dp2); storeu(outputZ, dp3); - inputX += numLanes; inputY += numLanes; inputZ += numLanes; outputX += numLanes; outputY += numLanes; outputZ += numLanes; diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index e3b6c00f20..47aef0bb9f 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -231,23 +231,32 @@ TEST_CASE("Point projection", "project") std::vector NS = { 1, 10, 100, 1000, 10000 }; for (unsigned int N : NS) { std::cout << "Running for N = " << N << std::endl; - std::vector timeProject, timeMult, timeNaive; + std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; vpUniRand r(42); for (unsigned int trial = 0; trial < 100000; ++trial) { - vpHomogeneousMatrix M; - vpMatrix input(N, 3); + vpHomogeneousMatrix M( + r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), + r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI)); + vpMatrix inputT(N, 3); for (unsigned int i = 0; i< N; ++i) { - input[i][0] = r.uniform(0.0, 1.0); - input[i][1] = r.uniform(0.0, 1.0); - input[i][2] = r.uniform(0.0, 1.0); + inputT[i][0] = r.uniform(0.0, 1.0); + inputT[i][1] = r.uniform(0.0, 1.0); + inputT[i][2] = r.uniform(0.0, 1.0); } - vpMatrix output(N, 3); + vpMatrix input = inputT.t(); + vpMatrix outputT(N, 3); + vpMatrix output(3, N); + double t1 = vpTime::measureTimeMs(); - M.project(input, output); + M.project(inputT, outputT, true); double t2 = vpTime::measureTimeMs(); - timeProject.push_back(t2 - t1); + timeProjectTransposed.push_back(t2 - t1); + t1 = vpTime::measureTimeMs(); + M.project(input, output, false); + t2 = vpTime::measureTimeMs(); + timeProject.push_back(t2 - t1); vpColVector x(4, 1); vpColVector res(4); @@ -255,10 +264,10 @@ TEST_CASE("Point projection", "project") vpMatrix outputR(N, 3); double t1r = vpTime::measureTimeMs(); - for (unsigned int i = 0; i < input.getRows(); ++i) { - x[0] = input[i][0]; - x[1] = input[i][1]; - x[2] = input[i][2]; + for (unsigned int i = 0; i < inputT.getRows(); ++i) { + x[0] = inputT[i][0]; + x[1] = inputT[i][1]; + x[2] = inputT[i][2]; vpColVector res = M * x; outputR[i][0] = res[0]; @@ -270,6 +279,12 @@ TEST_CASE("Point projection", "project") vpMatrix input4(4, N); + for (unsigned int i = 0; i < inputT.getRows(); ++i) { + input4[0][i] = inputT[i][0]; + input4[1][i] = inputT[i][1]; + input4[2][i] = inputT[i][2]; + input4[3][i] = 1; + } vpMatrix output4(4, N); double t14 = vpTime::measureTimeMs(); @@ -277,12 +292,29 @@ TEST_CASE("Point projection", "project") double t24 = vpTime::measureTimeMs(); timeMult.push_back(t24 - t14); - CHECK((outputR == output)); + double errorT = (outputR - outputT).frobeniusNorm() / N; + double error = (outputR - output.t()).frobeniusNorm() / N; + + if (errorT > 1e-10 || error > 1e-10) { + std::cout << "M = " << M << std::endl; + std::cerr << "Naive outputT = " << std::endl << outputR << std::endl; + std::cerr << "Mult outputT = " << std::endl << output4.t() << std::endl; + std::cerr << "Project output transposed version = " << std::endl << outputT << std::endl; + std::cerr << "Project output = " << std::endl << outputT<< std::endl; + + + std::cerr << "diff transposed = " << std::endl << outputR - outputT; + std::cerr << "diff = " << std::endl << outputR - output.t(); + + FAIL(); + } } + std::cout << "Optimized version (transposed) took: " << vpMath::getMean(timeProjectTransposed) << " +-" << vpMath::getStdev(timeProjectTransposed) << "ms" << std::endl; std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) << "ms" << std::endl; + std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl; std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) << "ms" << std::endl; - std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl; + std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl; } diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 51455e8059..00e396ee48 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -566,21 +566,27 @@ TEST_CASE("Vector rotation", "project") std::vector NS = { 1, 10, 100, 1000, 10000 }; for (unsigned int N : NS) { std::cout << "Running for N = " << N << std::endl; - std::vector timeProject, timeMult, timeNaive; + std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; vpUniRand r(42); for (unsigned int trial = 0; trial < 100000; ++trial) { - vpRotationMatrix M; - vpMatrix input(N, 3); + vpRotationMatrix M(r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI)); + vpMatrix inputT(N, 3); + for (unsigned int i = 0; i< N; ++i) { - input[i][0] = r.uniform(0.0, 1.0); - input[i][1] = r.uniform(0.0, 1.0); - input[i][2] = r.uniform(0.0, 1.0); + inputT[i][0] = r.uniform(0.0, 1.0); + inputT[i][1] = r.uniform(0.0, 1.0); + inputT[i][2] = r.uniform(0.0, 1.0); } - vpMatrix output(N, 3); - + vpMatrix input = inputT.t(); + vpMatrix outputT(N, 3); + vpMatrix output(3, N); double t1 = vpTime::measureTimeMs(); - M.rotateVectors(input, output); + M.rotateVectors(inputT, outputT, true); double t2 = vpTime::measureTimeMs(); + timeProjectTransposed.push_back(t2 - t1); + t1 = vpTime::measureTimeMs(); + M.rotateVectors(input, output, false); + t2 = vpTime::measureTimeMs(); timeProject.push_back(t2 - t1); @@ -589,10 +595,10 @@ TEST_CASE("Vector rotation", "project") vpMatrix outputR(N, 3); double t1r = vpTime::measureTimeMs(); - for (unsigned int i = 0; i < input.getRows(); ++i) { - x[0] = input[i][0]; - x[1] = input[i][1]; - x[2] = input[i][2]; + for (unsigned int i = 0; i < inputT.getRows(); ++i) { + x[0] = inputT[i][0]; + x[1] = inputT[i][1]; + x[2] = inputT[i][2]; res = M * x; outputR[i][0] = res[0]; @@ -602,27 +608,34 @@ TEST_CASE("Vector rotation", "project") double t2r = vpTime::measureTimeMs(); timeNaive.push_back(t2r - t1r); - - vpMatrix input4(3, N); - vpMatrix output4(3, N); + vpMatrix outputMM(3, N); double t14 = vpTime::measureTimeMs(); - vpMatrix::mult2Matrices((vpMatrix)M, input4, output4); + vpMatrix::mult2Matrices((vpMatrix)M, input, outputMM); double t24 = vpTime::measureTimeMs(); timeMult.push_back(t24 - t14); - if (outputR != output) { - std::cerr << "Output diff = " << (output - outputR)<< std::endl; + double errorT = (outputR - outputT).frobeniusNorm() / N; + double error = (outputR - output.t()).frobeniusNorm() / N; + if (errorT > 1e-10) { + std::cerr << "Transposed version failed" << std::endl; + FAIL(); + } + if (error > 1e-10) { + std::cerr << "Optimized 3xN version failed" << std::endl; + FAIL(); } - CHECK((outputR == output)); + + } - std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) << "ms" << std::endl; - std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl; - std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) << "ms" << std::endl; - std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::getMean(timeProject) << std::endl; + std::cout << "Optimized (transposed) version took: " << vpMath::getMean(timeProjectTransposed) << " +- " << vpMath::getStdev(timeProjectTransposed) << "ms" << std::endl; + std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +- " << vpMath::getStdev(timeProject) << "ms" << std::endl; + std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +- " << vpMath::getStdev(timeMult) << "ms" << std::endl; + std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +- " << vpMath::getStdev(timeNaive) << "ms" << std::endl; + std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl; } } diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index bd17100062..5fffe2e32d 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -193,11 +193,16 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, double t2 = vpTime::measureTimeMs(); std::cout << "Update took: " << (t2 - t1) << std::endl; + t1 = vpTime::measureTimeMs(); m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); - m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); + m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); + t2 = vpTime::measureTimeMs(); + std::cout << "M estimator took: " << (t2 - t1) << std::endl; + t1 = vpTime::measureTimeMs(); +#pragma omp parallel for for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { m_weighted_error[i] = m_error[i] * m_weights[i]; m_covWeightDiag[i] = m_weights[i] * m_weights[i]; @@ -205,15 +210,26 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, m_L[i][dof] *= m_weights[i]; } } - m_LTL = m_L.AtA(); + t2 = vpTime::measureTimeMs(); + std::cout << "weight mult took: " << (t2 - t1) << std::endl; + t1 = vpTime::measureTimeMs(); + m_LTL.resize(6, 6); + m_L.AtA(m_LTL); + t2 = vpTime::measureTimeMs(); + std::cout << "LTL took: " << (t2 - t1) << std::endl; + // m_LTR = m_Lt * m_weighted_error; + t1 = vpTime::measureTimeMs(); computeJTR(m_L, m_weighted_error, m_LTR); + t2 = vpTime::measureTimeMs(); + std::cout << "LTR took: " << (t2 - t1) << std::endl; + std::cout << std::endl; m_vvsConverged = false; } void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end) { #if defined(VISP_HAVE_OPENMP) -#pragma omp parallel for +#pragma omp parallel for schedule(static, 2048) #endif for (unsigned int i = start; i < end; ++i) { const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i]; @@ -231,7 +247,6 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con Lt[4][i] = nX * Z - nZ * X; Lt[5][i] = nY * X - nX * Y; } - } @@ -239,49 +254,11 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt) { -#if defined(VISP_HAVE_AVX2) - - using Register = __m512d; - constexpr int numLanes = 8; - constexpr auto &loadu = _mm512_loadu_pd; - constexpr auto &mul = _mm512_mul_pd; - constexpr auto &sub = _mm512_sub_pd; - constexpr auto &storeu = _mm512_storeu_pd; -#if defined(VISP_HAVE_FMA) - constexpr auto &fma = _mm512_fmadd_pd; -#else - constexpr auto &add = _mm512_add_pd; -#endif -#elif defined(VISP_HAVE_AVX) - using Register = __m256d; - constexpr int numLanes = 4; - - constexpr auto &loadu = _mm256_loadu_pd; - constexpr auto &mul = _mm256_mul_pd; - constexpr auto &sub = _mm256_sub_pd; - - constexpr auto &storeu = _mm256_storeu_pd; -#if defined(VISP_HAVE_FMA) - constexpr auto &fma = _mm256_fmadd_pd; -#else - constexpr auto &add = _mm256_add_pd; -#endif -#elif VISP_HAVE_SSE2 - using Register = __m128d; - constexpr int numLanes = 2; - - constexpr auto &loadu = _mm_loadu_pd; - constexpr auto &mul = _mm_mul_pd; - constexpr auto &sub = _mm_sub_pd; + using namespace vpSIMD; - constexpr auto &storeu = _mm_storeu_pd; -#if defined(VISP_HAVE_FMA) - constexpr auto &fma = _mm_fmadd_pd; -#else - constexpr auto &add = _mm_add_pd; -#endif +#if defined(VISP_HAVE_OPENMP) +#pragma omp parallel for #endif - for (int i = 0; i <= static_cast(cXt.getCols()) - numLanes; i += numLanes) { const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i); const Register nX = loadu(cNt[0] + i), nY = loadu(cNt[1] + i), nZ = loadu(cNt[2] + i); @@ -289,19 +266,12 @@ void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, co Register D = mul(nX, X); Register projNormal = mul(obsX, X); -#if defined(VISP_HAVE_FMA) - D = fma(nY, Y, D); - D = fma(nZ, Z, D); + D = vpSIMD::fma(nY, Y, D); + D = vpSIMD::fma(nZ, Z, D); - projNormal = fma(nY, obsY, projNormal); - projNormal = fma(nZ, obsZ, projNormal); -#else - D = add(mul(nY, Y), D); - D = add(mul(nZ, Z), D); + projNormal = vpSIMD::fma(nY, obsY, projNormal); + projNormal = vpSIMD::fma(nZ, obsZ, projNormal); - projNormal = add(mul(nY, obsY), projNormal); - projNormal = add(mul(nZ, obsZ), projNormal); -#endif storeu(e.data + i, sub(projNormal, D)); storeu(Lt[0] + i, nX); From 77adf9d73dce7b5fccfd8dcdb09f1df0b3784cd9 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Thu, 4 Dec 2025 18:00:38 +0100 Subject: [PATCH 08/24] Remove debug prints --- .../src/features/vpRBDenseDepthTracker.cpp | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 5fffe2e32d..7c28b552d6 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -190,19 +190,12 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt); m_L = m_Lt.t(); - double t2 = vpTime::measureTimeMs(); - std::cout << "Update took: " << (t2 - t1) << std::endl; - t1 = vpTime::measureTimeMs(); m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); - m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); - t2 = vpTime::measureTimeMs(); - std::cout << "M estimator took: " << (t2 - t1) << std::endl; - - t1 = vpTime::measureTimeMs(); - +#if defined(VISP_HAVE_OPENMP) #pragma omp parallel for +#endif for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { m_weighted_error[i] = m_error[i] * m_weights[i]; m_covWeightDiag[i] = m_weights[i] * m_weights[i]; @@ -210,19 +203,9 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, m_L[i][dof] *= m_weights[i]; } } - t2 = vpTime::measureTimeMs(); - std::cout << "weight mult took: " << (t2 - t1) << std::endl; - t1 = vpTime::measureTimeMs(); m_LTL.resize(6, 6); m_L.AtA(m_LTL); - t2 = vpTime::measureTimeMs(); - std::cout << "LTL took: " << (t2 - t1) << std::endl; - // m_LTR = m_Lt * m_weighted_error; - t1 = vpTime::measureTimeMs(); computeJTR(m_L, m_weighted_error, m_LTR); - t2 = vpTime::measureTimeMs(); - std::cout << "LTR took: " << (t2 - t1) << std::endl; - std::cout << std::endl; m_vvsConverged = false; } From 1f3c7c88881dc0dd7c4d11b324f7277fc4fff4f2 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Fri, 5 Dec 2025 15:38:16 +0100 Subject: [PATCH 09/24] Move initVVS to cpp file, resize matrix there --- .../include/visp3/rbt/vpRBDenseDepthTracker.h | 2 +- .../src/features/vpRBDenseDepthTracker.cpp | 139 ++++++++++-------- 2 files changed, 79 insertions(+), 62 deletions(-) diff --git a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h index 9539dc6aa1..07282c2377 100644 --- a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h +++ b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h @@ -151,7 +151,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker void extractFeatures(const vpRBFeatureTrackerInput &frame, const vpRBFeatureTrackerInput &previousFrame, const vpHomogeneousMatrix &cMo) VP_OVERRIDE; void trackFeatures(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { } - void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { } + void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE; void computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int iteration) VP_OVERRIDE; void display(const vpCameraParameters &cam, const vpImage &I, const vpImage &IRGB, const vpImage &depth) const VP_OVERRIDE; diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 7c28b552d6..87fc23c564 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef VISP_HAVE_OPENMP #include @@ -173,6 +174,17 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame } } +void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) +{ + m_LTL.resize(6, 6, false, false); + m_LTR.resize(6, false); + m_L.resize(m_numFeatures, 6, false, false); + m_Lt.resize(6, m_numFeatures, false, false); + m_error.resize(m_numFeatures, false); + m_weighted_error.resize(m_numFeatures, false); + m_weights.resize(m_numFeatures, false); +} + void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int /*iteration*/) { if (m_numFeatures == 0) { @@ -188,14 +200,21 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, double t1 = vpTime::measureTimeMs(); m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt); - m_L = m_Lt.t(); + m_Lt.transpose(m_L); + std::cout << "Update took: " << (vpTime::measureTimeMs() - t1) << std::endl; t1 = vpTime::measureTimeMs(); - m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); - m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); -#if defined(VISP_HAVE_OPENMP) -#pragma omp parallel for -#endif + // m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); + // m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); + + vpMbtTukeyEstimator robust; + // m_robust.setMinMedianAbsoluteDeviation(); + robust.MEstimator(m_error, m_weights, (frame.renders.zFar - frame.renders.zNear) * 0.01); + std::cout << "MEstimator took: " << (vpTime::measureTimeMs() - t1) << std::endl; + +// #if defined(VISP_HAVE_OPENMP) +// #pragma omp parallel for +// #endif for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { m_weighted_error[i] = m_error[i] * m_weights[i]; m_covWeightDiag[i] = m_weights[i] * m_weights[i]; @@ -240,7 +259,7 @@ void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, co using namespace vpSIMD; #if defined(VISP_HAVE_OPENMP) -#pragma omp parallel for +#pragma omp parallel for schedule(static, 1024) #endif for (int i = 0; i <= static_cast(cXt.getCols()) - numLanes; i += numLanes) { const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i); @@ -284,11 +303,10 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( const unsigned int numPoints = m_oXt.getCols(); e.resize(numPoints, false); Lt.resize(6, numPoints, false, false); - #if defined(VISP_HAVE_OPENMP) #pragma omp parallel - { #endif + { std::vector localInvalid; #if defined(VISP_HAVE_OPENMP) #pragma omp for @@ -329,72 +347,71 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( { m_invalid.insert(m_invalid.end(), localInvalid.begin(), localInvalid.end()); } - } #else m_invalid = std::move(localInvalid); #endif - + } #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) - errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt); + errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt); #else - errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints); + errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints); #endif - // Disable invalid points - for (unsigned int i : m_invalid) { - e[i] = 0.0; - - Lt[0][i] = 0; - Lt[1][i] = 0; - Lt[2][i] = 0; - Lt[3][i] = 0; - Lt[4][i] = 0; - Lt[5][i] = 0; - } + for (unsigned int i : m_invalid) { + e[i] = 0.0; + + Lt[0][i] = 0; + Lt[1][i] = 0; + Lt[2][i] = 0; + Lt[3][i] = 0; + Lt[4][i] = 0; + Lt[5][i] = 0; + } + } - void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage &/*I*/, - const vpImage &/*IRGB*/, const vpImage &depth) const +void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage &/*I*/, + const vpImage &/*IRGB*/, const vpImage &depth) const +{ + switch (m_displayType) { + case DT_SIMPLE: { - switch (m_displayType) { - case DT_SIMPLE: - { - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(0, 255, 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); - } - break; - } - case DT_WEIGHT: - { - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(0, static_cast(m_weights[i] * 255), 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); - } - break; + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(0, 255, 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); } - case DT_ERROR: - { - double maxError = m_error.getMaxValue(); - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(static_cast((m_error[i] / maxError) * 255), 0, 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); - } - break; + break; + } + case DT_WEIGHT: + { + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(0, static_cast(m_weights[i] * 255), 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); } - case DT_WEIGHT_AND_ERROR: - { - double maxError = m_error.getMaxValue(); - for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { - vpColor c(static_cast((m_error[i] / maxError) * 255.0), static_cast(m_weights[i] * 255), 0); - vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); - } - break; + break; + } + case DT_ERROR: + { + double maxError = m_error.getMaxValue(); + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(static_cast((m_error[i] / maxError) * 255), 0, 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); } - default: - throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented"); + break; + } + case DT_WEIGHT_AND_ERROR: + { + double maxError = m_error.getMaxValue(); + for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { + vpColor c(static_cast((m_error[i] / maxError) * 255.0), static_cast(m_weights[i] * 255), 0); + vpDisplay::displayPoint(depth, m_depthPoints[i].pixelPos[0], m_depthPoints[i].pixelPos[1], c, 1); } + break; } + default: + throw vpException(vpException::notImplementedError, "Depth tracker display type is invalid or not implemented"); + } +} - END_VISP_NAMESPACE +END_VISP_NAMESPACE From cc729b6115b9421133b4719fe862ec20b68ddf59 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Fri, 5 Dec 2025 18:20:16 +0100 Subject: [PATCH 10/24] Add ENABLE_NATIVE_ARCH option for gcc --- CMakeLists.txt | 3 +++ cmake/AddExtraCompilationFlags.cmake | 37 ++++++++++++++++------------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e208d9a3a..a7f52d161a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -381,6 +381,9 @@ VP_OPTION(ENABLE_SSSE3 "" "" "Enable SSSE3 instructions" "" ON IF ((MSVC OR CMAK if(X86_64) VP_OPTION(ENABLE_AVX "" "" "Enable AVX instructions" "" OFF) # should be explicitly enabled, used in matrix transpose code endif() +if(CMAKE_COMPILER_IS_GNUCXX) + VP_OPTION(ENABLE_NATIVE_ARCH "" "" "Enable all available CPU instruction sets" "" OFF) +endif() #---------------------------------------------------------------------- # BLAS / LAPACK diff --git a/cmake/AddExtraCompilationFlags.cmake b/cmake/AddExtraCompilationFlags.cmake index 35f0a269dc..4eba86f73d 100644 --- a/cmake/AddExtraCompilationFlags.cmake +++ b/cmake/AddExtraCompilationFlags.cmake @@ -171,25 +171,30 @@ if(CMAKE_COMPILER_IS_GNUCXX) add_extra_compiler_options("-fvisibility-inlines-hidden") endif() - if(ENABLE_AVX AND X86_64) - add_extra_compiler_options("-mavx") + if(ENABLE_NATIVE_ARCH) + add_extra_compiler_options("-march=native") else() - if(ENABLE_SSE2) - add_extra_compiler_options("-msse2") - elseif(X86 OR X86_64) - add_extra_compiler_options("-mno-sse2") - endif() - if(ENABLE_SSE3) - add_extra_compiler_options("-msse3") - elseif(X86 OR X86_64) - #add_extra_compiler_options("-mno-sse3") - endif() + if(ENABLE_AVX AND X86_64) + add_extra_compiler_options("-mavx") + else() + if(ENABLE_SSE2) + add_extra_compiler_options("-msse2") + elseif(X86 OR X86_64) + add_extra_compiler_options("-mno-sse2") + endif() + + if(ENABLE_SSE3) + add_extra_compiler_options("-msse3") + elseif(X86 OR X86_64) + #add_extra_compiler_options("-mno-sse3") + endif() - if(ENABLE_SSSE3) - add_extra_compiler_options("-mssse3") - elseif(X86 OR X86_64) - add_extra_compiler_options("-mno-ssse3") + if(ENABLE_SSSE3) + add_extra_compiler_options("-mssse3") + elseif(X86 OR X86_64) + add_extra_compiler_options("-mno-ssse3") + endif() endif() endif() From a05c32905fcd1e90481cea631195e6c7283758f7 Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Fri, 5 Dec 2025 18:21:08 +0100 Subject: [PATCH 11/24] Remove reference to MBT tukey estimator, disable prints --- modules/core/include/visp3/core/vpSIMDUtils.h | 4 +- modules/core/src/math/robust/vpRobust.cpp | 12 ++-- .../math/transformation/vpRotationMatrix.cpp | 66 +++++++------------ .../src/features/vpRBDenseDepthTracker.cpp | 21 ++---- 4 files changed, 38 insertions(+), 65 deletions(-) diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h index 397def00a8..2d08d4c787 100644 --- a/modules/core/include/visp3/core/vpSIMDUtils.h +++ b/modules/core/include/visp3/core/vpSIMDUtils.h @@ -96,8 +96,8 @@ namespace vpSIMD #if defined(VISP_HAVE_AVX2) using Register = __m512d; -inline const int numLanes = 8; -inline Register add(const Register a, const Register b) +inline constexpr int numLanes = 8; +inline const Register add(const Register a, const Register b) { return _mm512_add_pd(a, b); } diff --git a/modules/core/src/math/robust/vpRobust.cpp b/modules/core/src/math/robust/vpRobust.cpp index 27fe9159c6..56be0a0c59 100644 --- a/modules/core/src/math/robust/vpRobust.cpp +++ b/modules/core/src/math/robust/vpRobust.cpp @@ -107,13 +107,14 @@ vpRobust &vpRobust::operator=(const vpRobust &&other) void vpRobust::resize(unsigned int n_data) { if (n_data != m_size) { - m_normres.resize(n_data); - m_sorted_normres.resize(n_data); - m_sorted_residues.resize(n_data); + m_normres.resize(n_data, false); + m_sorted_normres.resize(n_data, false); + m_sorted_residues.resize(n_data, false); m_size = n_data; } } + // =================================================================== /*! @@ -143,6 +144,8 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector // Calculate median med = select(m_sorted_residues, 0, n_data - 1, ind_med); + + // med = select(m_sorted_residues, 0, n_data - 1, ind_med); // --comment: residualMedian = med // Normalize residues @@ -153,6 +156,7 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector // Calculate MAD normmedian = select(m_sorted_normres, 0, n_data - 1, ind_med); + // normalizedResidualMedian = normmedian ; // 1.48 keeps scale estimate consistent for a normal probability dist. m_mad = 1.4826 * normmedian; // median Absolute Deviation @@ -534,7 +538,7 @@ double vpRobust::constrainedChi(vpRobustEstimatorType method, double x) default: { throw(vpException(vpException::fatalError, "Unsupported robust estimator type in vpRobust::constrainedChi()")); } - } +} return -1; } diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index fa5e78e9c3..8808039ed0 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -157,6 +157,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo else { // 3xN matrix + using namespace vpSIMD; if (input.getRows() != 3) { throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); } @@ -168,64 +169,43 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo double *outputZ = output[2]; -#if defined(VISP_HAVE_AVX) +#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2) - __m256d elems[9]; + Register elems[9]; for (unsigned int i = 0; i < 9; ++i) { - elems[i] = _mm256_set1_pd(data[i]); + elems[i] = set1(data[i]); } - for (int i = 0; i <= static_cast(input.getCols()) - 4; i += 4) { - const __m256d x4 = _mm256_loadu_pd(inputX); - const __m256d y4 = _mm256_loadu_pd(inputY); - const __m256d z4 = _mm256_loadu_pd(inputZ); - -#if defined(VISP_HAVE_FMA) - __m256d dp1 = _mm256_mul_pd(x4, elems[0]); - dp1 = _mm256_fmadd_pd(y4, elems[1], dp1); - dp1 = _mm256_fmadd_pd(z4, elems[2], dp1); - __m256d dp2 = _mm256_mul_pd(x4, elems[3]); - dp2 = _mm256_fmadd_pd(y4, elems[4], dp2); - dp2 = _mm256_fmadd_pd(z4, elems[5], dp2); - __m256d dp3 = _mm256_mul_pd(x4, elems[6]); - dp3 = _mm256_fmadd_pd(y4, elems[7], dp3); - dp3 = _mm256_fmadd_pd(z4, elems[8], dp3); + for (int i = 0; i <= static_cast(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { + const Register x4 = loadu(inputX); + const Register y4 = loadu(inputY); + const Register z4 = loadu(inputZ); -#else - const __m256d muls = { - _mm256_mul_pd(x4, elems[0]), - _mm256_mul_pd(y4, elems[1]), - _mm256_mul_pd(z4, elems[2]), - - _mm256_mul_pd(x4, elems[3]), - _mm256_mul_pd(y4, elems[4]), - _mm256_mul_pd(z4, elems[5]), - - _mm256_mul_pd(x4, elems[6]), - _mm256_mul_pd(y4, elems[7]), - _mm256_mul_pd(z4, elems[8]), - } + Register dp1 = mul(x4, elems[0]); + dp1 = vpSIMD::fma(y4, elems[1], dp1); + dp1 = vpSIMD::fma(z4, elems[2], dp1); + Register dp2 = mul(x4, elems[3]); + dp2 = vpSIMD::fma(y4, elems[4], dp2); + dp2 = vpSIMD::fma(z4, elems[5], dp2); + Register dp3 = mul(x4, elems[6]); + dp3 = vpSIMD::fma(y4, elems[7], dp3); + dp3 = vpSIMD::fma(z4, elems[8], dp3); - __m256d dp1 = _mm256_add_pd(_mm256_add_pd(muls[0], muls[1]), muls[2]); - __m256d dp2 = _mm256_add_pd(_mm256_add_pd(muls[3], muls[4]), muls[5]); - __m256d dp3 = _mm256_add_pd(_mm256_add_pd(muls[6], muls[7]), muls[8]); -#endif - - _mm256_storeu_pd(outputX, dp1); - _mm256_storeu_pd(outputY, dp2); - _mm256_storeu_pd(outputZ, dp3); + storeu(outputX, dp1); + storeu(outputY, dp2); + storeu(outputZ, dp3); - inputX += 4; inputY += 4; inputZ += 4; - outputX += 4; outputY += 4; outputZ += 4; + inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes; + outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes; } double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; double *r2 = rowPtrs[2]; - for (unsigned int i = (input.getCols() / 4) * 4; i < input.getCols(); ++i) { + for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) { // std::cout << "i = " << i << std::endl; double X = *inputX, Y = *inputY, Z = *inputZ; *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z; diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 87fc23c564..81af464e3b 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -32,7 +32,6 @@ #include #include #include -#include #ifdef VISP_HAVE_OPENMP #include @@ -154,7 +153,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame } pointsPerThread[threadIdx] = std::move(localPoints); - } + } for (const std::vector &points: pointsPerThread) { m_depthPoints.insert(m_depthPoints.end(), std::make_move_iterator(points.begin()), std::make_move_iterator(points.end())); } @@ -172,7 +171,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame else { m_numFeatures = 0; } -} + } void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) { @@ -201,16 +200,10 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, double t1 = vpTime::measureTimeMs(); m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt); m_Lt.transpose(m_L); - std::cout << "Update took: " << (vpTime::measureTimeMs() - t1) << std::endl; t1 = vpTime::measureTimeMs(); - // m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); - // m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); - - vpMbtTukeyEstimator robust; - // m_robust.setMinMedianAbsoluteDeviation(); - robust.MEstimator(m_error, m_weights, (frame.renders.zFar - frame.renders.zNear) * 0.01); - std::cout << "MEstimator took: " << (vpTime::measureTimeMs() - t1) << std::endl; + m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); + m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); // #if defined(VISP_HAVE_OPENMP) // #pragma omp parallel for @@ -231,7 +224,7 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end) { #if defined(VISP_HAVE_OPENMP) -#pragma omp parallel for schedule(static, 2048) +#pragma omp parallel for #endif for (unsigned int i = start; i < end; ++i) { const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i]; @@ -297,7 +290,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( const vpRotationMatrix cRo = cMo.getRotationMatrix(); cMo.project(m_oXt, m_cXt, false); cRo.rotateVectors(m_oNt, m_cNt, false); - m_invalid.clear(); const unsigned int numPoints = m_oXt.getCols(); @@ -323,7 +315,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( double x, y, u, v; x = m_cXt[0][i] / m_cXt[2][i]; y = m_cXt[1][i] / m_cXt[2][i]; - vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v); // Point is no longer in image: depth value cannot be sampled if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) { @@ -336,7 +327,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( localInvalid.push_back(i); continue; } - m_observations[0][i] = x * Z; m_observations[1][i] = y * Z; m_observations[2][i] = Z; @@ -368,7 +358,6 @@ void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( Lt[4][i] = 0; Lt[5][i] = 0; } - } void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage &/*I*/, From d847bfb6990d54a2f5771923265499730a1e6a4a Mon Sep 17 00:00:00 2001 From: Samuel Felton Date: Sat, 6 Dec 2025 21:29:32 +0100 Subject: [PATCH 12/24] Fix SSE3 flag check --- modules/core/src/math/transformation/vpHomogeneousMatrix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 9501e533b2..0d218c5ea2 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -99,7 +99,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool outputData += 3; } -#elif (VISP_HAVE_SSE2) +#elif (VISP_HAVE_SSE3) __m128d rows[] = { _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), From c4bc02aa143749b7ccc07570aeba2c51c7fa4b2e Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 10:09:55 +0100 Subject: [PATCH 13/24] Update copyright headers --- CMakeLists.txt | 2 +- modules/core/include/visp3/core/vpSIMDUtils.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 986d7d4792..edb0af45da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ ############################################################################# # # ViSP, open source Visual Servoing Platform software. -# Copyright (C) 2005 - 2024 by Inria. All rights reserved. +# Copyright (C) 2005 - 2025 by Inria. All rights reserved. # # This software is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h index 2d08d4c787..b46c9c917e 100644 --- a/modules/core/include/visp3/core/vpSIMDUtils.h +++ b/modules/core/include/visp3/core/vpSIMDUtils.h @@ -1,6 +1,6 @@ /* * ViSP, open source Visual Servoing Platform software. - * Copyright (C) 2005 - 2024 by Inria. All rights reserved. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. * * This software is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,7 +28,7 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Rotation matrix. + * SIMD utilities. */ /*! From 49e8c83bc6ef229294b3b15146afa296d9f3e227 Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 10:11:50 +0100 Subject: [PATCH 14/24] Fix warning unused variable modules/core/src/math/transformation/vpHomogeneousMatrix.cpp:60:13: warning: unused variable 'outputData' [-Wunused-variable] 60 | double *outputData = output.data; | ^~~~~~~~~~ --- .../core/src/math/transformation/vpHomogeneousMatrix.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 0d218c5ea2..6a8022732c 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -57,8 +57,8 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); } double *inputData = input.data; - double *outputData = output.data; #if defined(VISP_HAVE_AVX) + double *outputData = output.data; __m256d rows[] = { _mm256_loadu_pd(rowPtrs[0]), _mm256_loadu_pd(rowPtrs[1]), @@ -100,6 +100,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool } #elif (VISP_HAVE_SSE3) + double *outputData = output.data; __m128d rows[] = { _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), @@ -123,7 +124,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool } inputData += 3; outputData += 3; - } + } #else double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; @@ -136,7 +137,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool inputData += 3; } #endif - } +} else { if (input.getRows() != 3) { throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); From ae5966b6dd25697766b13f50f6feced7ec7bfcfb Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 10:22:53 +0100 Subject: [PATCH 15/24] Remove useless empty lines --- modules/core/include/visp3/core/vpSIMDUtils.h | 6 ------ modules/core/src/math/robust/vpRobust.cpp | 3 +-- .../transformation/vpHomogeneousMatrix.cpp | 21 +++++++------------ .../math/transformation/vpRotationMatrix.cpp | 21 +------------------ .../core/test/math/catchHomogeneousMatrix.cpp | 5 +---- modules/core/test/math/catchRotation.cpp | 6 ------ .../src/features/vpRBDenseDepthTracker.cpp | 8 ++----- 7 files changed, 13 insertions(+), 57 deletions(-) diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h index b46c9c917e..a54f3c5550 100644 --- a/modules/core/include/visp3/core/vpSIMDUtils.h +++ b/modules/core/include/visp3/core/vpSIMDUtils.h @@ -78,7 +78,6 @@ #define VISP_HAVE_NEON 0 #endif - #if VISP_HAVE_SSE2 && USE_SIMD_CODE #define USE_SSE 1 #else @@ -112,7 +111,6 @@ inline Register mul(const Register a, const Register b) return _mm512_mul_pd(a, b); } - inline Register fma(const Register a, const Register b, const Register c) { #if defined(VISP_HAVE_FMA) @@ -127,7 +125,6 @@ inline Register loadu(const double *const data) return _mm512_loadu_pd(data); } - inline Register set1(double v) { return _mm512_set1_pd(v); @@ -157,7 +154,6 @@ inline Register mul(const Register a, const Register b) return _mm256_mul_pd(a, b); } - inline Register fma(const Register a, const Register b, const Register c) { #if defined(VISP_HAVE_FMA) @@ -172,7 +168,6 @@ inline Register loadu(const double *const data) return _mm256_loadu_pd(data); } - inline Register set1(double v) { return _mm256_set1_pd(v); @@ -202,7 +197,6 @@ inline Register mul(const Register a, const Register b) return _mm_mul_pd(a, b); } - inline Register fma(const Register a, const Register b, const Register c) { #if defined(VISP_HAVE_FMA) diff --git a/modules/core/src/math/robust/vpRobust.cpp b/modules/core/src/math/robust/vpRobust.cpp index 56be0a0c59..08b0c8da09 100644 --- a/modules/core/src/math/robust/vpRobust.cpp +++ b/modules/core/src/math/robust/vpRobust.cpp @@ -114,7 +114,6 @@ void vpRobust::resize(unsigned int n_data) } } - // =================================================================== /*! @@ -538,7 +537,7 @@ double vpRobust::constrainedChi(vpRobustEstimatorType method, double x) default: { throw(vpException(vpException::fatalError, "Unsupported robust estimator type in vpRobust::constrainedChi()")); } -} + } return -1; } diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 6a8022732c..0154c26e35 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -67,7 +67,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool }; - double result[4]; for (unsigned int i = 0; i < input.getRows(); ++i) { @@ -90,7 +89,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool __m128d sum1 = _mm_hadd_pd(sum2, sum2); // [total, total] double total = _mm_cvtsd_f64(sum1); - outputData[0] = result[0] + result[2]; outputData[1] = result[1] + result[3]; outputData[2] = total; @@ -124,7 +122,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool } inputData += 3; outputData += 3; - } + } #else double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; @@ -137,7 +135,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool inputData += 3; } #endif -} + } else { if (input.getRows() != 3) { throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); @@ -178,7 +176,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool dp3 = vpSIMD::fma(z4, elems[10], dp3); dp3 = add(elems[11], dp3); - storeu(outputX, dp1); storeu(outputY, dp2); storeu(outputZ, dp3); @@ -200,7 +197,6 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool ++inputX; ++inputY; ++inputZ; ++outputX; ++outputY; ++outputZ; } - #else double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; @@ -215,15 +211,14 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool ++outputX; ++outputY; ++outputZ; } #endif - } -} + } -/*! - Construct an homogeneous matrix from a translation vector and quaternion - rotation vector. - */ + /*! + Construct an homogeneous matrix from a translation vector and quaternion + rotation vector. + */ vpHomogeneousMatrix::vpHomogeneousMatrix(const vpTranslationVector &t, const vpQuaternionVector &q) : vpArray2D(constr_value_4, constr_value_4) { @@ -1578,8 +1573,8 @@ void vpHomogeneousMatrix::parse_json(const nlohmann::json &j) const bool converted = convertFromTypeAndBuildFrom(j, *this); if (!converted) { from_json(j, *asArray); - } } +} else { // Generic 2D array conversion from_json(j, *asArray); } diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index 8808039ed0..a2cbce8c26 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -1,6 +1,6 @@ /* * ViSP, open source Visual Servoing Platform software. - * Copyright (C) 2005 - 2024 by Inria. All rights reserved. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. * * This software is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -53,7 +53,6 @@ BEGIN_VISP_NAMESPACE - void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const { output.resize(input.getRows(), input.getCols(), false, false); @@ -92,7 +91,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo __m128d sum1 = _mm_hadd_pd(sum2, sum2); // [total, total] double total = _mm_cvtsd_f64(sum1); - outputData[0] = result[0] + result[2]; outputData[1] = result[1] + result[3]; outputData[2] = total; @@ -102,8 +100,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo } #elif VISP_HAVE_SSE2 - - __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]); __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]); __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]); @@ -113,7 +109,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo double c1 = rowPtrs[1][2]; double c2 = rowPtrs[2][2]; - for (unsigned i = 0; i < input.getRows(); ++i) { const __m128d xy = _mm_loadu_pd(inputData); @@ -138,7 +133,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo inputData += 3; outputData += 3; } - #else double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; @@ -153,9 +147,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo } #endif } - - - else { // 3xN matrix using namespace vpSIMD; if (input.getRows() != 3) { @@ -168,7 +159,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo double *outputY = output[1]; double *outputZ = output[2]; - #if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2) Register elems[9]; @@ -190,16 +180,12 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo dp3 = vpSIMD::fma(y4, elems[7], dp3); dp3 = vpSIMD::fma(z4, elems[8], dp3); - - storeu(outputX, dp1); storeu(outputY, dp2); storeu(outputZ, dp3); - inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes; outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes; - } double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; @@ -214,7 +200,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo ++inputX; ++inputY; ++inputZ; ++outputX; ++outputY; ++outputZ; } - #else double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; @@ -230,12 +215,8 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo } #endif } - - - } - const unsigned int vpRotationMatrix::constr_val_3 = 3; /*! Initialize the rotation matrix as identity. diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index 47aef0bb9f..098d859845 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -1,6 +1,6 @@ /* * ViSP, open source Visual Servoing Platform software. - * Copyright (C) 2005 - 2024 by Inria. All rights reserved. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. * * This software is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -247,7 +247,6 @@ TEST_CASE("Point projection", "project") vpMatrix outputT(N, 3); vpMatrix output(3, N); - double t1 = vpTime::measureTimeMs(); M.project(inputT, outputT, true); double t2 = vpTime::measureTimeMs(); @@ -277,7 +276,6 @@ TEST_CASE("Point projection", "project") double t2r = vpTime::measureTimeMs(); timeNaive.push_back(t2r - t1r); - vpMatrix input4(4, N); for (unsigned int i = 0; i < inputT.getRows(); ++i) { input4[0][i] = inputT[i][0]; @@ -317,7 +315,6 @@ TEST_CASE("Point projection", "project") std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl; } - } int main(int argc, char *argv[]) diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 00e396ee48..9deaec4dc9 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -559,8 +559,6 @@ TEST_CASE("Default constructor", "[rotation matrix]") #endif } - - TEST_CASE("Vector rotation", "project") { std::vector NS = { 1, 10, 100, 1000, 10000 }; @@ -627,16 +625,12 @@ TEST_CASE("Vector rotation", "project") std::cerr << "Optimized 3xN version failed" << std::endl; FAIL(); } - - - } std::cout << "Optimized (transposed) version took: " << vpMath::getMean(timeProjectTransposed) << " +- " << vpMath::getStdev(timeProjectTransposed) << "ms" << std::endl; std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +- " << vpMath::getStdev(timeProject) << "ms" << std::endl; std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +- " << vpMath::getStdev(timeMult) << "ms" << std::endl; std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +- " << vpMath::getStdev(timeNaive) << "ms" << std::endl; std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl; - } } diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 81af464e3b..ae9c0aa6e6 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -153,7 +153,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame } pointsPerThread[threadIdx] = std::move(localPoints); - } + } for (const std::vector &points: pointsPerThread) { m_depthPoints.insert(m_depthPoints.end(), std::make_move_iterator(points.begin()), std::make_move_iterator(points.end())); } @@ -171,7 +171,7 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame else { m_numFeatures = 0; } - } +} void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) { @@ -244,11 +244,9 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con } } - #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt) { - using namespace vpSIMD; #if defined(VISP_HAVE_OPENMP) @@ -281,8 +279,6 @@ void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, co } #endif - - void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, const vpImage &depth, vpColVector &e, vpMatrix &Lt) From 3f3bac2c6478ec50e3e714ed5fa5c310db79705c Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 10:25:01 +0100 Subject: [PATCH 16/24] Fix warning variable set but not used modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp:200:10: warning: variable 't1' set but not used [-Wunused-but-set-variable] 200 | double t1 = vpTime::measureTimeMs(); | ^ --- modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index ae9c0aa6e6..2772c9adc4 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -197,11 +197,9 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, return; } - double t1 = vpTime::measureTimeMs(); m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt); m_Lt.transpose(m_L); - t1 = vpTime::measureTimeMs(); m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); From adc26ab8f97480192076f253ffbe29993a58e533 Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 10:39:44 +0100 Subject: [PATCH 17/24] Remove vpSIMD namespace from doxygen doc --- modules/core/include/visp3/core/vpSIMDUtils.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/include/visp3/core/vpSIMDUtils.h index a54f3c5550..7d81d01a77 100644 --- a/modules/core/include/visp3/core/vpSIMDUtils.h +++ b/modules/core/include/visp3/core/vpSIMDUtils.h @@ -40,6 +40,8 @@ #define VP_SIMD_UTILS_H #include +#ifndef DOXYGEN_SHOULD_SKIP_THIS + #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) #include #include @@ -225,4 +227,5 @@ inline void storeu(double *data, const Register a) } +#endif // DOXYGEN_SHOULD_SKIP_THIS #endif // VP_SIMD_UTILS_H From c6ff19a448a741b03bef1f998fda4b86e252add0 Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 11:57:17 +0100 Subject: [PATCH 18/24] Fix bug when input vector is not transposed and AVX or SSE2 not available Document vpRotationMatrix::rotateVectors() method --- .../core/src/math/transformation/vpRotationMatrix.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index a2cbce8c26..5f665d6ca5 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -53,6 +53,13 @@ BEGIN_VISP_NAMESPACE +/*! + * Rotate a set of vectors by this rotation matrix. + * \param input : Input matrix containing vectors to rotate. By default, each column is a 3D vector with 3 rows. + * When `isTransposed` is true, each row is a 3D vector with 3 columns. + * \param output : Output matrix containing rotated vectors. Has the same layout and size as the input matrix. + * \param isTransposed : If true, input matrix is transposed (each row is a 3D vector). +**/ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const { output.resize(input.getRows(), input.getCols(), false, false); @@ -205,7 +212,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo double *r1 = rowPtrs[1]; double *r2 = rowPtrs[2]; - for (unsigned int i = 0; i < input.getRows(); ++i) { + for (unsigned int i = 0; i < input.getCols(); ++i) { double X = *inputX, Y = *inputY, Z = *inputZ; *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z; *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z; From 51e89a34781652bdf7c45b95df3f4c5a0cd457eb Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 12:07:16 +0100 Subject: [PATCH 19/24] Fix bug when input vector is not transposed and AVX or SSE3 not available Document vpHomogeneousMatrix::project() method --- .../transformation/vpHomogeneousMatrix.cpp | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 0154c26e35..9e295b25b1 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -48,6 +48,14 @@ BEGIN_VISP_NAMESPACE const unsigned int vpHomogeneousMatrix::constr_value_4 = 4; +/** + * Project a set of 3D points using the homogeneous matrix. + * The input points can be stored either by row or by column. + * \param input : Input matrix containing 3D points to project. By default, each column is a 3D point with 3 rows. + * When `transposed` is true, each row is a 3D point with 3 columns. + * \param output : Output matrix containing projected points. Has the same layout and size as the input matrix. + * \param transposed : If true, input matrix is transposed (each row is a 3D point). + */ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool transposed) const { output.resize(input.getRows(), input.getCols(), false, false); @@ -135,7 +143,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool inputData += 3; } #endif - } + } else { if (input.getRows() != 3) { throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); @@ -202,7 +210,7 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool double *r1 = rowPtrs[1]; double *r2 = rowPtrs[2]; - for (unsigned int i = 0; i < input.getRows(); ++i) { + for (unsigned int i = 0; i < input.getCols(); ++i) { double X = *inputX, Y = *inputY, Z = *inputZ; *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3]; *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3]; @@ -212,13 +220,13 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool } #endif } - } +} - /*! - Construct an homogeneous matrix from a translation vector and quaternion - rotation vector. - */ +/*! + Construct an homogeneous matrix from a translation vector and quaternion + rotation vector. + */ vpHomogeneousMatrix::vpHomogeneousMatrix(const vpTranslationVector &t, const vpQuaternionVector &q) : vpArray2D(constr_value_4, constr_value_4) { @@ -1573,8 +1581,8 @@ void vpHomogeneousMatrix::parse_json(const nlohmann::json &j) const bool converted = convertFromTypeAndBuildFrom(j, *this); if (!converted) { from_json(j, *asArray); + } } -} else { // Generic 2D array conversion from_json(j, *asArray); } From 09fc8cd7fdafff70359b5e3b248d64602846e7cb Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 12:08:07 +0100 Subject: [PATCH 20/24] Cleanup tests to help debugging --- .../core/test/math/catchHomogeneousMatrix.cpp | 25 +++++--- modules/core/test/math/catchRotation.cpp | 58 ++++++++++++------- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index 098d859845..de2f1d8182 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -63,6 +63,7 @@ bool test_matrix_equal(const vpHomogeneousMatrix &M1, const vpHomogeneousMatrix TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneousMatrix]") { + std::cout << "\n== Test vpHomogeneousMatrix re-orthogonalize rotation matrix ==" << std::endl; CHECK_NOTHROW([]() { vpHomogeneousMatrix M { 0.9835, -0.0581, 0.1716, 0.0072, -0.0489, -0.9972, -0.0571, 0.0352, 0.1744, 0.0478, -0.9835, 0.9470 }; @@ -122,6 +123,7 @@ TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneou TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatrix]") { + std::cout << "\n== Test vpRotationMatrix re-orthogonalize rotation matrix ==" << std::endl; CHECK_NOTHROW( []() { vpRotationMatrix R { 0.9835, -0.0581, 0.1716, -0.0489, -0.9972, -0.0571, 0.1744, 0.0478, -0.9835 }; }()); @@ -165,6 +167,7 @@ TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatri TEST_CASE("ENU to NED conversion", "[enu2ned]") { + std::cout << "\n== Test ENU to NED conversion ==" << std::endl; vpHomogeneousMatrix enu_M_flu { 0, -1, 0, 0.2, 1, 0, 0, 1., 0, 0, 1, 0.3 }; std::cout << "enu_M_flu:\n" << enu_M_flu << std::endl; @@ -206,6 +209,7 @@ TEST_CASE("ENU to NED conversion", "[enu2ned]") TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]") { + std::cout << "\n== Test vpHomogenousMatrix * vpRotationMatrix ==" << std::endl; // Test rotation_matrix * homogeneous_matrix vpHomogeneousMatrix _1_M_2_ { 0.9835, -0.0581, 0.1716, 0.0072, @@ -228,7 +232,12 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]") TEST_CASE("Point projection", "project") { + std::cout << "\n== Test point projection ==" << std::endl; +#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpHomogeneousMatrix::project() std::vector NS = { 1, 10, 100, 1000, 10000 }; +#else + std::vector NS = { 1, 10 }; +#endif for (unsigned int N : NS) { std::cout << "Running for N = " << N << std::endl; std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; @@ -286,7 +295,7 @@ TEST_CASE("Point projection", "project") vpMatrix output4(4, N); double t14 = vpTime::measureTimeMs(); - vpMatrix::mult2Matrices((vpMatrix)M, input4, output4); + vpMatrix::mult2Matrices(static_cast(M), input4, output4); double t24 = vpTime::measureTimeMs(); timeMult.push_back(t24 - t14); @@ -295,14 +304,12 @@ TEST_CASE("Point projection", "project") if (errorT > 1e-10 || error > 1e-10) { std::cout << "M = " << M << std::endl; - std::cerr << "Naive outputT = " << std::endl << outputR << std::endl; - std::cerr << "Mult outputT = " << std::endl << output4.t() << std::endl; - std::cerr << "Project output transposed version = " << std::endl << outputT << std::endl; - std::cerr << "Project output = " << std::endl << outputT<< std::endl; - - - std::cerr << "diff transposed = " << std::endl << outputR - outputT; - std::cerr << "diff = " << std::endl << outputR - output.t(); + std::cerr << "Naive outputT = " << outputR << std::endl; + std::cerr << "Mult outputT = " << output4.t() << std::endl; + std::cerr << "Project output transposed version = " << outputT << std::endl; + std::cerr << "Project output = " << outputT << std::endl; + std::cerr << "Diff transposed = " << outputR - outputT << std::endl; + std::cerr << "Diff = " << outputR - output.t() << std::endl; FAIL(); } diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 9deaec4dc9..874e78af57 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -134,6 +134,7 @@ TEST_CASE("Common rotation operations", "[rotation]") { SECTION("Theta u initialization") { + std::cout << "\n== Test theta u initialization ==" << std::endl; vpThetaUVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -169,6 +170,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Rxyz initialization") { + std::cout << "\n== Test Rxyz initialization ==" << std::endl; vpRxyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -203,6 +205,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("rzyx initialization") { + std::cout << "\n== Test rzyx initialization ==" << std::endl; vpRzyxVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -237,6 +240,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("rzyz initialiation") { + std::cout << "\n== Test rzyz initialization ==" << std::endl; vpRzyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -271,6 +275,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Test quaternion initialization", "[quaternion]") { + std::cout << "\n== Test quaternion initialization ==" << std::endl; vpQuaternionVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(4, vpMath::rad(10)); vpColVector bench3(4, vpMath::rad(10)); @@ -305,41 +310,41 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Conversions") { + std::cout << "\n== Test conversions ==" << std::endl; vpRotationMatrix R; for (int i = -10; i < 10; i++) { for (int j = -10; j < 10; j++) { vpThetaUVector tu(vpMath::rad(90 + i), vpMath::rad(170 + j), vpMath::rad(45)); tu.buildFrom(vpRotationMatrix(tu)); // put some coherence into rotation convention - std::cout << "Initialization " << std::endl; + std::cout << "\n** Initialization with theta_u: " << tu.t() << std::endl; double theta; vpColVector u; tu.extract(theta, u); - std::cout << "theta=" << vpMath::deg(theta) << std::endl; - std::cout << "u=" << u << std::endl; + std::cout << "Extracted theta: " << vpMath::deg(theta) << " deg" << std::endl; + std::cout << "Extracted u: " << u.t() << std::endl; std::cout << "From vpThetaUVector to vpRotationMatrix " << std::endl; R.buildFrom(tu); - std::cout << "Matrix R"; + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); - std::cout << R << std::endl; - std::cout << "From vpRotationMatrix to vpQuaternionVector " << std::endl; vpQuaternionVector q(R); + std::cout << "Quaternion: " << q.t() << std::endl; CHECK(q.magnitude() == Catch::Approx(1.0).margin(1e-4)); - std::cout << q << std::endl; + std::cout << "From vpQuaternionVector to vpRotationMatrix " << std::endl; R.buildFrom(q); + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); - std::cout << "From vpQuaternionVector to vpRotationMatrix " << std::endl; std::cout << "From vpRotationMatrix to vpRxyzVector " << std::endl; vpRxyzVector RxyzbuildR(R); - std::cout << RxyzbuildR << std::endl; + std::cout << "Rxyz: " << RxyzbuildR.t() << std::endl; std::cout << "From vpRxyzVector to vpThetaUVector " << std::endl; std::cout << " use From vpRxyzVector to vpRotationMatrix " << std::endl; @@ -348,15 +353,14 @@ TEST_CASE("Common rotation operations", "[rotation]") vpThetaUVector tubuildEu; tubuildEu.buildFrom(R); - std::cout << std::endl; - std::cout << "result : should equivalent to the first one " << std::endl; + std::cout << "Result : should equivalent to the first one " << std::endl; double theta2; vpColVector u2; tubuildEu.extract(theta2, u2); - std::cout << "theta=" << vpMath::deg(theta2) << std::endl; - std::cout << "u=" << u2 << std::endl; + std::cout << "Extracted theta: " << vpMath::deg(theta2) << " deg" << std::endl; + std::cout << "Extracted u: " << u2.t() << std::endl; CHECK(vpMath::abs(theta2 - theta) < std::numeric_limits::epsilon() * 1e10); CHECK(vpMath::abs(u[0] - u2[0]) < std::numeric_limits::epsilon() * 1e10); @@ -366,35 +370,39 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Conversion from and to rzyz vector") { + std::cout << "\n== Test conversion from and to rzyz vector ==" << std::endl; vpRzyzVector rzyz(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45)); - std::cout << "Initialization vpRzyzVector " << std::endl; - std::cout << rzyz << std::endl; + std::cout << "\n** Initialization with rzyz: " << rzyz.t() << std::endl; std::cout << "From vpRzyzVector to vpRotationMatrix " << std::endl; + R.buildFrom(rzyz); + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); std::cout << "From vpRotationMatrix to vpRzyzVector " << std::endl; vpRzyzVector rzyz_final; rzyz_final.buildFrom(R); + std::cout << "rzyz: " << rzyz_final.t() << std::endl; CHECK(test("rzyz", rzyz_final, vpColVector(rzyz))); - std::cout << rzyz_final << std::endl; } SECTION("Conversion from and to rzyx vector") { + std::cout << "\n== Test conversion from and to rzyx vector ==" << std::endl; vpRzyxVector rzyx(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45)); - std::cout << "Initialization vpRzyxVector " << std::endl; - std::cout << rzyx << std::endl; + std::cout << "\n** Initialization with rzyx: " << rzyx.t() << std::endl; std::cout << "From vpRzyxVector to vpRotationMatrix " << std::endl; R.buildFrom(rzyx); + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); - std::cout << R << std::endl; std::cout << "From vpRotationMatrix to vpRzyxVector " << std::endl; vpRzyxVector rzyx_final; rzyx_final.buildFrom(R); + std::cout << "rzyx: " << rzyx_final.t() << std::endl; bool ret = test("rzyx", rzyx_final, vpColVector(rzyx)); if (ret == false) { // Euler angle representation is not unique std::cout << "Rzyx vector differ. Test rotation matrix..." << std::endl; vpRotationMatrix RR(rzyx_final); + std::cout << "Matrix R: \n" << RR << std::endl; if (R == RR) { std::cout << "Rzyx vector differ but rotation matrix is valid" << std::endl; ret = true; @@ -406,6 +414,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Rotation matrix extraction from homogeneous matrix and multiplication") { + std::cout << "\n== Test rotation matrix extraction from homogeneous matrix and multiplication ==" << std::endl; // Test rotation_matrix * homogeneous_matrix vpHomogeneousMatrix _1_M_2_truth; _1_M_2_truth[0][0] = 0.9835; @@ -443,6 +452,7 @@ TEST_CASE("Common rotation operations", "[rotation]") TEST_CASE("Theta u multiplication", "[theta.u]") { + std::cout << "\n== Test theta u multiplication ==" << std::endl; const int nTrials = 100; const uint64_t seed = 0x123456789; vpUniRand rng(seed); @@ -469,6 +479,7 @@ TEST_CASE("Theta u multiplication", "[theta.u]") TEST_CASE("Quaternion multiplication", "[quaternion]") { + std::cout << "\n== Test quaternion multiplication ==" << std::endl; const int nTrials = 100; const uint64_t seed = 0x123456789; vpUniRand rng(seed); @@ -496,6 +507,7 @@ TEST_CASE("Quaternion multiplication", "[quaternion]") TEST_CASE("Default constructor", "[rotation matrix]") { + std::cout << "\n== Test default constructor ==" << std::endl; vpRotationMatrix I; SECTION("rotation matrix") { @@ -561,9 +573,14 @@ TEST_CASE("Default constructor", "[rotation matrix]") TEST_CASE("Vector rotation", "project") { + std::cout << "\n== Test rotation of vectors ==" << std::endl; +#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors() std::vector NS = { 1, 10, 100, 1000, 10000 }; +#else + std::vector NS = { 1, 10 }; +#endif for (unsigned int N : NS) { - std::cout << "Running for N = " << N << std::endl; + std::cout << "** Running for N = " << N << std::endl; std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; vpUniRand r(42); for (unsigned int trial = 0; trial < 100000; ++trial) { @@ -587,7 +604,6 @@ TEST_CASE("Vector rotation", "project") t2 = vpTime::measureTimeMs(); timeProject.push_back(t2 - t1); - vpColVector x(3, 1); vpColVector res(3); vpMatrix outputR(N, 3); From f9f59e2ff48c2de2c58e05fce0b663337be24320 Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Sun, 7 Dec 2025 15:33:38 +0100 Subject: [PATCH 21/24] Fix _mm_hadd_pd() usage that requires SSE3 on pixi windows CI modules\core\src\math\transformation\vpRotationMatrix.cpp(128,21): error: always_inline function '_mm_hadd_pd' requires target feature 'sse3', but would be inlined into function 'rotateVectors' that is compiled without support for 'sse3' 128 | __m128d r01 = _mm_hadd_pd(mul0, mul1); | ^ --- modules/core/src/math/transformation/vpRotationMatrix.cpp | 2 +- modules/core/test/math/catchRotation.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index 5f665d6ca5..a497f3c5c1 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -106,7 +106,7 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo outputData += 3; } -#elif VISP_HAVE_SSE2 +#elif VISP_HAVE_SSE3 __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]); __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]); __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]); diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 874e78af57..820bca14fb 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -574,7 +574,7 @@ TEST_CASE("Default constructor", "[rotation matrix]") TEST_CASE("Vector rotation", "project") { std::cout << "\n== Test rotation of vectors ==" << std::endl; -#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors() +#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors() std::vector NS = { 1, 10, 100, 1000, 10000 }; #else std::vector NS = { 1, 10 }; From 7502eace1c1f3afd869ef610677f88173486b32e Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Fri, 12 Dec 2025 09:23:41 +0100 Subject: [PATCH 22/24] Make vpSIMDUtils.h private to not expose SIMD code to the user For the moment this file is duplicated in core and rbt modules. We should find a solution to avoid code duplication. --- .../math/matrix/private}/vpSIMDUtils.h | 0 .../transformation/vpHomogeneousMatrix.cpp | 3 +- .../math/transformation/vpRotationMatrix.cpp | 2 +- .../rbt/src/core/private/vpSIMDUtils.h | 231 ++++++++++++++++++ .../src/features/vpRBDenseDepthTracker.cpp | 3 +- 5 files changed, 236 insertions(+), 3 deletions(-) rename modules/core/{include/visp3/core => src/math/matrix/private}/vpSIMDUtils.h (100%) create mode 100644 modules/tracker/rbt/src/core/private/vpSIMDUtils.h diff --git a/modules/core/include/visp3/core/vpSIMDUtils.h b/modules/core/src/math/matrix/private/vpSIMDUtils.h similarity index 100% rename from modules/core/include/visp3/core/vpSIMDUtils.h rename to modules/core/src/math/matrix/private/vpSIMDUtils.h diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 9e295b25b1..9b864926af 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -42,7 +42,8 @@ #include #include #include -#include + +#include "../matrix/private/vpSIMDUtils.h" BEGIN_VISP_NAMESPACE diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index a497f3c5c1..d16dd86205 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -40,7 +40,7 @@ #include #include -#include +#include "../matrix/private/vpSIMDUtils.h" // Rotation classes #include diff --git a/modules/tracker/rbt/src/core/private/vpSIMDUtils.h b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h new file mode 100644 index 0000000000..7d81d01a77 --- /dev/null +++ b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h @@ -0,0 +1,231 @@ +/* + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See https://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * SIMD utilities. + */ + +/*! + \file vpSIMDUtils.h + \brief Header that defines and includes useful SIMD routines and macros +*/ + +#ifndef VP_SIMD_UTILS_H +#define VP_SIMD_UTILS_H +#include + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#include +#include +#include + +#define VISP_HAVE_SSE2 1 +#endif + +#if defined __AVX2__ +#define VISP_HAVE_AVX2 1 +#endif + +#if defined __AVX__ +#define VISP_HAVE_AVX 1 +#endif + +// https://stackoverflow.com/a/40765925 +#if !defined(__FMA__) && defined(__AVX2__) +#define __FMA__ 1 +#endif + + +#if defined(__FMA__) +#define VISP_HAVE_FMA +#endif + +#if defined _WIN32 && defined(_M_ARM64) +#define _ARM64_DISTINCT_NEON_TYPES +#include +#include +#define VISP_HAVE_NEON 1 +#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) +#include +#define VISP_HAVE_NEON 1 +#else +#define VISP_HAVE_NEON 0 +#endif + +#if VISP_HAVE_SSE2 && USE_SIMD_CODE +#define USE_SSE 1 +#else +#define USE_SSE 0 +#endif + +#if VISP_HAVE_NEON && USE_SIMD_CODE +#define USE_NEON 1 +#else +#define USE_NEON 0 +#endif + +namespace vpSIMD +{ +#if defined(VISP_HAVE_AVX2) +using Register = __m512d; + +inline constexpr int numLanes = 8; +inline const Register add(const Register a, const Register b) +{ + return _mm512_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm512_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm512_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm512_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm512_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm512_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm512_storeu_pd(data, a); +} + +#elif defined(VISP_HAVE_AVX) +using Register = __m256d; +inline const int numLanes = 4; + +inline Register add(const Register a, const Register b) +{ + return _mm256_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm256_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm256_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm256_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm256_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm256_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm256_storeu_pd(data, a); +} + +#elif VISP_HAVE_SSE2 +using Register = __m128d; +inline const int numLanes = 2; + +inline Register add(const Register a, const Register b) +{ + return _mm_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm_storeu_pd(data, a); +} + +#endif + +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // VP_SIMD_UTILS_H diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 2772c9adc4..31abf019dd 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -30,9 +30,10 @@ #include #include -#include #include +#include "../core/private/vpSIMDUtils.h" + #ifdef VISP_HAVE_OPENMP #include #endif From 1f6079784835618ab05ac279b2a40778af3c3db5 Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Fri, 12 Dec 2025 10:48:04 +0100 Subject: [PATCH 23/24] Remove to make code more explicit --- .../transformation/vpHomogeneousMatrix.cpp | 39 +++++++++---------- .../math/transformation/vpRotationMatrix.cpp | 23 ++++++----- .../src/features/vpRBDenseDepthTracker.cpp | 30 +++++++------- 3 files changed, 43 insertions(+), 49 deletions(-) diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 9b864926af..175bf5ceaa 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -157,47 +157,44 @@ void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool double *outputZ = output[2]; #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) - - using namespace vpSIMD; - - Register elems[12]; + vpSIMD::Register elems[12]; for (unsigned int i = 0; i < 12; ++i) { - elems[i] = set1(data[i]); + elems[i] = vpSIMD::set1(data[i]); } - for (int i = 0; i <= static_cast(input.getCols()) - numLanes; i += numLanes) { - const Register x4 = loadu(inputX); - const Register y4 = loadu(inputY); - const Register z4 = loadu(inputZ); + for (int i = 0; i <= static_cast(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { + const vpSIMD::Register x4 = vpSIMD::loadu(inputX); + const vpSIMD::Register y4 = vpSIMD::loadu(inputY); + const vpSIMD::Register z4 = vpSIMD::loadu(inputZ); - Register dp1 = mul(x4, elems[0]); + vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]); dp1 = vpSIMD::fma(y4, elems[1], dp1); dp1 = vpSIMD::fma(z4, elems[2], dp1); - dp1 = add(elems[3], dp1); + dp1 = vpSIMD::add(elems[3], dp1); - Register dp2 = mul(x4, elems[4]); + vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[4]); dp2 = vpSIMD::fma(y4, elems[5], dp2); dp2 = vpSIMD::fma(z4, elems[6], dp2); - dp2 = add(elems[7], dp2); + dp2 = vpSIMD::add(elems[7], dp2); - Register dp3 = mul(x4, elems[8]); + vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[8]); dp3 = vpSIMD::fma(y4, elems[9], dp3); dp3 = vpSIMD::fma(z4, elems[10], dp3); - dp3 = add(elems[11], dp3); + dp3 = vpSIMD::add(elems[11], dp3); - storeu(outputX, dp1); - storeu(outputY, dp2); - storeu(outputZ, dp3); + vpSIMD::storeu(outputX, dp1); + vpSIMD::storeu(outputY, dp2); + vpSIMD::storeu(outputZ, dp3); - inputX += numLanes; inputY += numLanes; inputZ += numLanes; - outputX += numLanes; outputY += numLanes; outputZ += numLanes; + inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes; + outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes; } double *r0 = rowPtrs[0]; double *r1 = rowPtrs[1]; double *r2 = rowPtrs[2]; - for (unsigned int i = (input.getCols() / numLanes) * numLanes; i < input.getCols(); ++i) { + for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) { // std::cout << "i = " << i << std::endl; double X = *inputX, Y = *inputY, Z = *inputZ; *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3]; diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index d16dd86205..bd6337a70f 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -155,7 +155,6 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo #endif } else { // 3xN matrix - using namespace vpSIMD; if (input.getRows() != 3) { throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); } @@ -168,28 +167,28 @@ void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bo #if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2) - Register elems[9]; + vpSIMD::Register elems[9]; for (unsigned int i = 0; i < 9; ++i) { - elems[i] = set1(data[i]); + elems[i] = vpSIMD::set1(data[i]); } for (int i = 0; i <= static_cast(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { - const Register x4 = loadu(inputX); - const Register y4 = loadu(inputY); - const Register z4 = loadu(inputZ); + const vpSIMD::Register x4 = vpSIMD::loadu(inputX); + const vpSIMD::Register y4 = vpSIMD::loadu(inputY); + const vpSIMD::Register z4 = vpSIMD::loadu(inputZ); - Register dp1 = mul(x4, elems[0]); + vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]); dp1 = vpSIMD::fma(y4, elems[1], dp1); dp1 = vpSIMD::fma(z4, elems[2], dp1); - Register dp2 = mul(x4, elems[3]); + vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[3]); dp2 = vpSIMD::fma(y4, elems[4], dp2); dp2 = vpSIMD::fma(z4, elems[5], dp2); - Register dp3 = mul(x4, elems[6]); + vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[6]); dp3 = vpSIMD::fma(y4, elems[7], dp3); dp3 = vpSIMD::fma(z4, elems[8], dp3); - storeu(outputX, dp1); - storeu(outputY, dp2); - storeu(outputZ, dp3); + vpSIMD::storeu(outputX, dp1); + vpSIMD::storeu(outputY, dp2); + vpSIMD::storeu(outputZ, dp3); inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes; outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes; diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 31abf019dd..384a3d6119 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -246,35 +246,33 @@ void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, con #if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt) { - using namespace vpSIMD; - #if defined(VISP_HAVE_OPENMP) #pragma omp parallel for schedule(static, 1024) #endif - for (int i = 0; i <= static_cast(cXt.getCols()) - numLanes; i += numLanes) { - const Register X = loadu(cXt[0] + i), Y = loadu(cXt[1] + i), Z = loadu(cXt[2] + i); - const Register nX = loadu(cNt[0] + i), nY = loadu(cNt[1] + i), nZ = loadu(cNt[2] + i); - const Register obsX = loadu(obsT[0] + i), obsY = loadu(obsT[1] + i), obsZ = loadu(obsT[2] + i); + for (int i = 0; i <= static_cast(cXt.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { + const vpSIMD::Register X = vpSIMD::loadu(cXt[0] + i), Y = vpSIMD::loadu(cXt[1] + i), Z = vpSIMD::loadu(cXt[2] + i); + const vpSIMD::Register nX = vpSIMD::loadu(cNt[0] + i), nY = vpSIMD::loadu(cNt[1] + i), nZ = vpSIMD::loadu(cNt[2] + i); + const vpSIMD::Register obsX = vpSIMD::loadu(obsT[0] + i), obsY = vpSIMD::loadu(obsT[1] + i), obsZ = vpSIMD::loadu(obsT[2] + i); - Register D = mul(nX, X); - Register projNormal = mul(obsX, X); + vpSIMD::Register D = vpSIMD::mul(nX, X); + vpSIMD::Register projNormal = vpSIMD::mul(obsX, X); D = vpSIMD::fma(nY, Y, D); D = vpSIMD::fma(nZ, Z, D); projNormal = vpSIMD::fma(nY, obsY, projNormal); projNormal = vpSIMD::fma(nZ, obsZ, projNormal); - storeu(e.data + i, sub(projNormal, D)); + vpSIMD::storeu(e.data + i, vpSIMD::sub(projNormal, D)); - storeu(Lt[0] + i, nX); - storeu(Lt[1] + i, nY); - storeu(Lt[2] + i, nZ); + vpSIMD::storeu(Lt[0] + i, nX); + vpSIMD::storeu(Lt[1] + i, nY); + vpSIMD::storeu(Lt[2] + i, nZ); - storeu(Lt[3] + i, sub(mul(nZ, Y), mul(nY, Z))); - storeu(Lt[4] + i, sub(mul(nX, Z), mul(nZ, X))); - storeu(Lt[5] + i, sub(mul(nY, X), mul(nX, Y))); + vpSIMD::storeu(Lt[3] + i, vpSIMD::sub(vpSIMD::mul(nZ, Y), vpSIMD::mul(nY, Z))); + vpSIMD::storeu(Lt[4] + i, vpSIMD::sub(vpSIMD::mul(nX, Z), vpSIMD::mul(nZ, X))); + vpSIMD::storeu(Lt[5] + i, vpSIMD::sub(vpSIMD::mul(nY, X), vpSIMD::mul(nX, Y))); } - errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / numLanes) * numLanes, cXt.getCols()); + errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes, cXt.getCols()); } #endif From 671b8839afec140d25077d2a0aaccddb8e70c69a Mon Sep 17 00:00:00 2001 From: Fabien Spindler Date: Fri, 12 Dec 2025 11:29:03 +0100 Subject: [PATCH 24/24] Make test independent from SIMD instruction set --- .../core/test/math/catchHomogeneousMatrix.cpp | 31 +++++++++---------- modules/core/test/math/catchRotation.cpp | 29 ++++++++--------- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index de2f1d8182..a6e11fcb0c 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -233,28 +233,25 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]") TEST_CASE("Point projection", "project") { std::cout << "\n== Test point projection ==" << std::endl; -#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpHomogeneousMatrix::project() - std::vector NS = { 1, 10, 100, 1000, 10000 }; -#else - std::vector NS = { 1, 10 }; -#endif - for (unsigned int N : NS) { - std::cout << "Running for N = " << N << std::endl; + std::map map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} }; + + for (const auto &[vecsize, trials] : map_vecsize_trials) { + std::cout << "** Running for vector size = " << vecsize << std::endl; std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; vpUniRand r(42); - for (unsigned int trial = 0; trial < 100000; ++trial) { + for (unsigned int trial = 0; trial < trials; ++trial) { vpHomogeneousMatrix M( r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI)); - vpMatrix inputT(N, 3); - for (unsigned int i = 0; i< N; ++i) { + vpMatrix inputT(vecsize, 3); + for (unsigned int i = 0; i< vecsize; ++i) { inputT[i][0] = r.uniform(0.0, 1.0); inputT[i][1] = r.uniform(0.0, 1.0); inputT[i][2] = r.uniform(0.0, 1.0); } vpMatrix input = inputT.t(); - vpMatrix outputT(N, 3); - vpMatrix output(3, N); + vpMatrix outputT(vecsize, 3); + vpMatrix output(3, vecsize); double t1 = vpTime::measureTimeMs(); M.project(inputT, outputT, true); @@ -269,7 +266,7 @@ TEST_CASE("Point projection", "project") vpColVector x(4, 1); vpColVector res(4); - vpMatrix outputR(N, 3); + vpMatrix outputR(vecsize, 3); double t1r = vpTime::measureTimeMs(); for (unsigned int i = 0; i < inputT.getRows(); ++i) { @@ -285,22 +282,22 @@ TEST_CASE("Point projection", "project") double t2r = vpTime::measureTimeMs(); timeNaive.push_back(t2r - t1r); - vpMatrix input4(4, N); + vpMatrix input4(4, vecsize); for (unsigned int i = 0; i < inputT.getRows(); ++i) { input4[0][i] = inputT[i][0]; input4[1][i] = inputT[i][1]; input4[2][i] = inputT[i][2]; input4[3][i] = 1; } - vpMatrix output4(4, N); + vpMatrix output4(4, vecsize); double t14 = vpTime::measureTimeMs(); vpMatrix::mult2Matrices(static_cast(M), input4, output4); double t24 = vpTime::measureTimeMs(); timeMult.push_back(t24 - t14); - double errorT = (outputR - outputT).frobeniusNorm() / N; - double error = (outputR - output.t()).frobeniusNorm() / N; + double errorT = (outputR - outputT).frobeniusNorm() / vecsize; + double error = (outputR - output.t()).frobeniusNorm() / vecsize; if (errorT > 1e-10 || error > 1e-10) { std::cout << "M = " << M << std::endl; diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 820bca14fb..ba84966fa7 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -574,27 +574,24 @@ TEST_CASE("Default constructor", "[rotation matrix]") TEST_CASE("Vector rotation", "project") { std::cout << "\n== Test rotation of vectors ==" << std::endl; -#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE3) // Should be adapted to future evolution of vpRotationMatrix::rotateVectors() - std::vector NS = { 1, 10, 100, 1000, 10000 }; -#else - std::vector NS = { 1, 10 }; -#endif - for (unsigned int N : NS) { - std::cout << "** Running for N = " << N << std::endl; + std::map map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} }; + + for (const auto &[vecsize, trials] : map_vecsize_trials) { + std::cout << "** Running for vector size = " << vecsize << std::endl; std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; vpUniRand r(42); - for (unsigned int trial = 0; trial < 100000; ++trial) { + for (unsigned int trial = 0; trial < trials; ++trial) { vpRotationMatrix M(r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI)); - vpMatrix inputT(N, 3); + vpMatrix inputT(vecsize, 3); - for (unsigned int i = 0; i< N; ++i) { + for (unsigned int i = 0; i< vecsize; ++i) { inputT[i][0] = r.uniform(0.0, 1.0); inputT[i][1] = r.uniform(0.0, 1.0); inputT[i][2] = r.uniform(0.0, 1.0); } vpMatrix input = inputT.t(); - vpMatrix outputT(N, 3); - vpMatrix output(3, N); + vpMatrix outputT(vecsize, 3); + vpMatrix output(3, vecsize); double t1 = vpTime::measureTimeMs(); M.rotateVectors(inputT, outputT, true); double t2 = vpTime::measureTimeMs(); @@ -606,7 +603,7 @@ TEST_CASE("Vector rotation", "project") vpColVector x(3, 1); vpColVector res(3); - vpMatrix outputR(N, 3); + vpMatrix outputR(vecsize, 3); double t1r = vpTime::measureTimeMs(); for (unsigned int i = 0; i < inputT.getRows(); ++i) { @@ -622,15 +619,15 @@ TEST_CASE("Vector rotation", "project") double t2r = vpTime::measureTimeMs(); timeNaive.push_back(t2r - t1r); - vpMatrix outputMM(3, N); + vpMatrix outputMM(3, vecsize); double t14 = vpTime::measureTimeMs(); vpMatrix::mult2Matrices((vpMatrix)M, input, outputMM); double t24 = vpTime::measureTimeMs(); timeMult.push_back(t24 - t14); - double errorT = (outputR - outputT).frobeniusNorm() / N; - double error = (outputR - output.t()).frobeniusNorm() / N; + double errorT = (outputR - outputT).frobeniusNorm() / vecsize; + double error = (outputR - output.t()).frobeniusNorm() / vecsize; if (errorT > 1e-10) { std::cerr << "Transposed version failed" << std::endl;