diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f3ad49afe..edb0af45da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ ############################################################################# # # ViSP, open source Visual Servoing Platform software. -# Copyright (C) 2005 - 2024 by Inria. All rights reserved. +# Copyright (C) 2005 - 2025 by Inria. All rights reserved. # # This software is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -381,6 +381,9 @@ VP_OPTION(ENABLE_SSSE3 "" "" "Enable SSSE3 instructions" "" ON IF ((MSVC OR CMAK if(X86_64) VP_OPTION(ENABLE_AVX "" "" "Enable AVX instructions" "" OFF) # should be explicitly enabled, used in matrix transpose code endif() +if(CMAKE_COMPILER_IS_GNUCXX) + VP_OPTION(ENABLE_NATIVE_ARCH "" "" "Enable all available CPU instruction sets" "" OFF) +endif() #---------------------------------------------------------------------- # BLAS / LAPACK diff --git a/cmake/AddExtraCompilationFlags.cmake b/cmake/AddExtraCompilationFlags.cmake index 35f0a269dc..4eba86f73d 100644 --- a/cmake/AddExtraCompilationFlags.cmake +++ b/cmake/AddExtraCompilationFlags.cmake @@ -171,25 +171,30 @@ if(CMAKE_COMPILER_IS_GNUCXX) add_extra_compiler_options("-fvisibility-inlines-hidden") endif() - if(ENABLE_AVX AND X86_64) - add_extra_compiler_options("-mavx") + if(ENABLE_NATIVE_ARCH) + add_extra_compiler_options("-march=native") else() - if(ENABLE_SSE2) - add_extra_compiler_options("-msse2") - elseif(X86 OR X86_64) - add_extra_compiler_options("-mno-sse2") - endif() - if(ENABLE_SSE3) - add_extra_compiler_options("-msse3") - elseif(X86 OR X86_64) - #add_extra_compiler_options("-mno-sse3") - endif() + if(ENABLE_AVX AND X86_64) + add_extra_compiler_options("-mavx") + else() + if(ENABLE_SSE2) + add_extra_compiler_options("-msse2") + elseif(X86 OR X86_64) + add_extra_compiler_options("-mno-sse2") + endif() + + if(ENABLE_SSE3) + add_extra_compiler_options("-msse3") + elseif(X86 OR X86_64) + #add_extra_compiler_options("-mno-sse3") + endif() - if(ENABLE_SSSE3) - add_extra_compiler_options("-mssse3") - elseif(X86 OR X86_64) - add_extra_compiler_options("-mno-ssse3") + if(ENABLE_SSSE3) + add_extra_compiler_options("-mssse3") + elseif(X86 OR X86_64) + add_extra_compiler_options("-mno-ssse3") + endif() endif() endif() diff --git a/modules/core/include/visp3/core/vpHomogeneousMatrix.h b/modules/core/include/visp3/core/vpHomogeneousMatrix.h index 76950d9a82..68f33ed3b4 100644 --- a/modules/core/include/visp3/core/vpHomogeneousMatrix.h +++ b/modules/core/include/visp3/core/vpHomogeneousMatrix.h @@ -381,6 +381,8 @@ class VISP_EXPORT vpHomogeneousMatrix : public vpArray2D vpHomogeneousMatrix &operator<<(double val); vpHomogeneousMatrix &operator,(double val); + void project(const vpMatrix &inputPoints, vpMatrix &outputPoints, bool isTransposed) const; + void orthogonalizeRotation(); void print() const; diff --git a/modules/core/include/visp3/core/vpRotationMatrix.h b/modules/core/include/visp3/core/vpRotationMatrix.h index fbbf0b6f0e..38ec6f2a05 100644 --- a/modules/core/include/visp3/core/vpRotationMatrix.h +++ b/modules/core/include/visp3/core/vpRotationMatrix.h @@ -190,6 +190,8 @@ class VISP_EXPORT vpRotationMatrix : public vpArray2D void printVector(); + void rotateVectors(const vpMatrix &inputs, vpMatrix &outputs, bool isTransposed) const; + /*! This function is not applicable to a rotation matrix that is always a 3-by-3 matrix. diff --git a/modules/core/src/math/matrix/private/vpSIMDUtils.h b/modules/core/src/math/matrix/private/vpSIMDUtils.h new file mode 100644 index 0000000000..7d81d01a77 --- /dev/null +++ b/modules/core/src/math/matrix/private/vpSIMDUtils.h @@ -0,0 +1,231 @@ +/* + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See https://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * SIMD utilities. + */ + +/*! + \file vpSIMDUtils.h + \brief Header that defines and includes useful SIMD routines and macros +*/ + +#ifndef VP_SIMD_UTILS_H +#define VP_SIMD_UTILS_H +#include + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#include +#include +#include + +#define VISP_HAVE_SSE2 1 +#endif + +#if defined __AVX2__ +#define VISP_HAVE_AVX2 1 +#endif + +#if defined __AVX__ +#define VISP_HAVE_AVX 1 +#endif + +// https://stackoverflow.com/a/40765925 +#if !defined(__FMA__) && defined(__AVX2__) +#define __FMA__ 1 +#endif + + +#if defined(__FMA__) +#define VISP_HAVE_FMA +#endif + +#if defined _WIN32 && defined(_M_ARM64) +#define _ARM64_DISTINCT_NEON_TYPES +#include +#include +#define VISP_HAVE_NEON 1 +#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) +#include +#define VISP_HAVE_NEON 1 +#else +#define VISP_HAVE_NEON 0 +#endif + +#if VISP_HAVE_SSE2 && USE_SIMD_CODE +#define USE_SSE 1 +#else +#define USE_SSE 0 +#endif + +#if VISP_HAVE_NEON && USE_SIMD_CODE +#define USE_NEON 1 +#else +#define USE_NEON 0 +#endif + +namespace vpSIMD +{ +#if defined(VISP_HAVE_AVX2) +using Register = __m512d; + +inline constexpr int numLanes = 8; +inline const Register add(const Register a, const Register b) +{ + return _mm512_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm512_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm512_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm512_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm512_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm512_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm512_storeu_pd(data, a); +} + +#elif defined(VISP_HAVE_AVX) +using Register = __m256d; +inline const int numLanes = 4; + +inline Register add(const Register a, const Register b) +{ + return _mm256_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm256_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm256_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm256_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm256_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm256_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm256_storeu_pd(data, a); +} + +#elif VISP_HAVE_SSE2 +using Register = __m128d; +inline const int numLanes = 2; + +inline Register add(const Register a, const Register b) +{ + return _mm_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm_storeu_pd(data, a); +} + +#endif + +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // VP_SIMD_UTILS_H diff --git a/modules/core/src/math/robust/vpRobust.cpp b/modules/core/src/math/robust/vpRobust.cpp index 27fe9159c6..08b0c8da09 100644 --- a/modules/core/src/math/robust/vpRobust.cpp +++ b/modules/core/src/math/robust/vpRobust.cpp @@ -107,9 +107,9 @@ vpRobust &vpRobust::operator=(const vpRobust &&other) void vpRobust::resize(unsigned int n_data) { if (n_data != m_size) { - m_normres.resize(n_data); - m_sorted_normres.resize(n_data); - m_sorted_residues.resize(n_data); + m_normres.resize(n_data, false); + m_sorted_normres.resize(n_data, false); + m_sorted_residues.resize(n_data, false); m_size = n_data; } } @@ -143,6 +143,8 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector // Calculate median med = select(m_sorted_residues, 0, n_data - 1, ind_med); + + // med = select(m_sorted_residues, 0, n_data - 1, ind_med); // --comment: residualMedian = med // Normalize residues @@ -153,6 +155,7 @@ void vpRobust::MEstimator(const vpRobustEstimatorType method, const vpColVector // Calculate MAD normmedian = select(m_sorted_normres, 0, n_data - 1, ind_med); + // normalizedResidualMedian = normmedian ; // 1.48 keeps scale estimate consistent for a normal probability dist. m_mad = 1.4826 * normmedian; // median Absolute Deviation diff --git a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp index 8e17bea6a5..175bf5ceaa 100644 --- a/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp +++ b/modules/core/src/math/transformation/vpHomogeneousMatrix.cpp @@ -43,8 +43,184 @@ #include #include +#include "../matrix/private/vpSIMDUtils.h" + + BEGIN_VISP_NAMESPACE const unsigned int vpHomogeneousMatrix::constr_value_4 = 4; + +/** + * Project a set of 3D points using the homogeneous matrix. + * The input points can be stored either by row or by column. + * \param input : Input matrix containing 3D points to project. By default, each column is a 3D point with 3 rows. + * When `transposed` is true, each row is a 3D point with 3 columns. + * \param output : Output matrix containing projected points. Has the same layout and size as the input matrix. + * \param transposed : If true, input matrix is transposed (each row is a 3D point). + */ +void vpHomogeneousMatrix::project(const vpMatrix &input, vpMatrix &output, bool transposed) const +{ + output.resize(input.getRows(), input.getCols(), false, false); + if (transposed) { + + if (input.getCols() != 3) { + throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); + } + double *inputData = input.data; +#if defined(VISP_HAVE_AVX) + double *outputData = output.data; + __m256d rows[] = { + _mm256_loadu_pd(rowPtrs[0]), + _mm256_loadu_pd(rowPtrs[1]), + _mm256_loadu_pd(rowPtrs[2]), + _mm256_loadu_pd(rowPtrs[3]), + + }; + + double result[4]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 1.0); + + const __m256d rs[] = { + _mm256_mul_pd(rows[0], xyzw), + _mm256_mul_pd(rows[1], xyzw), + _mm256_mul_pd(rows[2], xyzw), + }; + + __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]); + + + _mm256_storeu_pd(result, rs_half_sum1); + __m128d hi = _mm256_extractf128_pd(rs[2], 1); // [A3, A2] + __m128d lo = _mm256_castpd256_pd128(rs[2]); // [A1, A0] + + __m128d sum2 = _mm_add_pd(lo, hi); // [A3+A1, A2+A0] + __m128d sum1 = _mm_hadd_pd(sum2, sum2); // [total, total] + double total = _mm_cvtsd_f64(sum1); + + outputData[0] = result[0] + result[2]; + outputData[1] = result[1] + result[3]; + outputData[2] = total; + + inputData += 3; + outputData += 3; + } + +#elif (VISP_HAVE_SSE3) + double *outputData = output.data; + __m128d rows[] = { + _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), + _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), + _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2) + }; + for (unsigned int i = 0; i < input.getRows(); ++i) { + + __m128d r1 = _mm_set_pd(inputData[1], inputData[0]); + __m128d r2 = _mm_set_pd(1.0, inputData[2]); + + for (unsigned int j = 0; j < 3; ++j) { + __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); +#if !defined(VISP_HAVE_FMA) + __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); + __m128d add = _mm_add_pd(m1, m2); +#else + __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); +#endif + __m128d sum = _mm_hadd_pd(add, add); + outputData[j] = _mm_cvtsd_f64(sum); + } + inputData += 3; + outputData += 3; + } +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + output[i][0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2] + r0[3]; + output[i][1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2] + r1[3]; + output[i][2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2] + r2[3]; + inputData += 3; + } +#endif + } + else { + if (input.getRows() != 3) { + throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); + } + double *inputX = input[0]; + double *inputY = input[1]; + double *inputZ = input[2]; + double *outputX = output[0]; + double *outputY = output[1]; + double *outputZ = output[2]; + +#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) + vpSIMD::Register elems[12]; + for (unsigned int i = 0; i < 12; ++i) { + elems[i] = vpSIMD::set1(data[i]); + } + + for (int i = 0; i <= static_cast(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { + const vpSIMD::Register x4 = vpSIMD::loadu(inputX); + const vpSIMD::Register y4 = vpSIMD::loadu(inputY); + const vpSIMD::Register z4 = vpSIMD::loadu(inputZ); + + vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]); + dp1 = vpSIMD::fma(y4, elems[1], dp1); + dp1 = vpSIMD::fma(z4, elems[2], dp1); + dp1 = vpSIMD::add(elems[3], dp1); + + vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[4]); + dp2 = vpSIMD::fma(y4, elems[5], dp2); + dp2 = vpSIMD::fma(z4, elems[6], dp2); + dp2 = vpSIMD::add(elems[7], dp2); + + vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[8]); + dp3 = vpSIMD::fma(y4, elems[9], dp3); + dp3 = vpSIMD::fma(z4, elems[10], dp3); + dp3 = vpSIMD::add(elems[11], dp3); + + vpSIMD::storeu(outputX, dp1); + vpSIMD::storeu(outputY, dp2); + vpSIMD::storeu(outputZ, dp3); + + inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes; + outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes; + + } + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) { + // std::cout << "i = " << i << std::endl; + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3]; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3]; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3]; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getCols(); ++i) { + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z + r0[3]; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z + r1[3]; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z + r2[3]; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } +#endif + } +} + + /*! Construct an homogeneous matrix from a translation vector and quaternion rotation vector. @@ -651,16 +827,41 @@ vpColVector vpHomogeneousMatrix::operator*(const vpColVector &v) const "(%dx1) column vector", v.getRows())); } - vpColVector p(rowNum); + vpColVector p(rowNum, 0.0); - p = 0.0; +// #if USE_SSE +// __m128d rows[] = { +// _mm_loadu_pd(rowPtrs[0]), _mm_loadu_pd(rowPtrs[0] + 2), +// _mm_loadu_pd(rowPtrs[1]), _mm_loadu_pd(rowPtrs[1] + 2), +// _mm_loadu_pd(rowPtrs[2]), _mm_loadu_pd(rowPtrs[2] + 2), +// _mm_loadu_pd(rowPtrs[3]), _mm_loadu_pd(rowPtrs[3] + 2) + +// }; + +// __m128d r1 = _mm_loadu_pd(v.data); +// __m128d r2 = _mm_loadu_pd(v.data + 2); + +// for (unsigned int j = 0; j < 4; ++j) { +// __m128d m1 = _mm_mul_pd(r1, rows[j * 2]); +// #if !defined(__FMA__) +// __m128d m2 = _mm_mul_pd(r2, rows[j * 2 + 1]); +// __m128d add = _mm_add_pd(m1, m2); +// #else +// __m128d add = _mm_fmadd_pd(r2, rows[j * 2 + 1], m1); +// #endif +// __m128d sum = _mm_hadd_pd(add, add); +// p[j] = _mm_cvtsd_f64(sum); +// } + + +// #else for (unsigned int j = 0; j < val_4; ++j) { for (unsigned int i = 0; i < val_4; ++i) { p[i] += rowPtrs[i][j] * v[j]; } } - +// #endif return p; } diff --git a/modules/core/src/math/transformation/vpRotationMatrix.cpp b/modules/core/src/math/transformation/vpRotationMatrix.cpp index 067db852cf..bd6337a70f 100644 --- a/modules/core/src/math/transformation/vpRotationMatrix.cpp +++ b/modules/core/src/math/transformation/vpRotationMatrix.cpp @@ -1,6 +1,6 @@ /* * ViSP, open source Visual Servoing Platform software. - * Copyright (C) 2005 - 2024 by Inria. All rights reserved. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. * * This software is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +40,8 @@ #include #include +#include "../matrix/private/vpSIMDUtils.h" + // Rotation classes #include @@ -50,6 +52,177 @@ #include BEGIN_VISP_NAMESPACE + +/*! + * Rotate a set of vectors by this rotation matrix. + * \param input : Input matrix containing vectors to rotate. By default, each column is a 3D vector with 3 rows. + * When `isTransposed` is true, each row is a 3D vector with 3 columns. + * \param output : Output matrix containing rotated vectors. Has the same layout and size as the input matrix. + * \param isTransposed : If true, input matrix is transposed (each row is a 3D vector). +**/ +void vpRotationMatrix::rotateVectors(const vpMatrix &input, vpMatrix &output, bool isTransposed) const +{ + output.resize(input.getRows(), input.getCols(), false, false); + if (isTransposed) { + if (input.getCols() != 3) { + throw vpException(vpException::dimensionError, "Input matrix should have 3 columns"); + } + double *inputData = input.data; + double *outputData = output.data; +#if defined(VISP_HAVE_AVX) + __m256d rows[] = { + _mm256_setr_pd(rowPtrs[0][0], rowPtrs[0][1], rowPtrs[0][2], 0.0), + _mm256_setr_pd(rowPtrs[1][0], rowPtrs[1][1], rowPtrs[1][2], 0.0), + _mm256_setr_pd(rowPtrs[2][0], rowPtrs[2][1], rowPtrs[2][2], 0.0), + }; + + double result[4]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + const __m256d xyzw = _mm256_setr_pd(inputData[0], inputData[1], inputData[2], 0.0); + + const __m256d rs[] = { + _mm256_mul_pd(rows[0], xyzw), + _mm256_mul_pd(rows[1], xyzw), + _mm256_mul_pd(rows[2], xyzw), + }; + + __m256d rs_half_sum1 = _mm256_hadd_pd(rs[0], rs[1]); + + + _mm256_storeu_pd(result, rs_half_sum1); + __m128d hi = _mm256_extractf128_pd(rs[2], 1); // [A3, A2] + __m128d lo = _mm256_castpd256_pd128(rs[2]); // [A1, A0] + + __m128d sum2 = _mm_add_pd(lo, hi); // [A3+A1, A2+A0] + __m128d sum1 = _mm_hadd_pd(sum2, sum2); // [total, total] + double total = _mm_cvtsd_f64(sum1); + + outputData[0] = result[0] + result[2]; + outputData[1] = result[1] + result[3]; + outputData[2] = total; + + inputData += 3; + outputData += 3; + } + +#elif VISP_HAVE_SSE3 + __m128d row01_lo = _mm_loadu_pd(rowPtrs[0]); + __m128d row01_hi = _mm_loadu_pd(rowPtrs[1]); + __m128d row2_xy = _mm_loadu_pd(rowPtrs[2]); + + // Preload third column as scalars + double c0 = rowPtrs[0][2]; + double c1 = rowPtrs[1][2]; + double c2 = rowPtrs[2][2]; + + for (unsigned i = 0; i < input.getRows(); ++i) { + + const __m128d xy = _mm_loadu_pd(inputData); + + const double z = inputData[2]; + // const __m128d zz = _mm_set_pd(z, z); + + __m128d mul0 = _mm_mul_pd(row01_lo, xy); + __m128d mul1 = _mm_mul_pd(row01_hi, xy); + __m128d r01 = _mm_hadd_pd(mul0, mul1); + __m128d mul2 = _mm_mul_pd(row2_xy, xy); + + // Adding scalars at the end without using simd is faster + // __m128d mulzz = _mm_mul_pd(row01_z, zz); + // r01 = _mm_add_pd(r01, mulzz); + + // store result + outputData[0] = _mm_cvtsd_f64(r01) + c0 * z; + outputData[1] = _mm_cvtsd_f64(_mm_unpackhi_pd(r01, r01)) + c1 * z; + outputData[2] = _mm_cvtsd_f64(_mm_hadd_pd(mul2, mul2)) + c2 * z; + + inputData += 3; + outputData += 3; + } +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getRows(); ++i) { + outputData[0] = r0[0] * inputData[0] + r0[1] * inputData[1] + r0[2] * inputData[2]; + outputData[1] = r1[0] * inputData[0] + r1[1] * inputData[1] + r1[2] * inputData[2]; + outputData[2] = r2[0] * inputData[0] + r2[1] * inputData[1] + r2[2] * inputData[2]; + inputData += 3; + outputData += 3; + } +#endif + } + else { // 3xN matrix + if (input.getRows() != 3) { + throw vpException(vpException::dimensionError, "Expected input to have 3 rows"); + } + double *inputX = input[0]; + double *inputY = input[1]; + double *inputZ = input[2]; + double *outputX = output[0]; + double *outputY = output[1]; + double *outputZ = output[2]; + +#if defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) || defined(VISP_HAVE_AVX2) + + vpSIMD::Register elems[9]; + for (unsigned int i = 0; i < 9; ++i) { + elems[i] = vpSIMD::set1(data[i]); + } + for (int i = 0; i <= static_cast(input.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { + const vpSIMD::Register x4 = vpSIMD::loadu(inputX); + const vpSIMD::Register y4 = vpSIMD::loadu(inputY); + const vpSIMD::Register z4 = vpSIMD::loadu(inputZ); + + vpSIMD::Register dp1 = vpSIMD::mul(x4, elems[0]); + dp1 = vpSIMD::fma(y4, elems[1], dp1); + dp1 = vpSIMD::fma(z4, elems[2], dp1); + vpSIMD::Register dp2 = vpSIMD::mul(x4, elems[3]); + dp2 = vpSIMD::fma(y4, elems[4], dp2); + dp2 = vpSIMD::fma(z4, elems[5], dp2); + vpSIMD::Register dp3 = vpSIMD::mul(x4, elems[6]); + dp3 = vpSIMD::fma(y4, elems[7], dp3); + dp3 = vpSIMD::fma(z4, elems[8], dp3); + + vpSIMD::storeu(outputX, dp1); + vpSIMD::storeu(outputY, dp2); + vpSIMD::storeu(outputZ, dp3); + + inputX += vpSIMD::numLanes; inputY += vpSIMD::numLanes; inputZ += vpSIMD::numLanes; + outputX += vpSIMD::numLanes; outputY += vpSIMD::numLanes; outputZ += vpSIMD::numLanes; + } + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = (input.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes; i < input.getCols(); ++i) { + // std::cout << "i = " << i << std::endl; + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } +#else + double *r0 = rowPtrs[0]; + double *r1 = rowPtrs[1]; + double *r2 = rowPtrs[2]; + + for (unsigned int i = 0; i < input.getCols(); ++i) { + double X = *inputX, Y = *inputY, Z = *inputZ; + *outputX = r0[0] * X + r0[1] * Y + r0[2] * Z; + *outputY = r1[0] * X + r1[1] * Y + r1[2] * Z; + *outputZ = r2[0] * X + r2[1] * Y + r2[2] * Z; + ++inputX; ++inputY; ++inputZ; + ++outputX; ++outputY; ++outputZ; + } +#endif + } +} + const unsigned int vpRotationMatrix::constr_val_3 = 3; /*! Initialize the rotation matrix as identity. diff --git a/modules/core/test/math/catchHomogeneousMatrix.cpp b/modules/core/test/math/catchHomogeneousMatrix.cpp index af819ab093..a6e11fcb0c 100644 --- a/modules/core/test/math/catchHomogeneousMatrix.cpp +++ b/modules/core/test/math/catchHomogeneousMatrix.cpp @@ -1,6 +1,6 @@ /* * ViSP, open source Visual Servoing Platform software. - * Copyright (C) 2005 - 2024 by Inria. All rights reserved. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. * * This software is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,6 +38,8 @@ */ #include +#include + #if defined(VISP_HAVE_CATCH2) #include @@ -61,6 +63,7 @@ bool test_matrix_equal(const vpHomogeneousMatrix &M1, const vpHomogeneousMatrix TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneousMatrix]") { + std::cout << "\n== Test vpHomogeneousMatrix re-orthogonalize rotation matrix ==" << std::endl; CHECK_NOTHROW([]() { vpHomogeneousMatrix M { 0.9835, -0.0581, 0.1716, 0.0072, -0.0489, -0.9972, -0.0571, 0.0352, 0.1744, 0.0478, -0.9835, 0.9470 }; @@ -120,6 +123,7 @@ TEST_CASE("vpHomogeneousMatrix re-orthogonalize rotation matrix", "[vpHomogeneou TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatrix]") { + std::cout << "\n== Test vpRotationMatrix re-orthogonalize rotation matrix ==" << std::endl; CHECK_NOTHROW( []() { vpRotationMatrix R { 0.9835, -0.0581, 0.1716, -0.0489, -0.9972, -0.0571, 0.1744, 0.0478, -0.9835 }; }()); @@ -163,6 +167,7 @@ TEST_CASE("vpRotationMatrix re-orthogonalize rotation matrix", "[vpRotationMatri TEST_CASE("ENU to NED conversion", "[enu2ned]") { + std::cout << "\n== Test ENU to NED conversion ==" << std::endl; vpHomogeneousMatrix enu_M_flu { 0, -1, 0, 0.2, 1, 0, 0, 1., 0, 0, 1, 0.3 }; std::cout << "enu_M_flu:\n" << enu_M_flu << std::endl; @@ -204,6 +209,7 @@ TEST_CASE("ENU to NED conversion", "[enu2ned]") TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]") { + std::cout << "\n== Test vpHomogenousMatrix * vpRotationMatrix ==" << std::endl; // Test rotation_matrix * homogeneous_matrix vpHomogeneousMatrix _1_M_2_ { 0.9835, -0.0581, 0.1716, 0.0072, @@ -224,6 +230,97 @@ TEST_CASE("vpHomogenousMatrix * vpRotationMatrix", "[operator*]") CHECK(success); } +TEST_CASE("Point projection", "project") +{ + std::cout << "\n== Test point projection ==" << std::endl; + std::map map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} }; + + for (const auto &[vecsize, trials] : map_vecsize_trials) { + std::cout << "** Running for vector size = " << vecsize << std::endl; + std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; + vpUniRand r(42); + for (unsigned int trial = 0; trial < trials; ++trial) { + vpHomogeneousMatrix M( + r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), r.uniform(0.0, 1.0), + r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI)); + vpMatrix inputT(vecsize, 3); + for (unsigned int i = 0; i< vecsize; ++i) { + inputT[i][0] = r.uniform(0.0, 1.0); + inputT[i][1] = r.uniform(0.0, 1.0); + inputT[i][2] = r.uniform(0.0, 1.0); + } + vpMatrix input = inputT.t(); + vpMatrix outputT(vecsize, 3); + vpMatrix output(3, vecsize); + + double t1 = vpTime::measureTimeMs(); + M.project(inputT, outputT, true); + double t2 = vpTime::measureTimeMs(); + timeProjectTransposed.push_back(t2 - t1); + + t1 = vpTime::measureTimeMs(); + M.project(input, output, false); + t2 = vpTime::measureTimeMs(); + timeProject.push_back(t2 - t1); + + vpColVector x(4, 1); + vpColVector res(4); + + vpMatrix outputR(vecsize, 3); + double t1r = vpTime::measureTimeMs(); + + for (unsigned int i = 0; i < inputT.getRows(); ++i) { + x[0] = inputT[i][0]; + x[1] = inputT[i][1]; + x[2] = inputT[i][2]; + + vpColVector res = M * x; + outputR[i][0] = res[0]; + outputR[i][1] = res[1]; + outputR[i][2] = res[2]; + } + double t2r = vpTime::measureTimeMs(); + timeNaive.push_back(t2r - t1r); + + vpMatrix input4(4, vecsize); + for (unsigned int i = 0; i < inputT.getRows(); ++i) { + input4[0][i] = inputT[i][0]; + input4[1][i] = inputT[i][1]; + input4[2][i] = inputT[i][2]; + input4[3][i] = 1; + } + vpMatrix output4(4, vecsize); + double t14 = vpTime::measureTimeMs(); + + vpMatrix::mult2Matrices(static_cast(M), input4, output4); + double t24 = vpTime::measureTimeMs(); + timeMult.push_back(t24 - t14); + + double errorT = (outputR - outputT).frobeniusNorm() / vecsize; + double error = (outputR - output.t()).frobeniusNorm() / vecsize; + + if (errorT > 1e-10 || error > 1e-10) { + std::cout << "M = " << M << std::endl; + std::cerr << "Naive outputT = " << outputR << std::endl; + std::cerr << "Mult outputT = " << output4.t() << std::endl; + std::cerr << "Project output transposed version = " << outputT << std::endl; + std::cerr << "Project output = " << outputT << std::endl; + std::cerr << "Diff transposed = " << outputR - outputT << std::endl; + std::cerr << "Diff = " << outputR - output.t() << std::endl; + + FAIL(); + } + } + std::cout << "Optimized version (transposed) took: " << vpMath::getMean(timeProjectTransposed) << " +-" << vpMath::getStdev(timeProjectTransposed) << "ms" << std::endl; + std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +-" << vpMath::getStdev(timeProject) << "ms" << std::endl; + + std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +-" << vpMath::getStdev(timeMult) << "ms" << std::endl; + std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +-" << vpMath::getStdev(timeNaive) << "ms" << std::endl; + std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl; + + } +} + int main(int argc, char *argv[]) { Catch::Session session; diff --git a/modules/core/test/math/catchRotation.cpp b/modules/core/test/math/catchRotation.cpp index 5031a2efcc..ba84966fa7 100644 --- a/modules/core/test/math/catchRotation.cpp +++ b/modules/core/test/math/catchRotation.cpp @@ -134,6 +134,7 @@ TEST_CASE("Common rotation operations", "[rotation]") { SECTION("Theta u initialization") { + std::cout << "\n== Test theta u initialization ==" << std::endl; vpThetaUVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -169,6 +170,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Rxyz initialization") { + std::cout << "\n== Test Rxyz initialization ==" << std::endl; vpRxyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -203,6 +205,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("rzyx initialization") { + std::cout << "\n== Test rzyx initialization ==" << std::endl; vpRzyxVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -237,6 +240,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("rzyz initialiation") { + std::cout << "\n== Test rzyz initialization ==" << std::endl; vpRzyzVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(3, vpMath::rad(10)); vpColVector bench3(3, vpMath::rad(10)); @@ -271,6 +275,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Test quaternion initialization", "[quaternion]") { + std::cout << "\n== Test quaternion initialization ==" << std::endl; vpQuaternionVector r1(vpMath::rad(10), vpMath::rad(10), vpMath::rad(10), vpMath::rad(10)); std::vector bench1(4, vpMath::rad(10)); vpColVector bench3(4, vpMath::rad(10)); @@ -305,41 +310,41 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Conversions") { + std::cout << "\n== Test conversions ==" << std::endl; vpRotationMatrix R; for (int i = -10; i < 10; i++) { for (int j = -10; j < 10; j++) { vpThetaUVector tu(vpMath::rad(90 + i), vpMath::rad(170 + j), vpMath::rad(45)); tu.buildFrom(vpRotationMatrix(tu)); // put some coherence into rotation convention - std::cout << "Initialization " << std::endl; + std::cout << "\n** Initialization with theta_u: " << tu.t() << std::endl; double theta; vpColVector u; tu.extract(theta, u); - std::cout << "theta=" << vpMath::deg(theta) << std::endl; - std::cout << "u=" << u << std::endl; + std::cout << "Extracted theta: " << vpMath::deg(theta) << " deg" << std::endl; + std::cout << "Extracted u: " << u.t() << std::endl; std::cout << "From vpThetaUVector to vpRotationMatrix " << std::endl; R.buildFrom(tu); - std::cout << "Matrix R"; + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); - std::cout << R << std::endl; - std::cout << "From vpRotationMatrix to vpQuaternionVector " << std::endl; vpQuaternionVector q(R); + std::cout << "Quaternion: " << q.t() << std::endl; CHECK(q.magnitude() == Catch::Approx(1.0).margin(1e-4)); - std::cout << q << std::endl; + std::cout << "From vpQuaternionVector to vpRotationMatrix " << std::endl; R.buildFrom(q); + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); - std::cout << "From vpQuaternionVector to vpRotationMatrix " << std::endl; std::cout << "From vpRotationMatrix to vpRxyzVector " << std::endl; vpRxyzVector RxyzbuildR(R); - std::cout << RxyzbuildR << std::endl; + std::cout << "Rxyz: " << RxyzbuildR.t() << std::endl; std::cout << "From vpRxyzVector to vpThetaUVector " << std::endl; std::cout << " use From vpRxyzVector to vpRotationMatrix " << std::endl; @@ -348,15 +353,14 @@ TEST_CASE("Common rotation operations", "[rotation]") vpThetaUVector tubuildEu; tubuildEu.buildFrom(R); - std::cout << std::endl; - std::cout << "result : should equivalent to the first one " << std::endl; + std::cout << "Result : should equivalent to the first one " << std::endl; double theta2; vpColVector u2; tubuildEu.extract(theta2, u2); - std::cout << "theta=" << vpMath::deg(theta2) << std::endl; - std::cout << "u=" << u2 << std::endl; + std::cout << "Extracted theta: " << vpMath::deg(theta2) << " deg" << std::endl; + std::cout << "Extracted u: " << u2.t() << std::endl; CHECK(vpMath::abs(theta2 - theta) < std::numeric_limits::epsilon() * 1e10); CHECK(vpMath::abs(u[0] - u2[0]) < std::numeric_limits::epsilon() * 1e10); @@ -366,35 +370,39 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Conversion from and to rzyz vector") { + std::cout << "\n== Test conversion from and to rzyz vector ==" << std::endl; vpRzyzVector rzyz(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45)); - std::cout << "Initialization vpRzyzVector " << std::endl; - std::cout << rzyz << std::endl; + std::cout << "\n** Initialization with rzyz: " << rzyz.t() << std::endl; std::cout << "From vpRzyzVector to vpRotationMatrix " << std::endl; + R.buildFrom(rzyz); + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); std::cout << "From vpRotationMatrix to vpRzyzVector " << std::endl; vpRzyzVector rzyz_final; rzyz_final.buildFrom(R); + std::cout << "rzyz: " << rzyz_final.t() << std::endl; CHECK(test("rzyz", rzyz_final, vpColVector(rzyz))); - std::cout << rzyz_final << std::endl; } SECTION("Conversion from and to rzyx vector") { + std::cout << "\n== Test conversion from and to rzyx vector ==" << std::endl; vpRzyxVector rzyx(vpMath::rad(180), vpMath::rad(120), vpMath::rad(45)); - std::cout << "Initialization vpRzyxVector " << std::endl; - std::cout << rzyx << std::endl; + std::cout << "\n** Initialization with rzyx: " << rzyx.t() << std::endl; std::cout << "From vpRzyxVector to vpRotationMatrix " << std::endl; R.buildFrom(rzyx); + std::cout << "Matrix R: \n" << R << std::endl; CHECK(R.isARotationMatrix()); - std::cout << R << std::endl; std::cout << "From vpRotationMatrix to vpRzyxVector " << std::endl; vpRzyxVector rzyx_final; rzyx_final.buildFrom(R); + std::cout << "rzyx: " << rzyx_final.t() << std::endl; bool ret = test("rzyx", rzyx_final, vpColVector(rzyx)); if (ret == false) { // Euler angle representation is not unique std::cout << "Rzyx vector differ. Test rotation matrix..." << std::endl; vpRotationMatrix RR(rzyx_final); + std::cout << "Matrix R: \n" << RR << std::endl; if (R == RR) { std::cout << "Rzyx vector differ but rotation matrix is valid" << std::endl; ret = true; @@ -406,6 +414,7 @@ TEST_CASE("Common rotation operations", "[rotation]") } SECTION("Rotation matrix extraction from homogeneous matrix and multiplication") { + std::cout << "\n== Test rotation matrix extraction from homogeneous matrix and multiplication ==" << std::endl; // Test rotation_matrix * homogeneous_matrix vpHomogeneousMatrix _1_M_2_truth; _1_M_2_truth[0][0] = 0.9835; @@ -443,6 +452,7 @@ TEST_CASE("Common rotation operations", "[rotation]") TEST_CASE("Theta u multiplication", "[theta.u]") { + std::cout << "\n== Test theta u multiplication ==" << std::endl; const int nTrials = 100; const uint64_t seed = 0x123456789; vpUniRand rng(seed); @@ -469,6 +479,7 @@ TEST_CASE("Theta u multiplication", "[theta.u]") TEST_CASE("Quaternion multiplication", "[quaternion]") { + std::cout << "\n== Test quaternion multiplication ==" << std::endl; const int nTrials = 100; const uint64_t seed = 0x123456789; vpUniRand rng(seed); @@ -496,6 +507,7 @@ TEST_CASE("Quaternion multiplication", "[quaternion]") TEST_CASE("Default constructor", "[rotation matrix]") { + std::cout << "\n== Test default constructor ==" << std::endl; vpRotationMatrix I; SECTION("rotation matrix") { @@ -559,6 +571,82 @@ TEST_CASE("Default constructor", "[rotation matrix]") #endif } +TEST_CASE("Vector rotation", "project") +{ + std::cout << "\n== Test rotation of vectors ==" << std::endl; + std::map map_vecsize_trials = { {1, 100000}, {10, 10000}, {100, 1000}, {1000, 1000}, { 10000, 100} }; + + for (const auto &[vecsize, trials] : map_vecsize_trials) { + std::cout << "** Running for vector size = " << vecsize << std::endl; + std::vector timeProjectTransposed, timeProject, timeMult, timeNaive; + vpUniRand r(42); + for (unsigned int trial = 0; trial < trials; ++trial) { + vpRotationMatrix M(r.uniform(0.0, M_PI), r.uniform(0.0, M_PI), r.uniform(0.0, M_PI)); + vpMatrix inputT(vecsize, 3); + + for (unsigned int i = 0; i< vecsize; ++i) { + inputT[i][0] = r.uniform(0.0, 1.0); + inputT[i][1] = r.uniform(0.0, 1.0); + inputT[i][2] = r.uniform(0.0, 1.0); + } + vpMatrix input = inputT.t(); + vpMatrix outputT(vecsize, 3); + vpMatrix output(3, vecsize); + double t1 = vpTime::measureTimeMs(); + M.rotateVectors(inputT, outputT, true); + double t2 = vpTime::measureTimeMs(); + timeProjectTransposed.push_back(t2 - t1); + t1 = vpTime::measureTimeMs(); + M.rotateVectors(input, output, false); + t2 = vpTime::measureTimeMs(); + timeProject.push_back(t2 - t1); + + vpColVector x(3, 1); + vpColVector res(3); + vpMatrix outputR(vecsize, 3); + double t1r = vpTime::measureTimeMs(); + + for (unsigned int i = 0; i < inputT.getRows(); ++i) { + x[0] = inputT[i][0]; + x[1] = inputT[i][1]; + x[2] = inputT[i][2]; + + res = M * x; + outputR[i][0] = res[0]; + outputR[i][1] = res[1]; + outputR[i][2] = res[2]; + } + double t2r = vpTime::measureTimeMs(); + timeNaive.push_back(t2r - t1r); + + vpMatrix outputMM(3, vecsize); + double t14 = vpTime::measureTimeMs(); + + vpMatrix::mult2Matrices((vpMatrix)M, input, outputMM); + double t24 = vpTime::measureTimeMs(); + timeMult.push_back(t24 - t14); + + double errorT = (outputR - outputT).frobeniusNorm() / vecsize; + double error = (outputR - output.t()).frobeniusNorm() / vecsize; + + if (errorT > 1e-10) { + std::cerr << "Transposed version failed" << std::endl; + FAIL(); + } + + if (error > 1e-10) { + std::cerr << "Optimized 3xN version failed" << std::endl; + FAIL(); + } + } + std::cout << "Optimized (transposed) version took: " << vpMath::getMean(timeProjectTransposed) << " +- " << vpMath::getStdev(timeProjectTransposed) << "ms" << std::endl; + std::cout << "Optimized version took: " << vpMath::getMean(timeProject) << " +- " << vpMath::getStdev(timeProject) << "ms" << std::endl; + std::cout << "Mult version took: " << vpMath::getMean(timeMult) << " +- " << vpMath::getStdev(timeMult) << "ms" << std::endl; + std::cout << "Naive version took: " << vpMath::getMean(timeNaive) << " +- " << vpMath::getStdev(timeNaive) << "ms" << std::endl; + std::cout << "Speedup: " << vpMath::minimum(vpMath::getMean(timeNaive), vpMath::getMean(timeMult)) / vpMath::minimum(vpMath::getMean(timeProjectTransposed), vpMath::getMean(timeProject)) << std::endl; + } +} + int main(int argc, char *argv[]) { Catch::Session session; diff --git a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h index cd8136b900..07282c2377 100644 --- a/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h +++ b/modules/tracker/rbt/include/visp3/rbt/vpRBDenseDepthTracker.h @@ -151,7 +151,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker void extractFeatures(const vpRBFeatureTrackerInput &frame, const vpRBFeatureTrackerInput &previousFrame, const vpHomogeneousMatrix &cMo) VP_OVERRIDE; void trackFeatures(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { } - void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE { } + void initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) VP_OVERRIDE; void computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int iteration) VP_OVERRIDE; void display(const vpCameraParameters &cam, const vpImage &I, const vpImage &IRGB, const vpImage &depth) const VP_OVERRIDE; @@ -179,10 +179,8 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker for (vpMatrix *m: matrices) { m->resize(3, numPoints, false, false); } - m_valid.resize(numPoints); for (unsigned int i = 0; i < numPoints; ++i) { - m_valid[i] = true; for (unsigned int j = 0; j < 3; ++j) { m_oXt[j][i] = points[i].oX[j]; m_oNt[j][i] = points[i].objectNormal[j]; @@ -190,89 +188,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker } } } - inline void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, const vpImage &depth, vpColVector &e, vpMatrix &L) - { - const vpRotationMatrix cRo = cMo.getRotationMatrix(); - const vpTranslationVector t = cMo.getTranslationVector(); - vpMatrix::mult2Matrices(cRo, m_oXt, m_cXt); - vpMatrix::mult2Matrices(cRo, m_oNt, m_cNt); - - const unsigned int numPoints = m_oXt.getCols(); - e.resize(numPoints, false); - L.resize(numPoints, 6, false, false); - -#ifdef VISP_HAVE_OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < static_cast(numPoints); ++i) { - - //Step 1: update and filter out points that are no longer valid - { - - for (unsigned int j = 0; j < 3; ++j) { - m_cXt[j][i] += t[j]; - } - // Plane points away from the camera: this surface is no longer visible due to rotation - if (m_cNt[2][i] >= 0.0) { - m_valid[i] = false; - continue; - } - double x, y, u, v; - x = m_cXt[0][i] / m_cXt[2][i]; - y = m_cXt[1][i] / m_cXt[2][i]; - - vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v); - // Point is no longer in image: depth value cannot be sampled - if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) { - m_valid[i] = false; - continue; - } - const double Z = depth[static_cast(v)][static_cast(u)]; - // Z value in the depth image from the camera is invalid - if (Z <= 0.0) { - m_valid[i] = false; - continue; - } - - m_valid[i] = true; - m_observations[0][i] = x * Z; - m_observations[1][i] = y * Z; - m_observations[2][i] = Z; - } - // Step 2: update Jacobian and error for valid points - { - const double X = m_cXt[0][i], Y = m_cXt[1][i], Z = m_cXt[2][i]; - const double nX = m_cNt[0][i], nY = m_cNt[1][i], nZ = m_cNt[2][i]; + void updateAndErrorAndInteractionMatrix(const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, + const vpImage &depth, vpColVector &e, vpMatrix &L); - const double D = -((nX * X) + (nY * Y) + (nZ * Z)); - double projNormal = nX * m_observations[0][i] + nY * m_observations[1][i] + nZ * m_observations[2][i]; - - e[i] = D + projNormal; - - L[i][0] = nX; - L[i][1] = nY; - L[i][2] = nZ; - L[i][3] = nZ * Y - nY * Z; - L[i][4] = nX * Z - nZ * X; - L[i][5] = nY * X - nX * Y; - - } - } - - // Disable invalid points - for (unsigned int i = 0; i < numPoints; ++i) { - if (!m_valid[i]) { - e[i] = 0.0; - - L[i][0] = 0; - L[i][1] = 0; - L[i][2] = 0; - L[i][3] = 0; - L[i][4] = 0; - L[i][5] = 0; - } - } - } const vpMatrix &getPointsObject() const { return m_oXt; } const vpMatrix &getNormalsObject() const { return m_oNt; } @@ -285,7 +203,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker vpMatrix m_oNt; // a 3xN matrix containing the 3D normals on the object vpMatrix m_cXt; vpMatrix m_cNt; - std::vector m_valid; + std::vector m_invalid; }; @@ -304,7 +222,7 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT, "weight"}, {vpRBDenseDepthTracker::vpDisplayType::DT_ERROR, "error"}, {vpRBDenseDepthTracker::vpDisplayType::DT_WEIGHT_AND_ERROR, "weightAndError"} - }); +}); virtual void loadJsonConfiguration(const nlohmann::json &j) VP_OVERRIDE { @@ -330,6 +248,9 @@ class VISP_EXPORT vpRBDenseDepthTracker : public vpRBFeatureTracker bool m_useMask; float m_minMaskConfidence; vpDisplayType m_displayType; + + vpMatrix m_Lt; + }; END_VISP_NAMESPACE diff --git a/modules/tracker/rbt/src/core/private/vpSIMDUtils.h b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h new file mode 100644 index 0000000000..7d81d01a77 --- /dev/null +++ b/modules/tracker/rbt/src/core/private/vpSIMDUtils.h @@ -0,0 +1,231 @@ +/* + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2025 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See https://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * SIMD utilities. + */ + +/*! + \file vpSIMDUtils.h + \brief Header that defines and includes useful SIMD routines and macros +*/ + +#ifndef VP_SIMD_UTILS_H +#define VP_SIMD_UTILS_H +#include + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#include +#include +#include + +#define VISP_HAVE_SSE2 1 +#endif + +#if defined __AVX2__ +#define VISP_HAVE_AVX2 1 +#endif + +#if defined __AVX__ +#define VISP_HAVE_AVX 1 +#endif + +// https://stackoverflow.com/a/40765925 +#if !defined(__FMA__) && defined(__AVX2__) +#define __FMA__ 1 +#endif + + +#if defined(__FMA__) +#define VISP_HAVE_FMA +#endif + +#if defined _WIN32 && defined(_M_ARM64) +#define _ARM64_DISTINCT_NEON_TYPES +#include +#include +#define VISP_HAVE_NEON 1 +#elif (defined(__ARM_NEON__) || defined (__ARM_NEON)) && defined(__aarch64__) +#include +#define VISP_HAVE_NEON 1 +#else +#define VISP_HAVE_NEON 0 +#endif + +#if VISP_HAVE_SSE2 && USE_SIMD_CODE +#define USE_SSE 1 +#else +#define USE_SSE 0 +#endif + +#if VISP_HAVE_NEON && USE_SIMD_CODE +#define USE_NEON 1 +#else +#define USE_NEON 0 +#endif + +namespace vpSIMD +{ +#if defined(VISP_HAVE_AVX2) +using Register = __m512d; + +inline constexpr int numLanes = 8; +inline const Register add(const Register a, const Register b) +{ + return _mm512_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm512_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm512_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm512_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm512_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm512_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm512_storeu_pd(data, a); +} + +#elif defined(VISP_HAVE_AVX) +using Register = __m256d; +inline const int numLanes = 4; + +inline Register add(const Register a, const Register b) +{ + return _mm256_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm256_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm256_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm256_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm256_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm256_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm256_storeu_pd(data, a); +} + +#elif VISP_HAVE_SSE2 +using Register = __m128d; +inline const int numLanes = 2; + +inline Register add(const Register a, const Register b) +{ + return _mm_add_pd(a, b); +} + +inline Register sub(const Register a, const Register b) +{ + return _mm_sub_pd(a, b); +} + +inline Register mul(const Register a, const Register b) +{ + return _mm_mul_pd(a, b); +} + +inline Register fma(const Register a, const Register b, const Register c) +{ +#if defined(VISP_HAVE_FMA) + return _mm_fmadd_pd(a, b, c); +#else + return add(mul(a, b), c); +#endif +} + +inline Register loadu(const double *const data) +{ + return _mm_loadu_pd(data); +} + +inline Register set1(double v) +{ + return _mm_set1_pd(v); +} + +inline void storeu(double *data, const Register a) +{ + _mm_storeu_pd(data, a); +} + +#endif + +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // VP_SIMD_UTILS_H diff --git a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp index 2a69b7bfa7..384a3d6119 100644 --- a/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp +++ b/modules/tracker/rbt/src/features/vpRBDenseDepthTracker.cpp @@ -32,6 +32,8 @@ #include #include +#include "../core/private/vpSIMDUtils.h" + #ifdef VISP_HAVE_OPENMP #include #endif @@ -172,6 +174,17 @@ void vpRBDenseDepthTracker::extractFeatures(const vpRBFeatureTrackerInput &frame } } +void vpRBDenseDepthTracker::initVVS(const vpRBFeatureTrackerInput &/*frame*/, const vpRBFeatureTrackerInput &/*previousFrame*/, const vpHomogeneousMatrix &/*cMo*/) +{ + m_LTL.resize(6, 6, false, false); + m_LTR.resize(6, false); + m_L.resize(m_numFeatures, 6, false, false); + m_Lt.resize(6, m_numFeatures, false, false); + m_error.resize(m_numFeatures, false); + m_weighted_error.resize(m_numFeatures, false); + m_weights.resize(m_numFeatures, false); +} + void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, const vpHomogeneousMatrix &cMo, unsigned int /*iteration*/) { if (m_numFeatures == 0) { @@ -185,13 +198,15 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, return; } - m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_L); + m_depthPointSet.updateAndErrorAndInteractionMatrix(frame.cam, cMo, frame.depth, m_error, m_Lt); + m_Lt.transpose(m_L); m_robust.setMinMedianAbsoluteDeviation((frame.renders.zFar - frame.renders.zNear) * 0.01); m_robust.MEstimator(vpRobust::TUKEY, m_error, m_weights); - - +// #if defined(VISP_HAVE_OPENMP) +// #pragma omp parallel for +// #endif for (unsigned int i = 0; i < m_depthPoints.size(); ++i) { m_weighted_error[i] = m_error[i] * m_weights[i]; m_covWeightDiag[i] = m_weights[i] * m_weights[i]; @@ -199,12 +214,145 @@ void vpRBDenseDepthTracker::computeVVSIter(const vpRBFeatureTrackerInput &frame, m_L[i][dof] *= m_weights[i]; } } - - m_LTL = m_L.AtA(); + m_LTL.resize(6, 6); + m_L.AtA(m_LTL); computeJTR(m_L, m_weighted_error, m_LTR); m_vvsConverged = false; } +void errorAndInteractionMatrixBase(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt, unsigned int start, unsigned int end) +{ +#if defined(VISP_HAVE_OPENMP) +#pragma omp parallel for +#endif + for (unsigned int i = start; i < end; ++i) { + const double X = cXt[0][i], Y = cXt[1][i], Z = cXt[2][i]; + const double nX = cNt[0][i], nY = cNt[1][i], nZ = cNt[2][i]; + + const double D = -((nX * X) + (nY * Y) + (nZ * Z)); + const double projNormal = nX * obsT[0][i] + nY * obsT[1][i] + nZ * obsT[2][i]; + + e[i] = D + projNormal; + + Lt[0][i] = nX; + Lt[1][i] = nY; + Lt[2][i] = nZ; + Lt[3][i] = nZ * Y - nY * Z; + Lt[4][i] = nX * Z - nZ * X; + Lt[5][i] = nY * X - nX * Y; + } +} + +#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) +void errorAndInteractionMatrixSIMD(const vpMatrix &cXt, const vpMatrix &cNt, const vpMatrix &obsT, vpColVector &e, vpMatrix &Lt) +{ +#if defined(VISP_HAVE_OPENMP) +#pragma omp parallel for schedule(static, 1024) +#endif + for (int i = 0; i <= static_cast(cXt.getCols()) - vpSIMD::numLanes; i += vpSIMD::numLanes) { + const vpSIMD::Register X = vpSIMD::loadu(cXt[0] + i), Y = vpSIMD::loadu(cXt[1] + i), Z = vpSIMD::loadu(cXt[2] + i); + const vpSIMD::Register nX = vpSIMD::loadu(cNt[0] + i), nY = vpSIMD::loadu(cNt[1] + i), nZ = vpSIMD::loadu(cNt[2] + i); + const vpSIMD::Register obsX = vpSIMD::loadu(obsT[0] + i), obsY = vpSIMD::loadu(obsT[1] + i), obsZ = vpSIMD::loadu(obsT[2] + i); + + vpSIMD::Register D = vpSIMD::mul(nX, X); + vpSIMD::Register projNormal = vpSIMD::mul(obsX, X); + D = vpSIMD::fma(nY, Y, D); + D = vpSIMD::fma(nZ, Z, D); + + projNormal = vpSIMD::fma(nY, obsY, projNormal); + projNormal = vpSIMD::fma(nZ, obsZ, projNormal); + + vpSIMD::storeu(e.data + i, vpSIMD::sub(projNormal, D)); + + vpSIMD::storeu(Lt[0] + i, nX); + vpSIMD::storeu(Lt[1] + i, nY); + vpSIMD::storeu(Lt[2] + i, nZ); + + vpSIMD::storeu(Lt[3] + i, vpSIMD::sub(vpSIMD::mul(nZ, Y), vpSIMD::mul(nY, Z))); + vpSIMD::storeu(Lt[4] + i, vpSIMD::sub(vpSIMD::mul(nX, Z), vpSIMD::mul(nZ, X))); + vpSIMD::storeu(Lt[5] + i, vpSIMD::sub(vpSIMD::mul(nY, X), vpSIMD::mul(nX, Y))); + } + errorAndInteractionMatrixBase(cXt, cNt, obsT, e, Lt, (cXt.getCols() / vpSIMD::numLanes) * vpSIMD::numLanes, cXt.getCols()); +} +#endif + +void vpRBDenseDepthTracker::vpDepthPointSet::updateAndErrorAndInteractionMatrix( + const vpCameraParameters &cam, const vpHomogeneousMatrix &cMo, + const vpImage &depth, vpColVector &e, vpMatrix &Lt) +{ + const vpRotationMatrix cRo = cMo.getRotationMatrix(); + cMo.project(m_oXt, m_cXt, false); + cRo.rotateVectors(m_oNt, m_cNt, false); + m_invalid.clear(); + + const unsigned int numPoints = m_oXt.getCols(); + e.resize(numPoints, false); + Lt.resize(6, numPoints, false, false); +#if defined(VISP_HAVE_OPENMP) +#pragma omp parallel +#endif + { + std::vector localInvalid; +#if defined(VISP_HAVE_OPENMP) +#pragma omp for +#endif + + for (int i = 0; i < static_cast(numPoints); ++i) { + //Step 1: update and filter out points that are no longer valid + { + // Plane points away from the camera: this surface is no longer visible due to rotation + if (m_cNt[2][i] >= 0.0) { + localInvalid.push_back(i); + continue; + } + double x, y, u, v; + x = m_cXt[0][i] / m_cXt[2][i]; + y = m_cXt[1][i] / m_cXt[2][i]; + vpMeterPixelConversion::convertPointWithoutDistortion(cam, x, y, u, v); + // Point is no longer in image: depth value cannot be sampled + if (u < 0 || v < 0 || u >= depth.getWidth() || v >= depth.getHeight()) { + localInvalid.push_back(i); + continue; + } + const double Z = depth[static_cast(v)][static_cast(u)]; + // Z value in the depth image from the camera is invalid + if (Z <= 0.0) { + localInvalid.push_back(i); + continue; + } + m_observations[0][i] = x * Z; + m_observations[1][i] = y * Z; + m_observations[2][i] = Z; + } + } +#if defined(VISP_HAVE_OPENMP) +#pragma omp critical + { + m_invalid.insert(m_invalid.end(), localInvalid.begin(), localInvalid.end()); + } +#else + m_invalid = std::move(localInvalid); +#endif + } + +#if defined(VISP_HAVE_AVX2) || defined(VISP_HAVE_AVX) || defined(VISP_HAVE_SSE2) + errorAndInteractionMatrixSIMD(m_cXt, m_cNt, m_observations, e, Lt); +#else + errorAndInteractionMatrixBase(m_cXt, m_cNt, m_observations, e, Lt, 0, numPoints); +#endif + // Disable invalid points + for (unsigned int i : m_invalid) { + e[i] = 0.0; + + Lt[0][i] = 0; + Lt[1][i] = 0; + Lt[2][i] = 0; + Lt[3][i] = 0; + Lt[4][i] = 0; + Lt[5][i] = 0; + } +} + void vpRBDenseDepthTracker::display(const vpCameraParameters &/*cam*/, const vpImage &/*I*/, const vpImage &/*IRGB*/, const vpImage &depth) const {