diff --git a/CHANGELOG.md b/CHANGELOG.md index a112d7a3..4c9e131f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ - Added cmake-format +- The MatrixHandler::scaleAddI function was added to calculate A := cA + I + +- The MatrixHandler::scaleAddB function was added to calculate A := cA + B + ## Changes to Re::Solve in release 0.99.2 ### Major Features diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp index 4f0ebc50..4f445db6 100644 --- a/resolve/LinSolverDirectRocSolverRf.cpp +++ b/resolve/LinSolverDirectRocSolverRf.cpp @@ -1,5 +1,7 @@ #include "LinSolverDirectRocSolverRf.hpp" +#include + + #include #include diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp index 702d80ea..ff0e9055 100644 --- a/resolve/MemoryUtils.hpp +++ b/resolve/MemoryUtils.hpp @@ -38,7 +38,7 @@ namespace ReSolve * * @brief Provides basic memory allocation, free and copy functions. * - * This class provedes abstractions for memory management functiosn for + * This class provides abstractions for memory management functions for * different GPU programming models. * * @tparam Policy - Memory management policy (vendor specific) diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 1d570af8..86871509 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -351,6 +351,57 @@ namespace ReSolve } } + /** + * @brief Multiply csr matrix by a constant and add I.
+ * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - scalar parameter + * @param[in] memspace - Device where the operation is computed + * @return 0 if successful, 1 otherwise + */ + int MatrixHandler::scaleAddI(matrix::Csr* A, real_type alpha, memory::MemorySpace memspace) + { + assert(A->getSparseFormat() == matrix::Sparse::COMPRESSED_SPARSE_ROW && "Matrix is assumed to be in CSR format.\n"); + assert((A->getValues(memspace) != nullptr || A->getNnz() == 0) && "Matrix values are null!\n"); + assert(A->getNumRows() == A->getNumColumns() && "Matrix must be square.\n"); + using namespace ReSolve::memory; + switch (memspace) + { + case HOST: + return cpuImpl_->scaleAddI(A, alpha); + break; + case DEVICE: + return devImpl_->scaleAddI(A, alpha); + break; + } + return 1; + } + + /** + * @brief Multiply csr matrix by a constant and add B. + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - scalar parameter + * @param[in] B - Sparse CSR matrix + * @param[in] memspace - Device where the operation is computed + * @return 0 if successful, 1 otherwise + */ + int MatrixHandler::scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B, memory::MemorySpace memspace) + { + assert(A->getSparseFormat() == matrix::Sparse::COMPRESSED_SPARSE_ROW && "Matrix is assumed to be in CSR format.\n"); + assert(A->getValues(memspace) != nullptr && "Matrix values are null!\n"); + assert(A->getNumRows() == A->getNumColumns() && "Matrix must be square.\n"); + using namespace ReSolve::memory; + switch (memspace) + { + case HOST: + return cpuImpl_->scaleAddB(A, alpha, B); + break; + case DEVICE: + return devImpl_->scaleAddB(A, alpha, B); + break; + } + return 1; + } + /** * @brief If CUDA support is enabled in the handler. 
* diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index 6b6785ab..cd7fdcc8 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -62,6 +62,10 @@ namespace ReSolve void addConst(matrix::Sparse* A, real_type alpha, memory::MemorySpace memspace); + int scaleAddI(matrix::Csr* A, real_type alpha, memory::MemorySpace memspace); + + int scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B, memory::MemorySpace memspace); + /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped int matvec(matrix::Sparse* A, vector_type* vec_x, diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp index eecd48fb..f8b5331b 100644 --- a/resolve/matrix/MatrixHandlerCpu.cpp +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include namespace ReSolve { @@ -389,4 +391,368 @@ namespace ReSolve } return 0; } + + /** + * @brief Multiply all nonzero values of a sparse matrix by a constant + * + * @param[in,out] A - Sparse matrix + * @param[in] alpha - constant to the added + * @return 0 if successful, 1 otherwise + */ + static int scaleConst(matrix::Sparse* A, real_type alpha) + { + real_type* values = A->getValues(memory::HOST); + const index_type nnz = A->getNnz(); + for (index_type i = 0; i < nnz; ++i) + { + values[i] *= alpha; + } + return 0; + } + + /** + * @brief Update matrix with a new number of nonzero elements + * + * @param[in,out] A - Sparse matrix + * @param[in] row_data - pointer to row data (array of integers) + * @param[in] col_data - pointer to column data (array of integers) + * @param[in] val_data - pointer to value data (array of real numbers) + * @param[in] nnz - number of non-zer + * @return 0 if successful, 1 otherwise + */ + static int updateMatrix(matrix::Sparse* A, index_type* rowData, index_type* columnData, real_type* valData, index_type nnz) + { + if 
(A->destroyMatrixData(memory::HOST) != 0) + { + return 1; + } + return A->copyDataFrom(rowData, columnData, valData, nnz, memory::HOST, memory::HOST); + } + + /** + * @brief Add the identity to a CSR matrix. + * + * @param[in,out] A - Sparse CSR matrix + * @param[in] pattern - precalculated sparsity pattern + * @return 0 if successful, 1 otherwise + */ + static int addIWithPattern(matrix::Csr* A, ScaleAddBufferCpu* pattern) + { + std::vector newValues(pattern->getNnz()); + + index_type const* const originalRowPointers = A->getRowData(memory::HOST); + index_type const* const originalColumnIndices = A->getColData(memory::HOST); + real_type const* const originalValues = A->getValues(memory::HOST); + + index_type newNnzCount = 0; + for (index_type i = 0; i < A->getNumRows(); ++i) + { + const index_type originalRowStart = originalRowPointers[i]; + const index_type originalRowEnd = originalRowPointers[i + 1]; + + bool diagonalAdded = false; + for (index_type j = originalRowStart; j < originalRowEnd; ++j) + { + if (originalColumnIndices[j] == i) + { + // Diagonal element found in original matrix + newValues[newNnzCount] = originalValues[j] + 1.0; + newNnzCount++; + diagonalAdded = true; + } + else if (originalColumnIndices[j] > i && !diagonalAdded) + { + // Insert diagonal if not found yet + newValues[newNnzCount] = 1.; + newNnzCount++; + diagonalAdded = true; // Mark as added to prevent re-insertion + // Then add the current original element + newValues[newNnzCount] = originalValues[j]; + newNnzCount++; + } + else + { + // Elements before diagonal, elements after diagonal and the + // diagonal is already handled + newValues[newNnzCount] = originalValues[j]; + newNnzCount++; + } + } + + // If diagonal element was not present in original row + if (!diagonalAdded) + { + newValues[newNnzCount] = 1.; + newNnzCount++; + } + } + + assert(newNnzCount == pattern->getNnz()); + index_type info = updateMatrix(A, pattern->getRowData(), pattern->getColumnData(), newValues.data(), 
pattern->getNnz()); + return info; + } + + /** + * @brief Add a constant to the nonzero values of a csr matrix, + * then add the identity matrix. + * + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - constant to the added + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCpu::scaleAddI(matrix::Csr* A, real_type alpha) + { + index_type info = scaleConst(A, alpha); + if (info == 1) + { + return info; + } + + // Reuse sparsity pattern if it is available + if (workspace_->scaleAddISetup()) + { + ScaleAddBufferCpu* pattern = workspace_->getScaleAddIBuffer(); + return addIWithPattern(A, pattern); + } + + std::vector newRowPointers(A->getNumRows() + 1); + // At most we add one element per row/column + index_type maxNnzCount = A->getNnz() + A->getNumRows(); + std::vector newColumnIndices(maxNnzCount); + std::vector newValues(maxNnzCount); + + index_type const* const originalRowPointers = A->getRowData(memory::HOST); + index_type const* const originalColIndices = A->getColData(memory::HOST); + real_type const* const originalValues = A->getValues(memory::HOST); + + index_type newNnzCount = 0; + for (index_type i = 0; i < A->getNumRows(); ++i) + { + newRowPointers[i] = newNnzCount; + const index_type originalRowStart = originalRowPointers[i]; + const index_type originalRowEnd = originalRowPointers[i + 1]; + + bool diagonalAdded = false; + for (index_type j = originalRowStart; j < originalRowEnd; ++j) + { + if (originalColIndices[j] == i) + { + // Diagonal element found in original matrix + newValues[newNnzCount] = originalValues[j] + 1.0; + newColumnIndices[newNnzCount] = i; + newNnzCount++; + diagonalAdded = true; + } + else if (originalColIndices[j] > i && !diagonalAdded) + { + // Insert diagonal if not found yet + newValues[newNnzCount] = 1.; + newColumnIndices[newNnzCount] = i; + newNnzCount++; + diagonalAdded = true; // Mark as added to prevent re-insertion + // Then add the current original element + newValues[newNnzCount] = 
originalValues[j]; + newColumnIndices[newNnzCount] = originalColIndices[j]; + newNnzCount++; + } + else + { + // Elements before diagonal, elements after diagonal and the + // diagonal is already handled + newValues[newNnzCount] = originalValues[j]; + newColumnIndices[newNnzCount] = originalColIndices[j]; + newNnzCount++; + } + } + + // If diagonal element was not present in original row + if (!diagonalAdded) + { + newValues[newNnzCount] = 1.; + newColumnIndices[newNnzCount] = i; + newNnzCount++; + } + } + newRowPointers[A->getNumRows()] = newNnzCount; + assert(newNnzCount <= maxNnzCount); + newColumnIndices.resize(newNnzCount); + auto pattern = new ScaleAddBufferCpu(std::move(newRowPointers), std::move(newColumnIndices)); + // workspace_ owns pattern + workspace_->setScaleAddIBuffer(pattern); + updateMatrix(A, pattern->getRowData(), pattern->getColumnData(), newValues.data(), pattern->getNnz()); + return 0; + } + + /** + * @brief Add the identity to a CSR matrix. + * + * @param[in,out] A - Sparse CSR matrix + * @param[in,out] B - Sparse CSR matrix + * @param[in] pattern - precalculated sparsity pattern + * @return 0 if successful, 1 otherwise + */ + static int addBWithPattern(matrix::Csr* A, matrix::Csr* B, ScaleAddBufferCpu* pattern) + { + index_type const* const aRowPointers = A->getRowData(memory::HOST); + index_type const* const aColIndices = A->getColData(memory::HOST); + real_type const* const aValues = A->getValues(memory::HOST); + + index_type const* const bRowPointers = B->getRowData(memory::HOST); + index_type const* const bColIndices = B->getColData(memory::HOST); + real_type const* const bValues = B->getValues(memory::HOST); + + std::vector newValues; + newValues.reserve(pattern->getNnz()); + + for (index_type i = 0; i < A->getNumRows(); ++i) + { + const index_type startA = aRowPointers[i]; + const index_type endA = aRowPointers[i + 1]; + const index_type startB = bRowPointers[i]; + const index_type endB = bRowPointers[i + 1]; + + index_type ptrA = 
startA; + index_type ptrB = startB; + + // Merge the non-zero elements for the current row + while (ptrA < endA || ptrB < endB) + { + // Case 1: All remaining elements are in A + if (ptrB >= endB) + { + newValues.push_back(aValues[ptrA]); + ptrA++; + } + // Case 2: All remaining elements are in B + else if (ptrA >= endA) + { + newValues.push_back(bValues[ptrB]); + ptrB++; + } + // Case 3: Elements in both A and B. Compare column indices. + else + { + if (aColIndices[ptrA] < bColIndices[ptrB]) + { + newValues.push_back(aValues[ptrA]); + ptrA++; + } + else if (bColIndices[ptrB] < aColIndices[ptrA]) + { + newValues.push_back(bValues[ptrB]); + ptrB++; + } + // Case 4: Same column index, add values + else + { + real_type sum = aValues[ptrA] + bValues[ptrB]; + newValues.push_back(sum); + ptrA++; + ptrB++; + } + } + } + } + assert(static_cast(newValues.size()) == pattern->getNnz()); + return updateMatrix(A, pattern->getRowData(), pattern->getColumnData(), newValues.data(), pattern->getNnz()); + } + + /** + * @brief Multiply csr matrix by a constant and add B. 
+ * + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - constant to the added + * @param[in] B - Sparse CSR matrix + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCpu::scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B) + { + index_type info = scaleConst(A, alpha); + if (info == 1) + { + return info; + } + + // Reuse sparsity pattern if it is available + if (workspace_->scaleAddBSetup()) + { + ScaleAddBufferCpu* pattern = workspace_->getScaleAddBBuffer(); + return addBWithPattern(A, B, pattern); + } + + index_type const* const aRowPointers = A->getRowData(memory::HOST); + index_type const* const aColIndices = A->getColData(memory::HOST); + real_type const* const aValues = A->getValues(memory::HOST); + + index_type const* const bRowPointers = B->getRowData(memory::HOST); + index_type const* const bColIndices = B->getColData(memory::HOST); + real_type const* const bValues = B->getValues(memory::HOST); + + std::vector newRowPointers; + newRowPointers.reserve(A->getNumRows() + 1); + std::vector newColIndices; + std::vector newValues; + + newRowPointers.push_back(0); // First element is always 0 + for (index_type i = 0; i < A->getNumRows(); ++i) + { + const index_type startA = aRowPointers[i]; + const index_type endA = aRowPointers[i + 1]; + const index_type startB = bRowPointers[i]; + const index_type endB = bRowPointers[i + 1]; + + index_type ptrA = startA; + index_type ptrB = startB; + + // Merge the non-zero elements for the current row + while (ptrA < endA || ptrB < endB) + { + // Case 1: All remaining elements are in A + if (ptrB >= endB) + { + newColIndices.push_back(aColIndices[ptrA]); + newValues.push_back(aValues[ptrA]); + ptrA++; + } + // Case 2: All remaining elements are in B + else if (ptrA >= endA) + { + newColIndices.push_back(bColIndices[ptrB]); + newValues.push_back(bValues[ptrB]); + ptrB++; + } + // Case 3: Elements in both A and B. Compare column indices. 
+ else + { + if (aColIndices[ptrA] < bColIndices[ptrB]) + { + newColIndices.push_back(aColIndices[ptrA]); + newValues.push_back(aValues[ptrA]); + ptrA++; + } + else if (bColIndices[ptrB] < aColIndices[ptrA]) + { + newColIndices.push_back(bColIndices[ptrB]); + newValues.push_back(bValues[ptrB]); + ptrB++; + } + // Case 4: Same column index, add values + else + { + real_type sum = aValues[ptrA] + bValues[ptrB]; + newColIndices.push_back(aColIndices[ptrA]); + newValues.push_back(sum); + ptrA++; + ptrB++; + } + } + } + newRowPointers.push_back(static_cast(newValues.size())); + } + + auto pattern = new ScaleAddBufferCpu(std::move(newRowPointers), std::move(newColIndices)); + // workspace_ owns pattern + workspace_->setScaleAddBBuffer(pattern); + return updateMatrix(A, pattern->getRowData(), pattern->getColumnData(), newValues.data(), pattern->getNnz()); + } } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp index 66b81078..6fcb8fc3 100644 --- a/resolve/matrix/MatrixHandlerCpu.hpp +++ b/resolve/matrix/MatrixHandlerCpu.hpp @@ -46,6 +46,10 @@ namespace ReSolve int addConst(matrix::Sparse* A, real_type alpha) override; + int scaleAddI(matrix::Csr* A, real_type alpha) override; + + int scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B) override; + virtual int matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index 7b310843..25f013c6 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -1,6 +1,7 @@ #include "MatrixHandlerCuda.hpp" #include +#include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include namespace ReSolve { @@ -400,4 +402,215 @@ namespace ReSolve cuda::addConst(nnz, alpha, values); return 0; } + + /** + * @brief Calculate buffer size and sparsity pattern of alpha*A + beta*B + * + * @param[in] A - CSR matrix + * @param[in] 
alpha - constant to be added to A + * @param[in] B - CSR matrix + * @param[in] beta - constant to be added to B + * @param[in, out] pattern - sparsity pattern and buffer + * + * @return int error code, 0 if successful + */ + int MatrixHandlerCuda::allocateForSum(matrix::Csr* A, real_type alpha, matrix::Csr* B, real_type beta, ScaleAddBufferCUDA** pattern) + { + auto handle = workspace_->getCusparseHandle(); + cusparseStatus_t cu_info = cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST); + int info = (cu_info != CUSPARSE_STATUS_SUCCESS); + workspace_->setCusparseHandle(handle); + + auto a_v = A->getValues(memory::DEVICE); + auto a_i = A->getRowData(memory::DEVICE); + auto a_j = A->getColData(memory::DEVICE); + + auto b_v = B->getValues(memory::DEVICE); + auto b_i = B->getRowData(memory::DEVICE); + auto b_j = B->getColData(memory::DEVICE); + + index_type m = A->getNumRows(); + assert(m == B->getNumRows()); + index_type n = A->getNumColumns(); + assert(n == B->getNumColumns()); + + assert(m == n); + + index_type nnz_a = A->getNnz(); + index_type nnz_b = B->getNnz(); + + real_type* c_v = nullptr; + index_type* c_i = nullptr; + index_type* c_j = nullptr; + + *pattern = new ScaleAddBufferCUDA(n + 1); + cusparseMatDescr_t descr_a = (*pattern)->getMatrixDescriptor(); + size_t buffer_byte_size_add; + // calculates sum buffer + cu_info = cusparseDcsrgeam2_bufferSizeExt(handle, m, n, &alpha, descr_a, nnz_a, a_v, a_i, a_j, &beta, descr_a, nnz_b, b_v, b_i, b_j, descr_a, c_v, c_i, c_j, &buffer_byte_size_add); + info = info || (cu_info != CUSPARSE_STATUS_SUCCESS); + + (*pattern)->allocateBuffer(buffer_byte_size_add); + + index_type nnz_total; + // determines sum row offsets and total number of nonzeros + cu_info = cusparseXcsrgeam2Nnz(handle, m, n, descr_a, nnz_a, a_i, a_j, descr_a, nnz_b, b_i, b_j, descr_a, (*pattern)->getRowData(), &nnz_total, (*pattern)->getBuffer()); + info = info || (cu_info != CUSPARSE_STATUS_SUCCESS); + (*pattern)->setNnz(nnz_total); + return info; 
+ } + + /** + * @brief Given sparsity pattern, calculate alpha*A + beta*B + * + * @param[in] A - CSR matrix + * @param[in] alpha - constant to be added to A + * @param[in] B - CSR matrix + * @param[in] beta - constant to be added to B + * @param[in, out] pattern - sparsity pattern and buffer + * + * @return int error code, 0 if successful + */ + int MatrixHandlerCuda::computeSum(matrix::Csr* A, real_type alpha, matrix::Csr* B, real_type beta, matrix::Csr* C, ScaleAddBufferCUDA* pattern) + { + auto a_v = A->getValues(memory::DEVICE); + auto a_i = A->getRowData(memory::DEVICE); + auto a_j = A->getColData(memory::DEVICE); + + auto b_v = B->getValues(memory::DEVICE); + auto b_i = B->getRowData(memory::DEVICE); + auto b_j = B->getColData(memory::DEVICE); + + auto c_v = C->getValues(memory::DEVICE); + auto c_i = C->getRowData(memory::DEVICE); + auto c_j = C->getColData(memory::DEVICE); + + auto handle = workspace_->getCusparseHandle(); + + index_type m = A->getNumRows(); + assert(m == B->getNumRows()); + assert(m == C->getNumRows()); + + index_type n = A->getNumColumns(); + assert(n == B->getNumColumns()); + assert(n == C->getNumColumns()); + + assert(m == n); + + index_type nnz_a = A->getNnz(); + index_type nnz_b = B->getNnz(); + + int info = mem_.copyArrayDeviceToDevice(c_i, pattern->getRowData(), n + 1); + cusparseMatDescr_t descr_a = pattern->getMatrixDescriptor(); + cusparseStatus_t cu_info = cusparseDcsrgeam2(handle, m, n, &alpha, descr_a, nnz_a, a_v, a_i, a_j, &beta, descr_a, nnz_b, b_v, b_i, b_j, descr_a, c_v, c_i, c_j, pattern->getBuffer()); + info = info || (cu_info != CUSPARSE_STATUS_SUCCESS); + info = info || C->setUpdated(memory::DEVICE); + return info; + } + + /** + * @brief Update matrix with a new number of nonzero elements + * + * @param[in,out] A - Sparse matrix + * @param[in] row_data - pointer to row data (array of integers) + * @param[in] col_data - pointer to column data (array of integers) + * @param[in] val_data - pointer to value data (array of 
real numbers) + * @param[in] nnz - number of non-zer + * @return 0 if successful, 1 otherwise + */ + static int updateMatrix(matrix::Sparse* A, index_type* rowData, index_type* columnData, real_type* valData, index_type nnz) + { + if (A->destroyMatrixData(memory::DEVICE) != 0) + { + return 1; + } + if (A->destroyMatrixData(memory::HOST) != 0) + { + return 1; + } + return A->copyDataFrom(rowData, columnData, valData, nnz, memory::DEVICE, memory::DEVICE); + } + + /** + * @brief Add a constant to the nonzero values of a csr matrix, + * then add the identity matrix. + * + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - constant to the added + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCuda::scaleAddI(matrix::Csr* A, real_type alpha) + { + index_type n = A->getNumRows(); + + std::vector I_i(n + 1); + std::iota(I_i.begin(), I_i.end(), 0); + + std::vector I_j(n); + std::iota(I_j.begin(), I_j.end(), 0); + + std::vector I_v(n, 1.0); + + matrix::Csr I(A->getNumRows(), A->getNumColumns(), n); + int info = I.copyDataFrom(I_i.data(), I_j.data(), I_v.data(), n, memory::HOST, memory::DEVICE); + + // Reuse sparsity pattern if it is available + if (workspace_->scaleAddISetup()) + { + ScaleAddBufferCUDA* pattern = workspace_->getScaleAddIBuffer(); + matrix::Csr C(A->getNumRows(), A->getNumColumns(), pattern->getNnz()); + info = info || C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, &I, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + else + { + ScaleAddBufferCUDA* pattern = nullptr; + matrix::Csr C(A->getNumRows(), A->getNumColumns(), A->getNnz()); + info = info || allocateForSum(A, alpha, &I, 1., &pattern); + workspace_->setScaleAddIBuffer(pattern); + workspace_->scaleAddISetupDone(); + C.setNnz(pattern->getNnz()); + C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, &I, 1., &C, 
pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + return info; + } + + /** + * @brief Multiply csr matrix by a constant and add B. + * + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - constant to the added + * @param[in] B - Sparse CSR matrix + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerCuda::scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B) + { + int info = 0; + // Reuse sparsity pattern if it is available + if (workspace_->scaleAddBSetup()) + { + ScaleAddBufferCUDA* pattern = workspace_->getScaleAddBBuffer(); + matrix::Csr C(A->getNumRows(), A->getNumColumns(), pattern->getNnz()); + info = info || C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, B, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + else + { + ScaleAddBufferCUDA* pattern = nullptr; + matrix::Csr C(A->getNumRows(), A->getNumColumns(), A->getNnz()); + info = info || allocateForSum(A, alpha, B, 1., &pattern); + workspace_->setScaleAddBBuffer(pattern); + workspace_->scaleAddBSetupDone(); + C.setNnz(pattern->getNnz()); + C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, B, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + return info; + } + } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp index f0efbcee..7441ba1e 100644 --- a/resolve/matrix/MatrixHandlerCuda.hpp +++ b/resolve/matrix/MatrixHandlerCuda.hpp @@ -1,10 +1,16 @@ #pragma once + +#include + #include #include #include namespace ReSolve { + + class ScaleAddBufferCUDA; + namespace vector { class Vector; @@ -45,6 +51,10 @@ namespace ReSolve int rightScale(matrix::Csr* A, 
vector_type* diag) override; + int scaleAddI(matrix::Csr* A, real_type alpha) override; + + int scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B) override; + virtual int matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, @@ -55,6 +65,8 @@ namespace ReSolve void setValuesChanged(bool isValuesChanged) override; private: + int allocateForSum(matrix::Csr* A, real_type alpha, matrix::Csr* B, real_type beta, ScaleAddBufferCUDA** pattern); + int computeSum(matrix::Csr* A, real_type alpha, matrix::Csr* B, real_type beta, matrix::Csr* C, ScaleAddBufferCUDA* pattern); LinAlgWorkspaceCUDA* workspace_{nullptr}; bool values_changed_{true}; ///< needed for matvec diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp index 17a31bc3..20a28e3a 100644 --- a/resolve/matrix/MatrixHandlerHip.cpp +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -1,6 +1,8 @@ #include "MatrixHandlerHip.hpp" #include +#include +#include #include #include @@ -11,6 +13,7 @@ #include #include #include +#include namespace ReSolve { @@ -369,4 +372,200 @@ namespace ReSolve return 0; } + /** + * @brief Calculate buffer size and sparsity pattern of alpha*A + beta*B + * + * @param[in] A - CSR matrix + * @param[in] B - CSR matrix + * @param[in, out] pattern - sparsity pattern and buffer + * + * @return int error code, 0 if successful + */ + int MatrixHandlerHip::allocateForSum(matrix::Csr* A, matrix::Csr* B, ScaleAddBufferHIP** pattern) + { + auto handle = workspace_->getRocsparseHandle(); + rocsparse_status roc_info = rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host); + int info = (roc_info != rocsparse_status_success); + workspace_->setRocsparseHandle(handle); + + auto a_i = A->getRowData(memory::DEVICE); + auto a_j = A->getColData(memory::DEVICE); + + auto b_i = B->getRowData(memory::DEVICE); + auto b_j = B->getColData(memory::DEVICE); + + index_type m = A->getNumRows(); + assert(m == B->getNumRows()); + index_type n = A->getNumColumns(); + 
assert(n == B->getNumColumns()); + + assert(m == n); + + index_type nnz_a = A->getNnz(); + index_type nnz_b = B->getNnz(); + + *pattern = new ScaleAddBufferHIP(n + 1); + rocsparse_mat_descr descr_a = (*pattern)->getMatrixDescriptor(); + index_type nnz_total; + // determines sum row offsets and total number of nonzeros + roc_info = rocsparse_bsrgeam_nnzb(handle, rocsparse_direction_row, m, n, 1, descr_a, nnz_a, a_i, a_j, descr_a, nnz_b, b_i, b_j, descr_a, (*pattern)->getRowData(), &nnz_total); + info = info || (roc_info != rocsparse_status_success); + (*pattern)->setNnz(nnz_total); + return info; + } + + /** + * @brief Given sparsity pattern, calculate alpha*A + beta*B + * + * @param[in] A - CSR matrix + * @param[in] alpha - constant to be added to A + * @param[in] B - CSR matrix + * @param[in] beta - constant to be added to B + * @param[in, out] pattern - sparsity pattern and buffer + * + * @return int error code, 0 if successful + */ + int MatrixHandlerHip::computeSum(matrix::Csr* A, real_type alpha, matrix::Csr* B, real_type beta, matrix::Csr* C, ScaleAddBufferHIP* pattern) + { + auto a_v = A->getValues(memory::DEVICE); + auto a_i = A->getRowData(memory::DEVICE); + auto a_j = A->getColData(memory::DEVICE); + + auto b_v = B->getValues(memory::DEVICE); + auto b_i = B->getRowData(memory::DEVICE); + auto b_j = B->getColData(memory::DEVICE); + + auto c_v = C->getValues(memory::DEVICE); + auto c_i = C->getRowData(memory::DEVICE); + auto c_j = C->getColData(memory::DEVICE); + + auto handle = workspace_->getRocsparseHandle(); + + index_type m = A->getNumRows(); + assert(m == B->getNumRows()); + assert(m == C->getNumRows()); + + index_type n = A->getNumColumns(); + assert(n == B->getNumColumns()); + assert(n == C->getNumColumns()); + + assert(m == n); + + index_type nnz_a = A->getNnz(); + index_type nnz_b = B->getNnz(); + + int info = mem_.copyArrayDeviceToDevice(c_i, pattern->getRowData(), n + 1); + rocsparse_mat_descr descr_a = pattern->getMatrixDescriptor(); + 
rocsparse_status roc_info = rocsparse_dbsrgeam(handle, rocsparse_direction_row, m, n, 1, &alpha, descr_a, nnz_a, a_v, a_i, a_j, &beta, descr_a, nnz_b, b_v, b_i, b_j, descr_a, c_v, c_i, c_j); + info = info || (roc_info != rocsparse_status_success); + info = info || C->setUpdated(memory::DEVICE); + return info; + } + + /** + * @brief Update matrix with a new number of nonzero elements + * + * @param[in,out] A - Sparse matrix + * @param[in] row_data - pointer to row data (array of integers) + * @param[in] col_data - pointer to column data (array of integers) + * @param[in] val_data - pointer to value data (array of real numbers) + * @param[in] nnz - number of non-zer + * @return 0 if successful, 1 otherwise + */ + static int updateMatrix(matrix::Sparse* A, index_type* rowData, index_type* columnData, real_type* valData, index_type nnz) + { + if (A->destroyMatrixData(memory::DEVICE) != 0) + { + return 1; + } + if (A->destroyMatrixData(memory::HOST) != 0) + { + return 1; + } + return A->copyDataFrom(rowData, columnData, valData, nnz, memory::DEVICE, memory::DEVICE); + } + + /** + * @brief Add a constant to the nonzero values of a csr matrix, + * then add the identity matrix. 
+ * + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - constant to the added + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerHip::scaleAddI(matrix::Csr* A, real_type alpha) + { + index_type n = A->getNumRows(); + + std::vector I_i(n + 1); + std::iota(I_i.begin(), I_i.end(), 0); + + std::vector I_j(n); + std::iota(I_j.begin(), I_j.end(), 0); + + std::vector I_v(n, 1.0); + + matrix::Csr I(A->getNumRows(), A->getNumColumns(), n); + int info = I.copyDataFrom(I_i.data(), I_j.data(), I_v.data(), n, memory::HOST, memory::DEVICE); + + // Reuse sparsity pattern if it is available + if (workspace_->scaleAddISetup()) + { + ScaleAddBufferHIP* pattern = workspace_->getScaleAddIBuffer(); + matrix::Csr C(A->getNumRows(), A->getNumColumns(), pattern->getNnz()); + info = info || C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, &I, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + else + { + ScaleAddBufferHIP* pattern = nullptr; + matrix::Csr C(A->getNumRows(), A->getNumColumns(), A->getNnz()); + info = info || allocateForSum(A, &I, &pattern); + + workspace_->setScaleAddIBuffer(pattern); + workspace_->scaleAddISetupDone(); + C.setNnz(pattern->getNnz()); + C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, &I, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + return info; + } + + /** + * @brief Multiply csr matrix by a constant and add B. 
+ * + * @param[in,out] A - Sparse CSR matrix + * @param[in] alpha - constant to the added + * @param[in] B - Sparse CSR matrix + * @return 0 if successful, 1 otherwise + */ + int MatrixHandlerHip::scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B) + { + int info = 0; + // Reuse sparsity pattern if it is available + if (workspace_->scaleAddBSetup()) + { + ScaleAddBufferHIP* pattern = workspace_->getScaleAddBBuffer(); + matrix::Csr C(A->getNumRows(), A->getNumColumns(), pattern->getNnz()); + info = info || C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, B, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + else + { + ScaleAddBufferHIP* pattern = nullptr; + matrix::Csr C(A->getNumRows(), A->getNumColumns(), A->getNnz()); + info = info || allocateForSum(A, B, &pattern); + workspace_->setScaleAddBBuffer(pattern); + workspace_->scaleAddBSetupDone(); + C.setNnz(pattern->getNnz()); + C.allocateMatrixData(memory::DEVICE); + info = info || computeSum(A, alpha, B, 1., &C, pattern); + info = info || updateMatrix(A, C.getRowData(memory::DEVICE), C.getColData(memory::DEVICE), C.getValues(memory::DEVICE), C.getNnz()); + } + return info; + } + } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp index 7e69c3f1..4f7c092b 100644 --- a/resolve/matrix/MatrixHandlerHip.hpp +++ b/resolve/matrix/MatrixHandlerHip.hpp @@ -17,6 +17,7 @@ namespace ReSolve class Csc; class Csr; } // namespace matrix + class ScaleAddBufferHIP; class LinAlgWorkspaceHIP; } // namespace ReSolve @@ -43,7 +44,12 @@ namespace ReSolve int rightScale(matrix::Csr* A, vector_type* diag) override; - int addConst(matrix::Sparse* A, real_type alpha) override; + int addConst(matrix::Sparse* A, real_type alpha) override; + + int scaleAddI(matrix::Csr* A, real_type alpha) override; + + int scaleAddB(matrix::Csr* A, real_type 
alpha, matrix::Csr* B) override; + virtual int matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, @@ -55,6 +61,9 @@ namespace ReSolve void setValuesChanged(bool isValuesChanged) override; private: + int allocateForSum(matrix::Csr* A, matrix::Csr* B, ScaleAddBufferHIP** pattern); + int computeSum(matrix::Csr* A, real_type alpha, matrix::Csr* B, real_type beta, matrix::Csr* C, ScaleAddBufferHIP* pattern); + LinAlgWorkspaceHIP* workspace_{nullptr}; bool values_changed_{true}; ///< needed for matvec diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp index d874dff1..80fbb631 100644 --- a/resolve/matrix/MatrixHandlerImpl.hpp +++ b/resolve/matrix/MatrixHandlerImpl.hpp @@ -46,6 +46,10 @@ namespace ReSolve virtual int addConst(matrix::Sparse* A, real_type alpha) = 0; + virtual int scaleAddI(matrix::Csr* A, real_type alpha) = 0; + + virtual int scaleAddB(matrix::Csr* A, real_type alpha, matrix::Csr* B) = 0; + virtual int matvec(matrix::Sparse* A, vector_type* vec_x, vector_type* vec_result, diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt index d9849e6c..a0760396 100644 --- a/resolve/workspace/CMakeLists.txt +++ b/resolve/workspace/CMakeLists.txt @@ -7,15 +7,22 @@ ]] # C++ code -set(ReSolve_Workspace_SRC LinAlgWorkspaceCpu.cpp) +set(ReSolve_Workspace_SRC ScaleAddBufferCpu.cpp LinAlgWorkspaceCpu.cpp) # C++ code that depends on CUDA SDK libraries -set(ReSolve_Workspace_CUDASDK_SRC LinAlgWorkspaceCUDA.cpp) +set(ReSolve_Workspace_CUDASDK_SRC ScaleAddBufferCUDA.cpp + LinAlgWorkspaceCUDA.cpp +) -set(ReSolve_Workspace_ROCM_SRC LinAlgWorkspaceHIP.cpp) +set(ReSolve_Workspace_ROCM_SRC ScaleAddBufferHIP.cpp LinAlgWorkspaceHIP.cpp) set(ReSolve_Workspace_HEADER_INSTALL - LinAlgWorkspace.hpp LinAlgWorkspaceCpu.hpp LinAlgWorkspaceCUDA.hpp + ScaleAddBufferCpu.hpp + ScaleAddBufferCUDA.hpp + ScaleAddBufferHIP.hpp + LinAlgWorkspace.hpp + LinAlgWorkspaceCpu.hpp + LinAlgWorkspaceCUDA.hpp 
LinAlgWorkspaceHIP.hpp ) diff --git a/resolve/workspace/LinAlgWorkspaceCUDA.cpp b/resolve/workspace/LinAlgWorkspaceCUDA.cpp index fdb19c30..57987666 100644 --- a/resolve/workspace/LinAlgWorkspaceCUDA.cpp +++ b/resolve/workspace/LinAlgWorkspaceCUDA.cpp @@ -1,4 +1,5 @@ #include +#include namespace ReSolve { @@ -8,6 +9,8 @@ namespace ReSolve handle_cusparse_ = nullptr; handle_cublas_ = nullptr; buffer_spmv_ = nullptr; + buffer_scale_add_i = nullptr; + buffer_scale_add_b = nullptr; buffer_1norm_ = nullptr; transpose_workspace_ = nullptr; transpose_workspace_ready_ = false; @@ -21,6 +24,10 @@ namespace ReSolve { if (buffer_spmv_ != nullptr) mem_.deleteOnDevice(buffer_spmv_); + if (buffer_scale_add_i != nullptr) + delete buffer_scale_add_i; + if (buffer_scale_add_b != nullptr) + delete buffer_scale_add_b; if (d_r_size_ != 0) mem_.deleteOnDevice(d_r_); if (norm_buffer_ready_) @@ -70,6 +77,18 @@ namespace ReSolve transpose_workspace_ = nullptr; transpose_workspace_ready_ = false; } + if (scale_add_b_setup_done_) + { + delete buffer_scale_add_b; + buffer_scale_add_b = nullptr; + scale_add_b_setup_done_ = false; + } + if (scale_add_i_setup_done_) + { + delete buffer_scale_add_i; + buffer_scale_add_i = nullptr; + scale_add_i_setup_done_ = false; + } return; } @@ -83,6 +102,16 @@ namespace ReSolve return buffer_1norm_; } + ScaleAddBufferCUDA* LinAlgWorkspaceCUDA::getScaleAddIBuffer() + { + return buffer_scale_add_i; + } + + ScaleAddBufferCUDA* LinAlgWorkspaceCUDA::getScaleAddBBuffer() + { + return buffer_scale_add_b; + } + void* LinAlgWorkspaceCUDA::getTransposeBufferWorkspace() { return transpose_workspace_; @@ -114,6 +143,26 @@ namespace ReSolve buffer_spmv_ = buffer; } + void LinAlgWorkspaceCUDA::setScaleAddBBuffer(ScaleAddBufferCUDA* buffer) + { + buffer_scale_add_b = buffer; + } + + void LinAlgWorkspaceCUDA::setScaleAddIBuffer(ScaleAddBufferCUDA* buffer) + { + buffer_scale_add_i = buffer; + } + + void LinAlgWorkspaceCUDA::scaleAddBSetupDone() + { + scale_add_b_setup_done_ 
= true; + } + + void LinAlgWorkspaceCUDA::scaleAddISetupDone() + { + scale_add_i_setup_done_ = true; + } + void LinAlgWorkspaceCUDA::setNormBuffer(void* buffer) { buffer_1norm_ = buffer; @@ -204,6 +253,16 @@ namespace ReSolve matvec_setup_done_ = true; } + bool LinAlgWorkspaceCUDA::scaleAddISetup() + { + return scale_add_i_setup_done_; + } + + bool LinAlgWorkspaceCUDA::scaleAddBSetup() + { + return scale_add_b_setup_done_; + } + void LinAlgWorkspaceCUDA::initializeHandles() { cusparseCreate(&handle_cusparse_); diff --git a/resolve/workspace/LinAlgWorkspaceCUDA.hpp b/resolve/workspace/LinAlgWorkspaceCUDA.hpp index ac996b9b..01b30f6e 100644 --- a/resolve/workspace/LinAlgWorkspaceCUDA.hpp +++ b/resolve/workspace/LinAlgWorkspaceCUDA.hpp @@ -8,6 +8,9 @@ namespace ReSolve { + + class ScaleAddBufferCUDA; + class LinAlgWorkspaceCUDA { public: @@ -17,13 +20,19 @@ namespace ReSolve void resetLinAlgWorkspace(); // accessors - void* getSpmvBuffer(); - void* getNormBuffer(); - void* getTransposeBufferWorkspace(); - void setTransposeBufferWorkspace(size_t bufferSize); - bool isTransposeBufferAllocated(); - void setSpmvBuffer(void* buffer); - void setNormBuffer(void* buffer); + void* getSpmvBuffer(); + void* getNormBuffer(); + ScaleAddBufferCUDA* getScaleAddIBuffer(); + ScaleAddBufferCUDA* getScaleAddBBuffer(); + void* getTransposeBufferWorkspace(); + void setTransposeBufferWorkspace(size_t bufferSize); + bool isTransposeBufferAllocated(); + void setSpmvBuffer(void* buffer); + void setNormBuffer(void* buffer); + void setScaleAddIBuffer(ScaleAddBufferCUDA* buffer); + void setScaleAddBBuffer(ScaleAddBufferCUDA* buffer); + void scaleAddISetupDone(); + void scaleAddBSetupDone(); cublasHandle_t getCublasHandle(); cusolverSpHandle_t getCusolverSpHandle(); // needed for 1-norms etc @@ -48,6 +57,9 @@ namespace ReSolve bool matvecSetup(); void matvecSetupDone(); + bool scaleAddISetup(); + bool scaleAddBSetup(); + private: // handles cublasHandle_t handle_cublas_; @@ -62,10 +74,14 @@ 
namespace ReSolve cusparseDnVecDescr_t vec_y_; // buffers - void* buffer_spmv_{nullptr}; - void* buffer_1norm_{nullptr}; + void* buffer_spmv_{nullptr}; + void* buffer_1norm_{nullptr}; + ScaleAddBufferCUDA* buffer_scale_add_i{nullptr}; + ScaleAddBufferCUDA* buffer_scale_add_b{nullptr}; bool matvec_setup_done_{false}; // check if setup is done for matvec i.e. if buffer is allocated, csr structure is set etc. + bool scale_add_i_setup_done_{false}; + bool scale_add_b_setup_done_{false}; void* transpose_workspace_{nullptr}; // needed for transpose bool transpose_workspace_ready_{false}; // to track if allocated diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp index 36f9d6cf..e632afa9 100644 --- a/resolve/workspace/LinAlgWorkspaceCpu.cpp +++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp @@ -1,7 +1,11 @@ #include "LinAlgWorkspaceCpu.hpp" +#include +#include #include +#include + namespace ReSolve { LinAlgWorkspaceCpu::LinAlgWorkspaceCpu() @@ -10,6 +14,8 @@ namespace ReSolve LinAlgWorkspaceCpu::~LinAlgWorkspaceCpu() { + delete scaleAddIBuffer_; + delete scaleAddBBuffer_; } void LinAlgWorkspaceCpu::initializeHandles() @@ -18,7 +24,59 @@ namespace ReSolve void LinAlgWorkspaceCpu::resetLinAlgWorkspace() { - // No resources to reset in CPU workspace - return; + delete scaleAddIBuffer_; + scaleAddIBuffer_ = nullptr; + scaleAddISetupDone_ = false; + delete scaleAddBBuffer_; + scaleAddBBuffer_ = nullptr; + scaleAddBSetupDone_ = false; + } + + bool LinAlgWorkspaceCpu::scaleAddISetup() + { + return scaleAddISetupDone_; + } + + void LinAlgWorkspaceCpu::scaleAddISetupDone() + { + scaleAddISetupDone_ = true; + } + + ScaleAddBufferCpu* LinAlgWorkspaceCpu::getScaleAddIBuffer() + { + assert(scaleAddIBuffer_ != nullptr); + return scaleAddIBuffer_; + } + + void LinAlgWorkspaceCpu::setScaleAddIBuffer(ScaleAddBufferCpu* buffer) + { + assert(buffer != nullptr); + assert(scaleAddIBuffer_ == nullptr); + scaleAddIBuffer_ = buffer; + 
scaleAddISetupDone(); } + + bool LinAlgWorkspaceCpu::scaleAddBSetup() + { + return scaleAddBSetupDone_; + } + + void LinAlgWorkspaceCpu::scaleAddBSetupDone() + { + scaleAddBSetupDone_ = true; + } + + ScaleAddBufferCpu* LinAlgWorkspaceCpu::getScaleAddBBuffer() + { + assert(scaleAddBBuffer_ != nullptr); + return scaleAddBBuffer_; + } + + void LinAlgWorkspaceCpu::setScaleAddBBuffer(ScaleAddBufferCpu* buffer) + { + assert(scaleAddBBuffer_ == nullptr); + scaleAddBBuffer_ = buffer; + scaleAddBSetupDone(); + } + } // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp index 308c2497..7c0b758c 100644 --- a/resolve/workspace/LinAlgWorkspaceCpu.hpp +++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp @@ -1,14 +1,34 @@ #pragma once +#include + namespace ReSolve { + class ScaleAddBufferCpu; + class LinAlgWorkspaceCpu { public: LinAlgWorkspaceCpu(); ~LinAlgWorkspaceCpu(); - void initializeHandles(); - void resetLinAlgWorkspace(); + void initializeHandles(); + void resetLinAlgWorkspace(); + bool scaleAddISetup(); + void scaleAddISetupDone(); + ScaleAddBufferCpu* getScaleAddIBuffer(); + void setScaleAddIBuffer(ScaleAddBufferCpu* buffer); + bool scaleAddBSetup(); + void scaleAddBSetupDone(); + ScaleAddBufferCpu* getScaleAddBBuffer(); + void setScaleAddBBuffer(ScaleAddBufferCpu* buffer); + + private: + // check if setup is done for scaleAddI i.e. if buffer is allocated, csr structure is set etc. + bool scaleAddISetupDone_{false}; + ScaleAddBufferCpu* scaleAddIBuffer_{nullptr}; + // check if setup is done for scaleAddB i.e. if buffer is allocated, csr structure is set etc. 
+ bool scaleAddBSetupDone_{false}; + ScaleAddBufferCpu* scaleAddBBuffer_{nullptr}; }; } // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceHIP.cpp b/resolve/workspace/LinAlgWorkspaceHIP.cpp index fc13e689..1504a42c 100644 --- a/resolve/workspace/LinAlgWorkspaceHIP.cpp +++ b/resolve/workspace/LinAlgWorkspaceHIP.cpp @@ -1,12 +1,16 @@ +#include + #include +#include namespace ReSolve { LinAlgWorkspaceHIP::LinAlgWorkspaceHIP() { - handle_rocsparse_ = nullptr; - handle_rocblas_ = nullptr; - + handle_rocsparse_ = nullptr; + handle_rocblas_ = nullptr; + buffer_scale_add_i_ = nullptr; + buffer_scale_add_b_ = nullptr; matvec_setup_done_ = false; d_r_ = nullptr; d_r_size_ = 0; @@ -24,6 +28,16 @@ namespace ReSolve { rocsparse_destroy_mat_descr(mat_A_); } + if (scale_add_i_setup_done_) + { + assert(buffer_scale_add_i_ != nullptr); + delete buffer_scale_add_i_; + } + if (buffer_scale_add_b_ != nullptr) + { + assert(buffer_scale_add_b_ != nullptr); + delete buffer_scale_add_b_; + } if (d_r_size_ != 0) { mem_.deleteOnDevice(d_r_); @@ -52,6 +66,20 @@ namespace ReSolve rocsparse_destroy_mat_descr(mat_A_); matvec_setup_done_ = false; } + if (scale_add_b_setup_done_) + { + assert(buffer_scale_add_b_ != nullptr); + delete buffer_scale_add_b_; + buffer_scale_add_b_ = nullptr; + scale_add_b_setup_done_ = false; + } + if (scale_add_i_setup_done_) + { + assert(buffer_scale_add_i_ != nullptr); + delete buffer_scale_add_i_; + buffer_scale_add_i_ = nullptr; + scale_add_i_setup_done_ = false; + } if (d_r_size_ != 0) { mem_.deleteOnDevice(d_r_); @@ -128,6 +156,36 @@ namespace ReSolve norm_buffer_ = nb; } + ScaleAddBufferHIP* LinAlgWorkspaceHIP::getScaleAddIBuffer() + { + return buffer_scale_add_i_; + } + + ScaleAddBufferHIP* LinAlgWorkspaceHIP::getScaleAddBBuffer() + { + return buffer_scale_add_b_; + } + + void LinAlgWorkspaceHIP::setScaleAddBBuffer(ScaleAddBufferHIP* buffer) + { + buffer_scale_add_b_ = buffer; + } + + void 
LinAlgWorkspaceHIP::setScaleAddIBuffer(ScaleAddBufferHIP* buffer) + { + buffer_scale_add_i_ = buffer; + } + + void LinAlgWorkspaceHIP::scaleAddBSetupDone() + { + scale_add_b_setup_done_ = true; + } + + void LinAlgWorkspaceHIP::scaleAddISetupDone() + { + scale_add_i_setup_done_ = true; + } + void LinAlgWorkspaceHIP::setNormBufferState(bool r) { norm_buffer_ready_ = r; @@ -143,6 +201,16 @@ namespace ReSolve matvec_setup_done_ = true; } + bool LinAlgWorkspaceHIP::scaleAddISetup() + { + return scale_add_i_setup_done_; + } + + bool LinAlgWorkspaceHIP::scaleAddBSetup() + { + return scale_add_b_setup_done_; + } + void LinAlgWorkspaceHIP::initializeHandles() { rocsparse_create_handle(&handle_rocsparse_); diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp b/resolve/workspace/LinAlgWorkspaceHIP.hpp index 4db4b89f..fe08d961 100644 --- a/resolve/workspace/LinAlgWorkspaceHIP.hpp +++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp @@ -9,6 +9,8 @@ namespace ReSolve { + class ScaleAddBufferHIP; + class LinAlgWorkspaceHIP { public: @@ -24,6 +26,10 @@ namespace ReSolve index_type getDrSize(); real_type* getDr(); real_type* getNormBuffer(); + void setScaleAddIBuffer(ScaleAddBufferHIP* buffer); + void setScaleAddBBuffer(ScaleAddBufferHIP* buffer); + ScaleAddBufferHIP* getScaleAddIBuffer(); + ScaleAddBufferHIP* getScaleAddBBuffer(); void* getTransposeBufferWorkspace(); void setTransposeBufferWorkspace(size_t bufferSize); bool getNormBufferState(); @@ -32,12 +38,17 @@ namespace ReSolve void setRocblasHandle(rocblas_handle handle); void setRocsparseHandle(rocsparse_handle handle); void setSpmvMatrixDescriptor(rocsparse_mat_descr mat); + void setSpmvMatrixInfo(rocsparse_mat_info info); void initializeHandles(); bool matvecSetup(); void matvecSetupDone(); + bool scaleAddISetup(); + bool scaleAddBSetup(); + void scaleAddISetupDone(); + void scaleAddBSetupDone(); void setDrSize(index_type new_sz); void setDr(real_type* new_dr); @@ -51,23 +62,26 @@ namespace ReSolve // matrix descriptors 
rocsparse_mat_descr mat_A_; - // vector descriptors not needed, rocsparse uses RAW pointers. // buffers // there is no buffer needed in matvec bool matvec_setup_done_{false}; // check if setup is done for matvec (note: no buffer but there is analysis) + bool scale_add_i_setup_done_{false}; + bool scale_add_b_setup_done_{false}; // info - but we need info rocsparse_mat_info info_A_; - real_type* d_r_{nullptr}; // needed for inf-norm - real_type* norm_buffer_{nullptr}; // needed for inf-norm - void* transpose_workspace_{nullptr}; // needed for transpose - bool transpose_workspace_ready_{false}; // to track if allocated - index_type d_r_size_{0}; - bool norm_buffer_ready_{false}; // to track if allocated - MemoryHandler mem_; ///< Memory handler not needed for now + real_type* d_r_{nullptr}; // needed for inf-norm + real_type* norm_buffer_{nullptr}; // needed for inf-norm + void* transpose_workspace_{nullptr}; // needed for transpose + bool transpose_workspace_ready_{false}; // to track if allocated + index_type d_r_size_{0}; + bool norm_buffer_ready_{false}; // to track if allocated + ScaleAddBufferHIP* buffer_scale_add_i_{nullptr}; + ScaleAddBufferHIP* buffer_scale_add_b_{nullptr}; + MemoryHandler mem_; ///< Memory handler not needed for now }; } // namespace ReSolve diff --git a/resolve/workspace/ScaleAddBufferCUDA.cpp b/resolve/workspace/ScaleAddBufferCUDA.cpp new file mode 100644 index 00000000..7bbd33f4 --- /dev/null +++ b/resolve/workspace/ScaleAddBufferCUDA.cpp @@ -0,0 +1,102 @@ +#include + +namespace ReSolve +{ + + /** + * @brief Store sparsity pattern + * + * @param[in] nrows - number of rows + */ + ScaleAddBufferCUDA::ScaleAddBufferCUDA(index_type numRows) + : numRows_(numRows), bufferSize_(0) + { + mem_.allocateArrayOnDevice(&rowData_, numRows_ + 1); + cusparseCreateMatDescr(&mat_A_); + cusparseSetMatType(mat_A_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(mat_A_, CUSPARSE_INDEX_BASE_ZERO); + } + + /** + * @brief Destructor + * + */ + 
ScaleAddBufferCUDA::~ScaleAddBufferCUDA() + { + mem_.deleteOnDevice(rowData_); + mem_.deleteOnDevice(buffer_); + cusparseDestroyMatDescr(mat_A_); + } + + /** + * @brief Retrieve row sparsity pattern + * + * @return precalculated row pointers + */ + index_type* ScaleAddBufferCUDA::getRowData() + { + return rowData_; + } + + /** + * @brief Retrieve matrix descriptor + * + * @return matrix descriptor set for scaleAddB, scaleAddI + */ + cusparseMatDescr_t ScaleAddBufferCUDA::getMatrixDescriptor() + { + return mat_A_; + } + + /** + * @brief Allocate memory for cusparse buffer + * + * @param[in] bufferSize calculated array size + */ + void ScaleAddBufferCUDA::allocateBuffer(size_t bufferSize) + { + bufferSize_ = bufferSize; + mem_.allocateBufferOnDevice(&buffer_, bufferSize_); + } + + /** + * @brief Retrieve cusparse buffer + * + * @return cusparse buffer + */ + void* ScaleAddBufferCUDA::getBuffer() + { + return buffer_; + } + + /** + * @brief get number of matrix rows + * + * @return number of matrix rows. + */ + index_type ScaleAddBufferCUDA::getNumRows() + { + return numRows_; + } + + /** + * @brief Get number of non-zeros. + * + * @return number of non-zeros + */ + index_type ScaleAddBufferCUDA::getNnz() + { + return nnz_; + } + + /** + * @brief set number of non-zeros. 
+ * + * @param[in] nnz number of non-zeros + */ + void ScaleAddBufferCUDA::setNnz(index_type nnz) + { + nnz_ = nnz; + } + +} // namespace ReSolve diff --git a/resolve/workspace/ScaleAddBufferCUDA.hpp b/resolve/workspace/ScaleAddBufferCUDA.hpp new file mode 100644 index 00000000..683eb389 --- /dev/null +++ b/resolve/workspace/ScaleAddBufferCUDA.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include "cusparse.h" +#include +#include + +namespace ReSolve +{ + + class ScaleAddBufferCUDA + { + public: + ScaleAddBufferCUDA(index_type numRows); + ~ScaleAddBufferCUDA(); + index_type* getRowData(); + void allocateBuffer(size_t bufferSize); + void* getBuffer(); + cusparseMatDescr_t getMatrixDescriptor(); + index_type getNumRows(); + void setNnz(index_type nnz); + index_type getNnz(); + + private: + index_type* rowData_; + void* buffer_; + cusparseMatDescr_t mat_A_; + index_type numRows_; + index_type nnz_; + size_t bufferSize_; + MemoryHandler mem_; + }; + +} // namespace ReSolve diff --git a/resolve/workspace/ScaleAddBufferCpu.cpp b/resolve/workspace/ScaleAddBufferCpu.cpp new file mode 100644 index 00000000..4f9066e8 --- /dev/null +++ b/resolve/workspace/ScaleAddBufferCpu.cpp @@ -0,0 +1,69 @@ +#include +#include +#include + +#include + +namespace ReSolve +{ + /** + * @brief Store sparsity pattern + * + * @param[in] row_data - row data (array of integers, length:nrows+1) + * @param[in] col_data - column data (array of integers, length: nnz) + */ + ScaleAddBufferCpu::ScaleAddBufferCpu(std::vector rowData, std::vector colData) + : rowData_(std::move(rowData)), colData_(std::move(colData)) + { + } + + /** + * @brief Retrieve row sparsity pattern + * + * @return precalculated row pointers + */ + index_type* ScaleAddBufferCpu::getRowData() + { + return rowData_.data(); + } + + /** + * @brief Retrieve column sparsity pattern + * + * @return precalculated column indices + */ + index_type* ScaleAddBufferCpu::getColumnData() + { + return colData_.data(); + } + + /** + * @brief get number of 
matrix rows + * + * @return number of matrix rows. + */ + index_type ScaleAddBufferCpu::getNumRows() + { + return static_cast(rowData_.size()) - 1; + } + + /** + * @brief get number of matrix columns + * + * @return number of matrix columns. + */ + index_type ScaleAddBufferCpu::getNumColumns() + { + return getNumRows(); + } + + /** + * @brief Get number of non-zeros. + * + * @return number of non-zeros + */ + index_type ScaleAddBufferCpu::getNnz() + { + return static_cast(colData_.size()); + } +} // namespace ReSolve diff --git a/resolve/workspace/ScaleAddBufferCpu.hpp b/resolve/workspace/ScaleAddBufferCpu.hpp new file mode 100644 index 00000000..61fd014a --- /dev/null +++ b/resolve/workspace/ScaleAddBufferCpu.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include + +#include + +namespace ReSolve +{ + class ScaleAddBufferCpu + { + public: + ScaleAddBufferCpu(std::vector rowData, std::vector colData); + index_type* getRowData(); + index_type* getColumnData(); + index_type getNumRows(); + index_type getNumColumns(); + index_type getNnz(); + + private: + std::vector rowData_; + std::vector colData_; + }; +} // namespace ReSolve diff --git a/resolve/workspace/ScaleAddBufferHIP.cpp b/resolve/workspace/ScaleAddBufferHIP.cpp new file mode 100644 index 00000000..38e19a22 --- /dev/null +++ b/resolve/workspace/ScaleAddBufferHIP.cpp @@ -0,0 +1,81 @@ +#include + +#include + +namespace ReSolve +{ + /** + * @brief Store sparsity pattern + * + * @param[in] row_data - pointer to row data (array of integers, length:nrows+1) + * @param[in] nrows - number of rows + * @param[in] col_data - pointer to column data (array of integers, length: nnz) + * @param[in] nnz - number of non-zeros + */ + ScaleAddBufferHIP::ScaleAddBufferHIP(index_type numRows) + : numRows_(numRows) + { + rocsparse_create_mat_descr(&mat_A_); + mem_.allocateArrayOnDevice(&rowData_, numRows_ + 1); + } + + /** + * @brief Destructor + * + */ + ScaleAddBufferHIP::~ScaleAddBufferHIP() + { + mem_.deleteOnDevice(rowData_); + 
rocsparse_destroy_mat_descr(mat_A_); + } + + /** + * @brief Retrieve row sparsity pattern + * + * @return precalculated row pointers + */ + index_type* ScaleAddBufferHIP::getRowData() + { + return rowData_; + } + + /** + * @brief Retrieve matrix descriptor + * + * @return matrix descriptor set for scaleAddB, scaleAddI + */ + rocsparse_mat_descr ScaleAddBufferHIP::getMatrixDescriptor() + { + return mat_A_; + } + + /** + * @brief get number of matrix rows + * + * @return number of matrix rows. + */ + index_type ScaleAddBufferHIP::getNumRows() + { + return numRows_; + } + + /** + * @brief Get number of non-zeros. + * + * @return number of non-zeros + */ + index_type ScaleAddBufferHIP::getNnz() + { + return nnz_; + } + + /** + * @brief Get number of non-zeros. + * + * @param[in] nnz number of non-zeros + */ + void ScaleAddBufferHIP::setNnz(index_type nnz) + { + nnz_ = nnz; + } +} // namespace ReSolve diff --git a/resolve/workspace/ScaleAddBufferHIP.hpp b/resolve/workspace/ScaleAddBufferHIP.hpp new file mode 100644 index 00000000..29299c00 --- /dev/null +++ b/resolve/workspace/ScaleAddBufferHIP.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace ReSolve +{ + + class ScaleAddBufferHIP + { + public: + ScaleAddBufferHIP(index_type numRows); + ~ScaleAddBufferHIP(); + index_type* getRowData(); + rocsparse_mat_descr getMatrixDescriptor(); + index_type getNumRows(); + void setNnz(index_type nnz); + index_type getNnz(); + + private: + index_type* rowData_; + rocsparse_mat_descr mat_A_; + index_type numRows_; + index_type nnz_; + MemoryHandler mem_; + }; +} // namespace ReSolve diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index e6fb0eac..047856f4 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -202,6 +202,92 @@ namespace ReSolve return status.report(testname.c_str()); } + TestOutcome scaleAddI(index_type n) + { + TestStatus 
status; + std::string testname(__func__); + matrix::Csr* A = createCsrMatrix(n); + real_type val = 2.; + + handler_.scaleAddI(A, val, memspace_); + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::HOST); + } + // expected sum is 2*30+1 + status *= verifyScaleAddICsrMatrix(A, 61); + + // run again to reuse sparsity pattern. + handler_.scaleAddI(A, val, memspace_); + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::HOST); + } + // expected sum is 2*61+1 + status *= verifyScaleAddICsrMatrix(A, 123); + + delete A; + return status.report(testname.c_str()); + } + + TestOutcome scaleAddIZero(index_type n) + { + TestStatus status; + std::string testname(__func__); + matrix::Csr* A = createCsrEmptyMatrix(n); + real_type val = 2.; + + handler_.scaleAddI(A, val, memspace_); + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::HOST); + } + // expected sum is 1 + status *= verifyScaleAddICsrMatrix(A, 1); + + // run again to reuse sparsity pattern. + handler_.scaleAddI(A, val, memspace_); + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::HOST); + } + // expected sum is 2*1+1 + status *= verifyScaleAddICsrMatrix(A, 3); + + delete A; + return status.report(testname.c_str()); + } + + TestOutcome scaleAddB(index_type n) + { + TestStatus status; + std::string testname(__func__); + matrix::Csr* A = createCsrMatrix(n); + matrix::Csr* B = createCsrMatrix(n); + real_type val = 2.; + + handler_.scaleAddB(A, val, B, memspace_); + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::HOST); + } + // expected sum is 2*30+30 + status *= verifyScaleAddICsrMatrix(A, 90); + + // run again to reuse sparsity pattern. 
+ handler_.scaleAddB(A, val, B, memspace_); + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::HOST); + } + // expected sum is 2*90+30 + status *= verifyScaleAddICsrMatrix(A, 210); + + delete A; + delete B; + return status.report(testname.c_str()); + } + private: ReSolve::MatrixHandler& handler_; memory::MemorySpace memspace_{memory::HOST}; @@ -402,8 +488,8 @@ namespace ReSolve * @brief Verify structure of a CSR matrix with preset pattern. * * The sparsity structure corresponds to the CSR representation of a rectangular matrix - * created by createRectangularCscMatrix. - * The sparisty structure is upper bidiagonal if n==m, + * created by createRectangularCsrMatrix. + * The sparsity structure is upper bidiagonal if n==m, * with an extra entry in the first column. * If n>m A_{ij} is nonzero iff i==j, or i+m==j+n * if nallocateMatrixData(memory::HOST); + + index_type* rowptr = A->getRowData(memory::HOST); + + for (index_type i = 0; i < n + 1; ++i) + { + rowptr[i] = 0; + } + A->setUpdated(memory::HOST); + + if (memspace_ == memory::DEVICE) + { + A->syncData(memspace_); + } + return A; + } + /** * @brief Create a CSR matrix with preset sparsity structure * @@ -678,6 +795,56 @@ namespace ReSolve return true; } + /** + * @brief Verify structure of a CSR matrix with preset pattern after scaleAddI + * + * The sparsity structure corresponds to the CSR representation of a square matrix + * created by createCsrMatrix, scaled by a constant and 1. added along the diagonal. 
+ * + * @pre A is a valid, allocated CSR matrix + * @invariant A + * @param[in] A matrix::Csr* pointer to the matrix to be verified + * @param[in] expected sum of row elements + * @return bool true if the matrix is valid, false otherwise + */ + bool verifyScaleAddICsrMatrix(matrix::Csr* A, real_type expected) + { + // Check if the matrix is valid + if (A == nullptr) + { + return false; + } + + // Get matrix dimensions + const index_type n = A->getNumRows(); + const index_type m = A->getNumColumns(); + + if (n != m) + { + return false; + } + + index_type* scaled_row_ptr = A->getRowData(memory::HOST); + real_type* scaled_value = A->getValues(memory::HOST); + + // Verify values - each element scaled by scale. Diagonal elements should be 1. + for (index_type i = 0; i < n; ++i) + { + real_type sum = 0.; + for (index_type j = scaled_row_ptr[i]; j < scaled_row_ptr[i + 1]; ++j) + { + sum += scaled_value[j]; + } + if (sum != expected) + { + std::cout << "Mismatch at row " << i << ": scaled sum = " << sum << ", expected sum = " << expected << "\n"; + return false; + } + } + + return true; + } + /** * @brief Verify structure of a CSR matrix with preset pattern that is right-scaled * diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp index 84e9a601..5f6e7dcd 100644 --- a/tests/unit/matrix/runMatrixHandlerTests.cpp +++ b/tests/unit/matrix/runMatrixHandlerTests.cpp @@ -76,6 +76,13 @@ void runTests(const std::string& backend, ReSolve::tests::TestingResults& result result += test.rightScale(1024, 2048); workspace.resetLinAlgWorkspace(); result += test.rightScale(2048, 1024); + workspace.resetLinAlgWorkspace(); + result += test.scaleAddI(100); + workspace.resetLinAlgWorkspace(); + result += test.scaleAddIZero(100); + workspace.resetLinAlgWorkspace(); + result += test.scaleAddB(100); + workspace.resetLinAlgWorkspace(); std::cout << "\n"; }