From 199ad701e366f2c24fa8b03bafe223a537aa52ee Mon Sep 17 00:00:00 2001
From: Marcel Koch
Date: Wed, 27 Nov 2024 16:46:30 +0100
Subject: [PATCH] add slides for session 2024-11-20

---
 2024-11-20.md | 281 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 235 insertions(+), 46 deletions(-)

diff --git a/2024-11-20.md b/2024-11-20.md
index 01b8490..09af69e 100644
--- a/2024-11-20.md
+++ b/2024-11-20.md
@@ -1,70 +1,259 @@
 ---
-title: User Meeting
-subtitle: What's going on?
-author: Marcel Koch
-date: 20.11.2024
-theme: solarized
-document-css: true
-maxwidth: 100%
-linestretch: 5.0
+marp: true
+theme: ginkgo
+headingDivider: 2
 ---

-## Intro
+# Basic Concepts in Ginkgo

-some textaf some textaf
+

-### sub page
+Early-Adopter Session 27.11.2024

-### other sub page
+## Topics

-```js
-let a = 1;
-let b = 2;
-let c = x => 1 + 2 + x;
-c(3);
-```
+- Concept: Executors
+- Concept: LinOps
+- Hello World!
+- Creating Matrices
+- Concept: Factories
+
+## What is Ginkgo?
+Numerical Sparse Linear Algebra Library
+https://github.com/ginkgo-project/ginkgo
+Supports GPUs and manycore architectures

-## What is Ginkgo?
+Focus on Portability and Composability

-- Numerical Sparse Linear Algebra Library https://github.com/ginkgo-project/ginkgo
-- Supports GPUs and Manycore architecture
-- Focus on Portability and Composability
-- Written in modern C++
-- Comprehensive test and benchmark framework
-- Integrations into MFEM, deal.II, OpenFOAM, ...
-- Available on Spack
+Written in modern C++
+Comprehensive test and benchmark framework
+Integrations into MFEM, deal.II, OpenFOAM, ...
+
+Available on Spack

 ## Basic Concepts: Executor

 Represents a single device to run HPC code on:

-Singlecore CPU (Reference)
+- Singlecore CPU (Reference)
+  `auto exec = gko::ReferenceExecutor::create();`
+- Multicore CPU (OpenMP)
+  `auto exec = gko::OmpExecutor::create();`

-`auto exec = gko::ReferenceExecutor::create();`{.cpp}
-- Multicore CPU (OpenMP)
-  ```cpp
-  auto exec = gko::OmpExecutor::create();
-  ```
+## Basic Concepts: Executor
+
+Represents a single device to run HPC code on:
+
 - NVIDIA GPU (CUDA)
-  ```cpp
-  auto host_exec = gko::OmpExecutor();
-  auto exec = gko::CudaExecutor::create(device_id,
-                                        host_exec);
+  ```c++
+  auto host_exec = gko::OmpExecutor::create();
+  auto exec = gko::CudaExecutor::create(device_id, host_exec);
   ```
 - AMD GPU (HIP/ROCm)
-  ```cpp
-  auto host_exec = gko::OmpExecutor();
-  auto exec = gko::HipExecutor::create(device_id,
-                                       host_exec);
+  ```c++
+  auto host_exec = gko::OmpExecutor::create();
+  auto exec = gko::HipExecutor::create(device_id, host_exec);
+  ```
+- Intel GPU (SYCL)
+  ```c++
+  auto host_exec = gko::OmpExecutor::create();
+  auto exec = gko::DpcppExecutor::create(device_id, host_exec);
+  ```
+
+## Basic Concepts: Executor
+
+Responsibilities (example on the next slide):
+
+- Memory allocation
+- Copying data
+- Executing kernels
+
+Passed to other functions, not used directly!
+```c++
+void gko::func(std::shared_ptr<const gko::Executor> exec, ...);
+```
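+
+## Basic Concepts: Executor
+
+A minimal sketch of these responsibilities in action, using `gko::array`
+(`gko::Array` in older releases); the executors, sizes, and values here are
+illustrative:
+
+```c++
+auto host = gko::OmpExecutor::create();
+auto gpu = gko::CudaExecutor::create(0, host);
+// the host executor performs the allocation
+gko::array<double> host_vals(host, 8);
+host_vals.get_data()[0] = 1.0; // direct access is fine on the host
+// constructing with a different executor copies the data to that device
+gko::array<double> gpu_vals(gpu, host_vals);
+```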
+
+## Basic Concepts: LinOp
+
+Linear Operator representing:
+
+- Matrices (Dense, Sparse CSR, COO, ELL, ...)
+  $$y = A\cdot x$$
+- Solvers (Krylov CG, GMRES, ..., Direct, Triangular)
+  $$S = A^{-1}, y = S\cdot x$$
+- Preconditioners (Block-Jacobi, Multigrid, ILU(0), ...)
+  $$M \approx A^{-1}, y = M\cdot x$$
+- Compositions and linear combinations
+  $$ C = A\cdot B, y = C\cdot x $$
+  $$ C = \alpha A + \beta B, y = C\cdot x$$
+- Factorizations (ILU, LU, Cholesky, ...)
+  $$ L\cdot U = A, y = LU \cdot x$$
+
+## Basic Concepts: LinOp
+
+- Functionality (solver example on the next slide):
+  - Matrix (multi-)vector products
+    ```cpp
+    A->apply(x, y); // y = A * x
+    A->apply(alpha, x, beta, y); // y = alpha * A * x + beta * y
+    ```
+- Properties:
+  - Executor
+    - Where is the data stored?
+    - Where are operations executed?
+  - Dimensions $n \times m$
+- Compatibility:
+  - Ensures all inputs are on the operator's executor
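+
+## Basic Concepts: LinOp
+
+Solvers are LinOps too. A minimal sketch, assuming an executor `exec` as on the
+earlier slides and the usual factory/stopping-criteria interface of recent
+Ginkgo releases (the matrix and iteration count are illustrative):
+
+```c++
+// an SPD matrix A, a right-hand side b, and an initial guess x
+std::shared_ptr<gko::LinOp> A = gko::initialize<gko::matrix::Csr<double>>(
+    {{4.0, 1.0}, {1.0, 3.0}}, exec);
+auto b = gko::initialize<gko::matrix::Dense<double>>({1.0, 2.0}, exec);
+auto x = gko::initialize<gko::matrix::Dense<double>>({0.0, 0.0}, exec);
+// generate a CG solver for A and apply it like any other LinOp
+auto solver = gko::solver::Cg<double>::build()
+                  .with_criteria(gko::stop::Iteration::build()
+                                     .with_max_iters(10u)
+                                     .on(exec))
+                  .on(exec)
+                  ->generate(A);
+solver->apply(b, x); // x ~= A^{-1} * b
+```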
+
+## Creating Matrices
+
+Equivalent across all matrix formats:
+
+- From C++ initializer lists (scalars, tiny matrices)
+  `auto mtx = gko::initialize<Mtx>({{0.0, 1.0}, {1.0, 0.0}}, exec);`
+- From MatrixMarket files
+  `auto mtx = gko::read<Mtx>(std::ifstream{"A.mtx"}, exec);`
+- From an existing matrix
+  ```c++
+  auto mtx = other_mtx->clone();           // same executor
+  auto mtx = gko::clone(exec, other_mtx);  // possibly different executor
+  ```
+- From a matrix in a different format
+  ```c++
+  auto mtx = Mtx::create(exec);
+  other_mtx->convert_to(mtx);
+  ```
+
+## Matrix Assembly
+
+Three equivalent ways of specifying matrix values:
+
+- `gko::matrix_data` for sequential assembly (no duplicates)
+  ```c++
+  gko::matrix_data<> data{gko::dim<2>{num_rows, num_cols}};
+  data.nonzeros.emplace_back(row, column, value);
+  mtx->read(data);
+  ```
+- `gko::matrix_assembly_data` for sequential assembly
+  ```c++
+  gko::matrix_assembly_data<> data{gko::dim<2>{num_rows, num_cols}};
+  data.set_value(row, column, value);
+  data.add_value(row, column, value);
+  mtx->read(data);
+  ```
+- `gko::device_matrix_data` for high-performance assembly
+  ```c++
+  gko::device_matrix_data<> data{exec, gko::dim<2>{nrows, ncols}, nnz};
+  run_assembly_kernel(data.get_row_idxs(),
+                      data.get_col_idxs(),
+                      data.get_values()); // runs on device
+  data.sum_duplicates(); // optional, e.g. for FEM edge DoFs
+  mtx->read(data);
   ```
-- Intel GPU/CPU (SYCL)
-  ```cpp
-  auto host_exec = gko::OmpExecutor();
-  auto exec = gko::DpcppExecutor::create(device_id,
-                                         host_exec);
-  ```
\ No newline at end of file
+
+## Hello World
+
+```c++
+#include <ginkgo/ginkgo.hpp>
+#include <iostream>
+
+int main() {
+    using Mtx = gko::matrix::Csr<double>;
+    using Vec = gko::matrix::Dense<double>;
+    auto exec = gko::ReferenceExecutor::create();
+    auto mtx = gko::initialize<Mtx>({{0.0, 1.0}, {1.0, 0.0}}, exec);
+    auto vec = gko::initialize<Vec>({2.0, 3.0}, exec);
+    auto result = vec->clone();
+    mtx->apply(vec, result);
+    gko::write(std::cout, result);
+}
+```
+
+## Changing Executors
+
+```c++
+#include <ginkgo/ginkgo.hpp>
+#include <iostream>
+
+int main() {
+    using Mtx = gko::matrix::Csr<double>;
+    using Vec = gko::matrix::Dense<double>;
+    auto exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    auto mtx = gko::initialize<Mtx>({{0.0, 1.0}, {1.0, 0.0}}, exec);
+    auto vec = gko::initialize<Vec>({2.0, 3.0}, exec);
+    auto result = vec->clone();
+    mtx->apply(vec, result);
+    gko::write(std::cout, result);
+}
+```
+
+## Explicit Types
+
+```c++
+#include <ginkgo/ginkgo.hpp>
+#include <iostream>
+
+int main() {
+    using Mtx = gko::matrix::Csr<double>;
+    using Vec = gko::matrix::Dense<double>;
+    std::shared_ptr<gko::Executor> exec =
+        gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    std::unique_ptr<Mtx> mtx = gko::initialize<Mtx>({{0.0, 1.0}, {1.0, 0.0}},
+                                                    exec);
+    std::unique_ptr<Vec> vec = gko::initialize<Vec>({2.0, 3.0}, exec);
+    std::unique_ptr<Vec> result = vec->clone();
+    mtx->apply(vec, result);
+    gko::write(std::cout, result);
+}
+```
+
+## Matrix Assembly
+
+```c++
+#include <ginkgo/ginkgo.hpp>
+#include <iostream>
+
+int main() {
+    using Mtx = gko::matrix::Csr<double>;
+    using Vec = gko::matrix::Dense<double>;
+    using dim = gko::dim<2>;
+    using matrix_data = gko::matrix_data<double>;
+    auto exec = gko::ReferenceExecutor::create();
+    matrix_data data{dim{10, 10}};
+    auto mtx = Mtx::create(exec);
+    auto x = Vec::create(exec, dim{10, 1});
+    auto y = Vec::create(exec, dim{10, 1});
+    for (int row = 0; row < 10; row++) {
+        data.nonzeros.emplace_back(row, row, 2.0);
+        data.nonzeros.emplace_back(row, (row + 1) % 10, -1.0);
+        data.nonzeros.emplace_back(row, (row + 9) % 10, -1.0);
+        x->at(row, 0) = 1.0; // only works on CPU executors (see the next slide)
+    }
+    mtx->read(data);
+    mtx->apply(x, y);
+    gko::write(std::cout, y);
+}
+```
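+
+## Matrix Assembly
+
+`at()` only works when the data lives on a CPU executor. One option (a sketch
+using the type aliases from the previous slide; the CUDA executor is
+illustrative) is to assemble on a host executor and copy to the device once:
+
+```c++
+auto host = gko::ReferenceExecutor::create();
+auto gpu = gko::CudaExecutor::create(0, host);
+// fill the vector on the host ...
+auto host_x = Vec::create(host, dim{10, 1});
+for (int row = 0; row < 10; row++) {
+    host_x->at(row, 0) = 1.0; // fine: host_x lives on the host
+}
+// ... then move it to the device in a single copy
+auto x = gko::clone(gpu, host_x);
+// read() fills a matrix on any executor; device_matrix_data
+// avoids the host detour entirely
+auto mtx = Mtx::create(gpu);
+mtx->read(data);
+```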