diff --git a/CMakeLists.txt b/CMakeLists.txt index e130191b40d..f7f12b48606 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -373,6 +373,7 @@ endif () include(CompilerConfiguration) include(CheckCompilerSupport) +# subdirectories add_subdirectory(src) include(CTest) @@ -384,4 +385,6 @@ if (WITH_EXAMPLES) add_subdirectory(examples) endif () +add_subdirectory(docs) + include(CustomTargets) diff --git a/DBCSR.md b/DBCSR.md index 5609bcbf21c..d3481b35bd2 100644 --- a/DBCSR.md +++ b/DBCSR.md @@ -15,6 +15,9 @@ predocmark: > media_dir: @CMAKE_SOURCE_DIR@/docs/media md_base_dir: @CMAKE_SOURCE_DIR@ page_dir: @CMAKE_SOURCE_DIR@/docs/guide +src_dir: ./src + ./tests + ./examples output_dir: @CMAKE_BINARY_DIR@/doc docmark_alt: # predocmark_alt: < diff --git a/README.md b/README.md index 78703a96d18..1f6c3d03f10 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ It is MPI and OpenMP parallel and can exploit Nvidia and AMD GPUs via CUDA and H ## How to Install -Follow the [installation guide](docs/guide/2-user-guide/1-installation/1-install.md). +Follow the [installation guide](https://cp2k.github.io/dbcsr/develop/page/2-user-guide/1-installation/index.html). ## Documentation diff --git a/VERSION b/VERSION index 6321062b636..3eef448c4b5 100644 --- a/VERSION +++ b/VERSION @@ -1,6 +1,7 @@ MAJOR = 2 MINOR = 1 -PATCH = 0-rc19 +PATCH = 0 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise # it is considered Development version. -DATE = +DATE = 2020-12-09 + diff --git a/cmake/CustomTargets.cmake b/cmake/CustomTargets.cmake index 93b1db4fcbe..5753b9b7a0a 100644 --- a/cmake/CustomTargets.cmake +++ b/cmake/CustomTargets.cmake @@ -1,3 +1,5 @@ +# ================================================================================================= +# BUILD DISTRIBUTION set(ARCHIVE_NAME "${CMAKE_PROJECT_NAME}-${dbcsr_VERSION}") add_custom_target( dist @@ -9,22 +11,8 @@ add_custom_target( "${CMAKE_BINARY_DIR}/dist/${ARCHIVE_NAME}.tar.gz" WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) -find_program( - FORD_EXE ford - DOC "path to the ford executable (required to generate the documentation)") - -# Copy the FORD project-file into the build directory -set(FORD_PROJECT_FILE "${CMAKE_BINARY_DIR}/DBCSR.md") -configure_file(DBCSR.md "${FORD_PROJECT_FILE}") - -add_custom_target( - doc - COMMENT "Generating API documentation" - COMMAND "${FORD_EXE}" "${FORD_PROJECT_FILE}" - VERBATIM) -add_dependencies(doc fypp) # only depend on the fypp step to avoid building - # everything just for the docs - +# ================================================================================================= +# LCOV - COVERAGE REPORTS GENERATION find_program( LCOV_EXE lcov DOC "path to the lcov executable (required to generate coverage reports)") diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt new file mode 100644 index 00000000000..032b9cf073c --- /dev/null +++ b/docs/CMakeLists.txt @@ -0,0 +1,22 @@ +# ================================================================================================= +# FORD - DOCUMENTATION GENERATION +find_program( + FORD_EXE ford + DOC "path to the ford executable (required to generate the documentation)") + +# Copy the FORD project-file into the build directory +set(FORD_PROJECT_FILE "${CMAKE_BINARY_DIR}/DBCSR.md") +configure_file(${CMAKE_SOURCE_DIR}/DBCSR.md "${FORD_PROJECT_FILE}") + +# Copy the FORD project-file into the build directory +add_custom_target( + doc + COMMENT "Generating API documentation and doc pages" + COMMAND "${FORD_EXE}" "${FORD_PROJECT_FILE}" + VERBATIM) 
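+# The doc target needs the sources staged in the build tree before FORD runs:
+# the copied test/example files (doc_copy_tests / doc_copy_examples) and the
+# fypp-preprocessed Fortran sources.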
+add_dependencies(doc doc_copy_tests) +if (WITH_C_API) + add_dependencies(doc doc_copy_examples) +endif () +add_dependencies(doc fypp) # only depend on the fypp step to avoid building + # everything just for the docs diff --git a/docs/guide/2-user-guide/1-installation/2-cmake-build-recipes.md b/docs/guide/2-user-guide/1-installation/1-cmake-build-recipes.md similarity index 100% rename from docs/guide/2-user-guide/1-installation/2-cmake-build-recipes.md rename to docs/guide/2-user-guide/1-installation/1-cmake-build-recipes.md diff --git a/docs/guide/2-user-guide/1-installation/1-install.md b/docs/guide/2-user-guide/1-installation/1-install.md deleted file mode 100644 index 3cba6942b45..00000000000 --- a/docs/guide/2-user-guide/1-installation/1-install.md +++ /dev/null @@ -1,114 +0,0 @@ -title: Install - -# Install - -## Prerequisites - -You absolutely need: - -* [CMake](https://cmake.org/) (3.12+) -* GNU make or Ninja -* a Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings) -* a BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.) -* a Python version installed (2.7 or 3.6+ have been tested) - -Optionally: - -* [libxsmm](https://github.com/hfp/libxsmm) (1.10+, and `pkg-config`) for Small Matrix Multiplication acceleration -* a LAPACK implementation (reference, OpenBLAS-bundled and MKL have been tested), required when building the tests - -To build `libsmm_acc`, DBCSR's GPU backend, you further need: - -* A GPU-capable compiler, either - * CUDA Toolkit (targets NVIDIA GPUs, minimal version required: 5.5) with cuBLAS - * or HIP compiler (targets NVIDIA or AMD GPUs) and hipBLAS -* a C++ compiler which supports at least C++11 standard - -We test against GNU and Intel compilers on Linux systems, GNU compiler on MacOS systems. See a list of supported compilers [here](./3-supported-compilers.html). - -## Get DBCSR - -Download either a [release tarball](https://github.com/cp2k/dbcsr/releases) or clone the latest version from Git using: - -```bash -git clone --recursive https://github.com/cp2k/dbcsr.git -``` - -## Build - -DBCSR can be compiled in 4 main variants: -* Serial, i.e. no OpenMP and MPI -* OpenMP -* MPI -* OpenMP+MPI -The 4 variants can be combined with the accelerator support. - -Run inside the `dbcsr` directory: - -```bash -mkdir build -cd build -cmake .. -make -``` - - The configuration flags for the CMake command are (default first): - -``` --DUSE_MPI= --DUSE_OPENMP= --DUSE_SMM= --DUSE_CUDA= --DUSE_HIP= --DWITH_C_API= --DWITH_EXAMPLES= --DWITH_GPU= --DWITH_CUDA_PROFILING= --DCMAKE_BUILD_TYPE= --DBUILD_TESTING= --DTEST_MPI_RANKS= --DTEST_OMP_THREADS=<2,N> -``` - -When providing a custom build of `libxsmm`, make sure that its library directory is added to the `PKG_CONFIG_PATH` variable prior -to running `cmake`. An example if `libxsmm` was checked out using Git to your home folder: - -```bash -export PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${HOME}/libxsmm/lib" -``` - -### CMake Build Recipes - -For build recipes on different platforms, make sure to also read the [CMake Build Recipes](./2-cmake-build-recipes.html). 
- -### Using Python in a virtual environment - -If you want to use Python from a virtual environment and your CMake version is < 3.15, specify the desired python interpreter manually using: - -``` - -DPython_EXECUTABLE=/path/to/python -``` - -### Running Tests - -To run the tests, use: - -```bash - make test -``` - -Please, note that if you are using OpenMP builds, then you have to set the environment variable `OMP_NESTED=false`. - -### C/C++ Interface - -If MPI support is enabled (the default), the C API is automatically built. - -### Workaround issue in HIP - -HIP is a relatively new language, and some issues still need to be ironed out. As a workaround to an [issue](https://github.com/ROCm-Developer-Tools/HIP/pull/1543) in HIP's JIT infrastructure, please set the following if you've built HIP from source: - -```bash - export HIP_PATH=/opt/rocm/hip -``` - -before running on an AMD GPU. diff --git a/docs/guide/2-user-guide/1-installation/3-supported-compilers.md b/docs/guide/2-user-guide/1-installation/2-supported-compilers.md similarity index 100% rename from docs/guide/2-user-guide/1-installation/3-supported-compilers.md rename to docs/guide/2-user-guide/1-installation/2-supported-compilers.md diff --git a/docs/guide/2-user-guide/1-installation/4-using-dbcsr-in-a-cmake-project.md b/docs/guide/2-user-guide/1-installation/3-using-dbcsr-in-a-cmake-project.md similarity index 100% rename from docs/guide/2-user-guide/1-installation/4-using-dbcsr-in-a-cmake-project.md rename to docs/guide/2-user-guide/1-installation/3-using-dbcsr-in-a-cmake-project.md diff --git a/docs/guide/2-user-guide/1-installation/4-docker.md b/docs/guide/2-user-guide/1-installation/4-docker.md new file mode 100644 index 00000000000..25d8ccc2b6a --- /dev/null +++ b/docs/guide/2-user-guide/1-installation/4-docker.md @@ -0,0 +1,4 @@ +title: Docker Images + +{!./tools/docker/README.md!} + diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md index 1bc9462715f..c0efd0d406b 100644 --- a/docs/guide/2-user-guide/1-installation/index.md +++ b/docs/guide/2-user-guide/1-installation/index.md @@ -1 +1,104 @@ -title: Installation \ No newline at end of file +title: Install + +# Install + +## Prerequisites + +You absolutely need: + +* [CMake](https://cmake.org/) (3.12+) +* GNU make or Ninja +* a Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings) +* a BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.) +* a Python version installed (2.7 or 3.6+ have been tested) + +Optionally: + +* [libxsmm](https://github.com/hfp/libxsmm) (1.10+, and `pkg-config`) for Small Matrix Multiplication acceleration +* a LAPACK implementation (reference, OpenBLAS-bundled and MKL have been tested), required when building the tests + +To build `libsmm_acc`, DBCSR's GPU backend, you further need: + +* A GPU-capable compiler, either + * CUDA Toolkit (targets NVIDIA GPUs, minimal version required: 5.5) with cuBLAS + * or HIP compiler (targets NVIDIA or AMD GPUs) and hipBLAS (the tested version is ROCm 3.8) +* a C++ compiler which supports at least C++11 standard + +We test against GNU and Intel compilers on Linux systems, GNU compiler on MacOS systems. See a list of supported compilers [here](./3-supported-compilers.html). 
+ +## Get DBCSR + +Download either a [release tarball](https://github.com/cp2k/dbcsr/releases) or clone the latest version from Git using: + +```bash +git clone --recursive https://github.com/cp2k/dbcsr.git +``` + +## Build + +DBCSR can be compiled in 4 main variants: +* Serial, i.e. no OpenMP and MPI +* OpenMP +* MPI +* OpenMP+MPI +The 4 variants can be combined with the accelerator support. + +Run inside the `dbcsr` directory: + +```bash +mkdir build +cd build +cmake .. +make +``` + + The configuration flags for the CMake command are (default first): + +``` +-DUSE_MPI= +-DUSE_OPENMP= +-DUSE_SMM= +-DUSE_CUDA= +-DWITH_CUDA_PROFILING= +-DUSE_HIP= +-DWITH_C_API= +-DWITH_EXAMPLES= +-DWITH_GPU= +-DCMAKE_BUILD_TYPE= +-DBUILD_TESTING= +-DTEST_MPI_RANKS= +-DTEST_OMP_THREADS=<2,N> +``` + +When providing a custom build of `libxsmm`, make sure that its library directory is added to the `PKG_CONFIG_PATH` variable prior +to running `cmake`. An example if `libxsmm` was checked out using Git to your home folder: + +```bash +export PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${HOME}/libxsmm/lib" +``` + +### CMake Build Recipes + +For build recipes on different platforms, make sure to also read the [CMake Build Recipes](./1-cmake-build-recipes.html). + +### Using Python in a virtual environment + +If you want to use Python from a virtual environment and your CMake version is < 3.15, specify the desired python interpreter manually using: + +``` + -DPython_EXECUTABLE=/path/to/python +``` + +### C/C++ Interface + +If MPI support is enabled (the default), the C API is automatically built. + +### Workaround issue in HIP + +HIP is a relatively new language, and some issues still need to be ironed out. As a workaround to an [issue](https://github.com/ROCm-Developer-Tools/HIP/pull/1543) in HIP's JIT infrastructure, please set the following if you've built HIP from source: + +```bash + export HIP_PATH=/opt/rocm/hip +``` + +before running on an AMD GPU. 
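+
+### Example Configuration
+
+As an illustration only (the valid values for each flag are listed above and in the [CMake Build Recipes](./1-cmake-build-recipes.html); the GPU model and SMM backend shown here are assumptions), a hybrid MPI+OpenMP build with libxsmm and CUDA acceleration could be configured as:
+
+```bash
+mkdir build && cd build
+cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_SMM=libxsmm \
+      -DUSE_CUDA=ON -DWITH_GPU=P100 \
+      -DCMAKE_BUILD_TYPE=Release ..
+make
+```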
diff --git a/docs/guide/2-user-guide/2-examples/index.md b/docs/guide/2-user-guide/2-examples/index.md deleted file mode 100644 index 96b610794f3..00000000000 --- a/docs/guide/2-user-guide/2-examples/index.md +++ /dev/null @@ -1,3 +0,0 @@ -title: Examples - -{!examples/README.md!} diff --git a/docs/guide/2-user-guide/2-tests/index.md b/docs/guide/2-user-guide/2-tests/index.md new file mode 100644 index 00000000000..3980073254b --- /dev/null +++ b/docs/guide/2-user-guide/2-tests/index.md @@ -0,0 +1,71 @@ +title: Tests + +# Tests + +## Correctness tests + +- [[dbcsr_unittest_1(program)]] (fortran) : test matrix operations: add, multiply and multiply-ghost +- [[dbcsr_unittest_2(program)]] (fortran) : test matrix-multiply with large blocks (block size=100) and rectangular matrices (block size=5) +- [[dbcsr_test_csr_conversions(program)]] (fortran) : test DBCSR to CSR conversion with random matrices +- [[dbcsr_tas_unittest(program)]] (fortran) : unit test for tall-and-skinny matrices +- [[dbcsr_tensor_unittest(program)]] (fortran) : unit test for tensor functionalities +- [dbcsr_tensor_test](../../../../tests/dbcsr_tensor_test.cpp) (c++) : test the tensor contraction (13|2)x(54|21)=(3|45) 31 and other functions + +### GPU-backend correctness tests: + +- [[dbcsr_unittest_3(program)]] (fortran) : test matrix-multiply with various block sizes that are run by the libsmm_acc GPU backend if DBCSR is compiled with GPU support +- [libsmm_acc_unittest_multiply](../../../../tests/libsmm_acc_unittest_multiply.cpp) (c++) : tests all libsmm_acc transpose kernels +- [libsmm_acc_unittest_transpose](../../../../tests/libsmm_acc_unittest_transpose.cpp) (c++) : tests all libsmm_acc batch-multiplication kernels + +## Performance tests + +DBCSR performance tests: + +- [[dbcsr_performance_driver(program)]] (fortran) : performance tester for matrix operations. The input matrices can be described in an input file in order to test different configurations. See below. + +### GPU backend performance tests: + +- [libsmm_acc_timer_multiply](../../../../tests/libsmm_acc_timer_multiply.cpp) (c++) : time all libsmm_acc batch-multiplication kernels + +## Running Tests + +To run all the tests, use: + +```bash +make test +``` + +Or run individual tests from the `build` directory, as follows: + +```bash +srun -N 1 --ntasks-per-core 2 --ntasks-per-node 12 --cpus-per-task 2 ./tests/dbcsr_unittest_1 +``` + +Note that the tests of libsmm_acc (the GPU-backend) do not use MPI since libsmm_acc only operates on-node. + +Note that if you are using OpenMP builds, then you have to set the environment variable `OMP_NESTED=false`. + +### Input Files for Performance Driver + +The test suite comes with a performance driver ([[dbcsr_performance_driver(program)]]), which evaluates the performance of matrix-matrix multiplication in DBCSR. + +Input matrices can be specified in an input file, passed to the executable as standard input, for example: + +a) To test pure MPI performance test using [n] nodes: + +```bash +mpirun -np [n] ./build/tests/dbcsr_perf tests/input.perf 2>&1 | tee perf.log +``` + +b) To test hybrid MPI/OpenMP performance test using [n] nodes, each spanning [t] threads: + +```bash +export OMP_NUM_THREADS=[t]; mpirun -np [n] ./build/tests/dbcsr_perf tests/input.perf 2>&1 | tee perf.log +``` + +### How to Write Input Files + +Examples of input files can be found in `tests/inputs` for different sizes of matrices and different block sizes. 
+ +You can also write custom input files: for more information, follow the template in `tests/input.perf`. + diff --git a/docs/guide/2-user-guide/3-examples/index.md b/docs/guide/2-user-guide/3-examples/index.md new file mode 100644 index 00000000000..a24a4fa269e --- /dev/null +++ b/docs/guide/2-user-guide/3-examples/index.md @@ -0,0 +1,35 @@ +title: Examples + +# Examples + +- [[dbcsr_example_1(program)]] : how to create a dbcsr matrix (fortran) +- [[dbcsr_example_2(program)]] : how to set a dbcsr matrix (fortran) +- dbcsr_example_3: how to multiply two dbcsr matrices (in fortran: [[dbcsr_example_3(program)]]) and in c++: [dbcsr_example_3](https://github.com/cp2k/dbcsr/blob/develop/examples/dbcsr_example_3.cpp)) +- [[dbcsr_tensor_example_1(program)]] : how to create a dbcsr matrix (fortran) + - the example can be run with different parameters, controlling block size, sparsity, verbosity and more +- [dbcsr_tensor_example_2](https://github.com/cp2k/dbcsr/blob/develop/examples/dbcsr_tensor_example_2.cpp): tensor contraction example (cpp) + - tensor1 x tensor2 = tensor3, (13|2)x(54|21)=(3|45) + +## Build + +Compile the DBCSR library, using `-DUSE_MPI=ON -DWITH_EXAMPLES=ON`. + +The examples require MPI. Furthermore, if you are using threading, MPI_THREAD_FUNNELED mode is required. + +## Run + +You can run the examples, for instance from the `build` directory, as follows: + +```bash +srun -N 1 --ntasks-per-core 2 --ntasks-per-node 12 --cpus-per-task 2 ./examples/dbcsr_example_1 +``` + +### Run tensor examples + +How to run (this example and DBCSR for tensors in general): + +- best performance is obtained by running with mpi and one openmp thread per rank. +- ideally number of mpi ranks should be composed of small prime factors (e.g. powers of 2). +- for sparse data & heterogeneous block sizes, DBCSR should be run on CPUs with libxsmm backend. +- for dense data best performance is obtained by choosing homogeneous block sizes of 64 and by compiling with GPU support. + diff --git a/docs/guide/3-developer-guide/1-tooling/index.md b/docs/guide/3-developer-guide/1-tooling/index.md index d2727bc3752..8470abe8b65 100644 --- a/docs/guide/3-developer-guide/1-tooling/index.md +++ b/docs/guide/3-developer-guide/1-tooling/index.md @@ -2,8 +2,8 @@ title: Tooling # Build System -We support CMake for compilation. See [here](https://cp2k.github.io/dbcsr/page/2-user-guide/1-installation/1-install.html) on how to compile and -[here](https://cp2k.github.io/dbcsr/page/2-user-guide/1-installation/2-cmake-build-recipes.html) for more CMake details. +We support CMake for compilation. See [here](../../2-user-guide/1-installation/index.html) on how to compile and +[here](../../2-user-guide/1-installation/1-cmake-build-recipes.html) for more CMake details. Compilations is based on [Fypp](https://github.com/aradi/fypp) meta-progamming package, which is available as submodule. diff --git a/docs/guide/3-developer-guide/2-documentation/index.md b/docs/guide/3-developer-guide/2-documentation/index.md index 0f708994bce..db7245a89b0 100644 --- a/docs/guide/3-developer-guide/2-documentation/index.md +++ b/docs/guide/3-developer-guide/2-documentation/index.md @@ -2,21 +2,28 @@ title: Documentation # Documentation -## Build the Documentation +## Build -To build the Documentation you need [FORD](https://github.com/Fortran-FOSS-Programmers/ford). +To build the documentation you need [FORD](https://github.com/Fortran-FOSS-Programmers/ford). 
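+If FORD is not installed yet, it can typically be obtained from PyPI (assuming `pip` is available in your Python environment):
+
+```bash
+pip install ford
+```
+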
Afterwards use the `doc` target for the CMake generated Makefile: ```bash mkdir build cd build - cmake .. # will look for the `ford` binary + cmake .. # will look for the `ford` binary make doc ``` +Note that in order to generate the documentation with examples (recommended), the following options should be activated in cmake (these are the options' default values) + +```bash + cmake -DUSE_MPI=ON -DWITH_EXAMPLES=ON .. # these options are default and recommended. + # If set off, the examples' documentation is not generated. +``` + The documentation (HTML format) will be located in `doc/`. To view it, open `doc/index.html` in a browser. -## Add Pages to the Documentation +## Add Pages To add pages to the documentation, write Markdown files and add them to the desired location in `dbcsr/docs/guide`. Note that subfolders of `guide` will only be added to the documentation pages if they contain a file `index.md`. For more information on writing pages, see [Ford's documentation](https://github.com/Fortran-FOSS-Programmers/ford/wiki/Writing-Pages). diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md new file mode 100644 index 00000000000..1301c085f0d --- /dev/null +++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md @@ -0,0 +1,23 @@ +title: Kernel Parameters + +# Kernel Parameters + +## Batched Matrix-Matrix Multiplication Kernel Parameters + +The batched matrix-matrix multiplication kernels are templated on: + +* the characteristic dimensions of the multiplication: `m, n, k` +* between 3-7 kernel parameters from (`M`, `N`, `w`, `v`, `threads`, `grouping`, `minblocks`), depending on the algorithm. + +## Batched Matrix Transpose Kernel Parameters + +The batched transpose kernels are templated on: + +* the characteristic dimensions of the transpose: `m, n` + +## Predictive parameters + +The input features for the predictive models can be 'raw' parameters (left-most-column in the figure below), or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations). + +![libsmm_acc_predictive_modeling_features](|media|/images/libsmm_acc_predictive_modeling_features.png) + diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md index 4faf7651028..23ac88907b5 100644 --- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md +++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md @@ -1 +1,3 @@ -title: Accelerator Backend \ No newline at end of file +title: Accelerator Backend + +{!./src/acc/README.md!} diff --git a/docs/guide/3-developer-guide/4-performance/1-insights.md b/docs/guide/3-developer-guide/4-performance/1-insights.md new file mode 100644 index 00000000000..fc30fd1615c --- /dev/null +++ b/docs/guide/3-developer-guide/4-performance/1-insights.md @@ -0,0 +1,119 @@ +title: Insights + +# Insights into Performance + +## Read Timing & Statistics Reports + +At the end of an output file, a report of DBCSR's statistics and timings can be found. + +### Statistics + +The STATISTICS section of the output file provides some information on matrix-matrix multiplications that were run and their performance characteristics. 
+ +Example: + +``` +------------------------------------------------------------------------------- +- - +- DBCSR STATISTICS - +- - +------------------------------------------------------------------------------- +COUNTER TOTAL BLAS SMM ACC +flops 23 x 23 x 23 687272462200 0.0% 0.0% 100.0% +flops inhomo. stacks 0 0.0% 0.0% 0.0% +flops total 687.272462E+09 0.0% 0.0% 100.0% +flops max/rank 687.272462E+09 0.0% 0.0% 100.0% +matmuls inhomo. stacks 0 0.0% 0.0% 0.0% +matmuls total 28243300 0.0% 0.0% 100.0% +number of processed stacks 1600 0.0% 0.0% 100.0% +average stack size 0.0 0.0 17652.1 +marketing flops 1.076458E+12 +------------------------------------------------------------------------------- +# multiplications 50 +max memory usage/rank 16.650822E+09 +# max total images/rank 1 +# max 3D layers 1 +# MPI messages exchanged 0 +MPI messages size (bytes): + total size 0.000000E+00 + min size 0.000000E+00 + max size 0.000000E+00 + average size 0.000000E+00 +MPI breakdown and total messages size (bytes): + size <= 128 0 0 + 128 < size <= 8192 0 0 + 8192 < size <= 32768 0 0 + 32768 < size <= 131072 0 0 + 131072 < size <= 4194304 0 0 + 4194304 < size <= 16777216 0 0 + 16777216 < size 0 0 +------------------------------------------------------------------------------- +``` + +#### How to Read the Columns + +- `TOTAL`: total flops +- `BLAS`: percentage of flops run on BLAS (this could be CUBLAS or HIPBLAS) +- `SMM`: percentage of flops run on SMM (libsmm or libxsmm, CPU) +- `ACC`: percentage of flops run on ACC (libsmm_acc, DBCSR's GPU-accelerated backend) + +#### How to Read the Rows (Counters) + +Every time "matrix-matrix multiplication" is mentionned in this paragraph, it refers *not* to the sparse multiplication of large matrices, but the multiplication of small dense blocks that the large sparse matrix was decomposed into. + +- `flops 23 x 23 x 23`: indicates that batched matrix-matrix multiplication kernels with matrix dimensions (m, n, k) = (23, 23, 23) was run, and provides info on its flops. If several batched matrix-matrix multiplications of different matrix dimensions (m, n, k) were run, they would appear as subsequent separate rows. +- `flops inhomo. stacks`: flops of so-called "inhomogeneous stacks". These are stacks of batched-matrix-matrix multiplications where not all multiplications contained have the same matrix dimensions (m, n, k). +- `flops total`: total flops for all stacks of matrix-matrix multiplication. +- `flops max/rank`: flops of the MPI rank with the most flops. +- `matmuls inhomo. stacks`: number of matrix-matrix multiplications run in inhomogeneous stacks. +- `matmuls total`: number of matrix-matrix multiplications run in total. +- `number of processed stacks`: number of stacks of batched matrix-matrix multiplication. +- `average stack size`: average over all stacks of the stack size (i.e. the number of matrix-matrix multiplications that a stack contains). + +### Timings + +Example of the statistics section of the output file: + +``` +------------------------------------------------------------------------------- +- - +- T I M I N G - +- - +------------------------------------------------------------------------------- +SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXRANK + MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM +dbcsr_performance_driver 1 1.0 0.000 0.000 102.563 102.563 0 +dbcsr_perf_multiply_low 1 2.0 0.002 0.002 102.563 102.563 0 +perf_multiply 1 3.0 0.003 0.003 102.077 102.077 0 +[...] 
+-------------------------------------------------------------------------------
+```
+
+The columns describe:
+
+- `SUBROUTINE`: the name of the fortran subroutine (or c++ function) timed.
+- `CALLS`: number of times the subroutine was called.
+- `ASD`: average stack depth, i.e. the average number of entries on the call stack when this subroutine is called.
+- `SELF TIME`: how much time is spent in the subroutine itself, or in non-timed subroutines called by this subroutine.
+  - `AVERAGE`: the self time averaged over all MPI ranks.
+  - `MAXIMUM`: the maximum self time over all MPI ranks.
+  - `AVERAGE` and `MAXIMUM` can be used to locate load imbalance or synchronization points.
+- `TOTAL TIME`: how much time is spent in the subroutine, including the time spent in timed subroutines.
+  - `AVERAGE`: averaged over all MPI ranks.
+  - `MAXIMUM`: maximum over all MPI ranks.
+  - `AVERAGE` and `MAXIMUM` can be used to locate load imbalance or synchronization points.
+- `MAXRANK`: the MPI rank on which the maximum time was measured.
+
+#### Time spent in Just-In-Time (JIT) Compilation
+
+For performance debugging, and in order to check how much time a program spends doing JIT, look for the functions `jit_kernel_multiply` and `jit_kernel_transpose`.
+
+#### How to Time a Function
+
+By default, the most important subroutines are timed in DBCSR.
+
+If you want to time a subroutine or function that is not timed already, call `timeset` with a routine name and a handle at the beginning of the function, and `timestop` with the same handle at the end of the function.
+
+For examples, just `grep` for `timeset` and `timestop` in the codebase.
+
+This can be done both in fortran code and in c++ code.
diff --git a/docs/guide/3-developer-guide/4-performance/2-just-in-time-compilation.md b/docs/guide/3-developer-guide/4-performance/2-just-in-time-compilation.md
new file mode 100644
index 00000000000..dec9dbb9d14
--- /dev/null
+++ b/docs/guide/3-developer-guide/4-performance/2-just-in-time-compilation.md
@@ -0,0 +1,14 @@
+title: Just-In-Time Compilation
+
+# Just-In-Time (JIT) Compilation in libsmm_acc
+
+DBCSR's GPU backend, libsmm_acc, uses heavily templated CUDA/HIP kernels for its batched multiplication and transpose.
+
+If DBCSR were to compile kernels for all possible `m, n, k`s (or, in the case of the transpose, for all possible `m, n`s) ahead-of-time (AOT), this would bloat the library and make compilation much longer.
+Instead, kernels are JIT-ed on the fly, at runtime, as they are requested by the user. `libsmm_acc`'s JIT infrastructure is based on the CUDA library [NVRTC](https://docs.nvidia.com/cuda/nvrtc/), a runtime compilation library for CUDA C++ (see the sketch at the end of this page).
+
+On NVIDIA's P100, the overhead of JIT has been found to be around 400 ms for one kernel - a negligible overhead for typical DBCSR (and CP2K) runs.
+On AMD GPUs, however, the overhead has been found to be several seconds, a real hindrance to performance.
+
+For performance debugging, and in order to check how much time a program spends doing JIT, look for the functions `jit_kernel_multiply` and `jit_kernel_transpose` in the [timings report](./1-insights.html) at the end of the output file.
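+
+To illustrate the mechanism only (this is a generic, self-contained sketch and not `libsmm_acc`'s actual implementation; the kernel string, names and compile option are made up), a kernel held as a source string can be compiled with NVRTC and loaded through the CUDA driver API roughly as follows:
+
+```cpp
+#include <cuda.h>
+#include <nvrtc.h>
+#include <vector>
+
+// Kernel source held as a string; in a real JIT scheme the string would be
+// generated at runtime for the requested block sizes.
+static const char* kernel_src =
+    "extern \"C\" __global__ void scale(double* x, double a, int n) {\n"
+    "  int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
+    "  if (i < n) x[i] *= a;\n"
+    "}\n";
+
+// Compile the string to PTX and load it into a CUfunction.
+// Error checking omitted for brevity; assumes a CUDA context already exists.
+CUfunction jit_compile_scale() {
+  nvrtcProgram prog;
+  nvrtcCreateProgram(&prog, kernel_src, "scale.cu", 0, nullptr, nullptr);
+  const char* opts[] = {"--gpu-architecture=compute_60"};  // assumption: P100
+  nvrtcCompileProgram(prog, 1, opts);
+  size_t ptx_size;
+  nvrtcGetPTXSize(prog, &ptx_size);
+  std::vector<char> ptx(ptx_size);
+  nvrtcGetPTX(prog, ptx.data());
+  nvrtcDestroyProgram(&prog);
+  CUmodule module;
+  CUfunction kernel;
+  cuModuleLoadDataEx(&module, ptx.data(), 0, nullptr, nullptr);
+  cuModuleGetFunction(&kernel, module, "scale");
+  return kernel;  // launched later with cuLaunchKernel
+}
+```
+
+In such a scheme the compiled handle is typically cached, so the compilation overhead quoted above is paid only the first time a given kernel is requested.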
+ diff --git a/docs/guide/3-developer-guide/4-performance/index.md b/docs/guide/3-developer-guide/4-performance/index.md new file mode 100644 index 00000000000..2de023cdac7 --- /dev/null +++ b/docs/guide/3-developer-guide/4-performance/index.md @@ -0,0 +1 @@ +title: Performance diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 39de98431a1..167516e9516 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,8 +1,10 @@ -set(DBCSR_PROGRAM_SRCS dbcsr_example_1.F dbcsr_example_2.F dbcsr_example_3.F - dbcsr_tensor_example.F) +set(DBCSR_PROGRAM_SRCS_FTN dbcsr_example_1.F dbcsr_example_2.F + dbcsr_example_3.F dbcsr_tensor_example_1.F) -# register each program source file as executable -foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS}) +set(DBCSR_PROGRAM_SRCS_CPP dbcsr_example_3.cpp dbcsr_tensor_example_2.cpp) + +# Compile Fortran examples +foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_FTN}) get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE) if (USE_HIP) hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src}) @@ -17,23 +19,49 @@ foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS}) Fortran) endforeach () +# Compile C++ examples if (WITH_C_API) - if (USE_HIP) - hip_add_executable(dbcsr_example_3_cpp dbcsr_example_3.cpp) - hip_add_executable(dbcsr_tensor_example_1_cpp dbcsr_tensor_example_1.cpp) - else () - add_executable(dbcsr_example_3_cpp dbcsr_example_3.cpp) - add_executable(dbcsr_tensor_example_1_cpp dbcsr_tensor_example_1.cpp) - endif () - target_link_libraries(dbcsr_example_3_cpp dbcsr_c MPI::MPI_CXX) - target_link_libraries(dbcsr_tensor_example_1_cpp dbcsr_c MPI::MPI_CXX) + foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_CPP}) + get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE) + set(dbcsr_program_name ${dbcsr_program_name}_cpp) + if (USE_HIP) + hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src}) + else () + add_executable(${dbcsr_program_name} ${dbcsr_program_src}) + endif () + target_link_libraries(${dbcsr_program_name} dbcsr_c MPI::MPI_CXX) - if (CMAKE_CXX_COMPILER_ID STREQUAL "Cray") - # for recent Cray compiler versions CMake doesn't know - target_compile_options(dbcsr_example_3_cpp PRIVATE "-hstd=c++14") - target_compile_options(dbcsr_tensor_example_1_cpp PRIVATE "-hstd=c++14") - else () - target_compile_features(dbcsr_example_3_cpp PRIVATE cxx_std_14) - target_compile_features(dbcsr_tensor_example_1_cpp PRIVATE cxx_std_14) - endif () + if (CMAKE_CXX_COMPILER_ID STREQUAL "Cray") + # for recent Cray compiler versions CMake doesn't know + target_compile_options(${dbcsr_program_name} PRIVATE "-hstd=c++14") + else () + target_compile_features(${dbcsr_program_name} PRIVATE cxx_std_14) + endif () + endforeach () endif () + +# =================================== DOCUMENTATION GENERATION Copy example +# source files into the build directory so that their documentation can be +# generated by FORD + +set(DBCSR_PROGRAM_SRCS ${DBCSR_PROGRAM_SRCS_FTN} ${DBCSR_PROGRAM_SRCS_CPP}) + +# Make a list of the copy commands +set(example_copy_commands) +foreach (example ${DBCSR_PROGRAM_SRCS}) + list( + APPEND + example_copy_commands + COMMAND + ${CMAKE_COMMAND} + -E + copy + ${CMAKE_SOURCE_DIR}/examples/${example} + ${CMAKE_BINARY_DIR}/examples) +endforeach () + +add_custom_target( + doc_copy_examples + COMMENT "Copy examples for documentation generation" + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/examples ${example_copy_commands} + VERBATIM) diff --git a/examples/README.md b/examples/README.md index 
f7c8ba746a1..b8634b6fb37 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,9 +1,34 @@ -# DBCSR Examples +# Examples -These examples require to be executed with MPI. -Furthermore, MPI_THREAD_FUNNELED mode is required -if you are using threading. -Make sure you compile DBCSR accordingly. +- [`dbcsr_example_1`](dbcsr_example_1.F): how to create a dbcsr matrix (fortran) +- [`dbcsr_example_2`](dbcsr_example_2.F): how to set a dbcsr matrix (fortran) +- `dbcsr_example_3`: how to multiply two dbcsr matrices ([fortran](dbcsr_example_3.F) and [cpp](dbcsr_example_3.cpp)) +- [`dbcsr_tensor_example_1`](dbcsr_tensor_example_1.F): how to create a dbcsr matrix (fortran) + - the example can be run with different parameters, controlling block size, sparsity, verbosity and more +- [`dbcsr_tensor_example_2`](dbcsr_tensor_example_2.cpp): tensor contraction example (cpp) + - tensor1 x tensor2 = tensor3, (13|2)x(54|21)=(3|45) + +See the [examples' documentation](../docs/guide/2-user-guide/2-examples/index.md). + +## Build + +Compile the DBCSR library, using `-DUSE_MPI=ON -DWITH_EXAMPLES=ON`. + +The examples require MPI. Furthermore, if you are using threading, MPI_THREAD_FUNNELED mode is required. + +## Run + +You can run the examples, for instance from the `build` directory, as follows: + +```bash +srun -N 1 --ntasks-per-core 2 --ntasks-per-node 12 --cpus-per-task 2 ./examples/dbcsr_example_1 +``` + +### Run tensor examples + +How to run (this example and DBCSR for tensors in general): +* best performance is obtained by running with mpi and one openmp thread per rank. +* ideally number of mpi ranks should be composed of small prime factors (e.g. powers of 2). +* for sparse data & heterogeneous block sizes, DBCSR should be run on CPUs with libxsmm backend. +* for dense data best performance is obtained by choosing homogeneous block sizes of 64 and by compiling with GPU support. -1) Compile and install DBCSR, following the procedure described in the DBCSR - installation guide, and using `-DWITH_EXAMPLES=ON`. diff --git a/examples/dbcsr_example_1.F b/examples/dbcsr_example_1.F index fe3bd3f7a44..837850001cc 100644 --- a/examples/dbcsr_example_1.F +++ b/examples/dbcsr_example_1.F @@ -8,8 +8,8 @@ !--------------------------------------------------------------------------------------------------! PROGRAM dbcsr_example_1 - !! DBCSR example 1 - !! This example shows how to create a dbcsr matrix + !! DBCSR example 1: + !! This example shows how to create a DBCSR matrix USE mpi USE dbcsr_api, ONLY: & @@ -46,7 +46,7 @@ PROGRAM dbcsr_example_1 !$ ENDIF ! - ! setup the mp environment + ! setup the mpi environment CALL mpi_comm_size(MPI_COMM_WORLD, numnodes, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_size" npdims(:) = 0 @@ -61,7 +61,7 @@ PROGRAM dbcsr_example_1 !*************************************************************************************** ! - ! initialize libdbcsr + ! initialize the DBCSR library CALL dbcsr_init_lib(MPI_COMM_WORLD) ! @@ -81,11 +81,11 @@ PROGRAM dbcsr_example_1 CALL random_dist(col_dist, nblkcols_total, npdims(2)) ! - ! set the dbcsr distribution object + ! set the DBCSR distribution object CALL dbcsr_distribution_new(dist, group=group, row_dist=row_dist, col_dist=col_dist, reuse_arrays=.TRUE.) ! - ! create the dbcsr matrix, i.e. a double precision non symmetric matrix + ! create the DBCSR matrix, i.e. a double precision non symmetric matrix ! with nblkrows_total x nblkcols_total blocks and ! sizes "sum(row_blk_sizes)" x "sum(col_blk_sizes)", distributed as ! 
specified by the dist object @@ -99,7 +99,7 @@ PROGRAM dbcsr_example_1 reuse_arrays=.TRUE.) ! - ! finalize the dbcsr matrix + ! finalize the DBCSR matrix CALL dbcsr_finalize(matrix_a) ! @@ -121,8 +121,9 @@ PROGRAM dbcsr_example_1 CALL mpi_comm_free(group, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_free" - ! finalize libdbcsr + ! finalize the DBCSR library CALL dbcsr_finalize_lib() + ! ! finalize mpi CALL mpi_finalize(ierr) diff --git a/examples/dbcsr_example_2.F b/examples/dbcsr_example_2.F index 1945f1e3c4b..4fa1c2a61a2 100644 --- a/examples/dbcsr_example_2.F +++ b/examples/dbcsr_example_2.F @@ -8,8 +8,8 @@ !--------------------------------------------------------------------------------------------------! PROGRAM dbcsr_example_2 - !! DBCSR example 2 - !! This example shows how to set a dbcsr matrix + !! DBCSR example 2: + !! This example shows how to set a DBCSR matrix USE mpi USE dbcsr_api, ONLY: & @@ -49,7 +49,7 @@ PROGRAM dbcsr_example_2 !$ ENDIF ! - ! setup the mp environment + ! setup the mpi environment CALL mpi_comm_size(MPI_COMM_WORLD, numnodes, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_size" npdims(:) = 0 @@ -64,7 +64,7 @@ PROGRAM dbcsr_example_2 !*************************************************************************************** ! - ! initialize libdbcsr + ! initialize the DBCSR library CALL dbcsr_init_lib(MPI_COMM_WORLD) ! @@ -84,11 +84,11 @@ PROGRAM dbcsr_example_2 CALL random_dist(col_dist, nblkcols_total, npdims(2)) ! - ! set the dbcsr distribution object + ! set the DBCSR distribution object CALL dbcsr_distribution_new(dist, group=group, row_dist=row_dist, col_dist=col_dist, reuse_arrays=.TRUE.) ! - ! create the dbcsr matrix, i.e. a double precision non symmetric matrix + ! create the DBCSR matrix, i.e. a double precision non symmetric matrix ! with nblkrows_total x nblkcols_total blocks and ! sizes "sum(row_blk_sizes)" x "sum(col_blk_sizes)", distributed as ! specified by the dist object @@ -139,7 +139,7 @@ PROGRAM dbcsr_example_2 DEALLOCATE (values) ! - ! finalize the dbcsr matrix + ! finalize the DBCSR matrix CALL dbcsr_finalize(matrix_a) ! @@ -160,8 +160,9 @@ PROGRAM dbcsr_example_2 CALL mpi_comm_free(group, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_free" - ! finalize libdbcsr + ! finalize the DBCSR library CALL dbcsr_finalize_lib() + ! ! finalize mpi CALL mpi_finalize(ierr) diff --git a/examples/dbcsr_example_3.F b/examples/dbcsr_example_3.F index 62f886397a9..f7acda3aaf7 100644 --- a/examples/dbcsr_example_3.F +++ b/examples/dbcsr_example_3.F @@ -8,7 +8,7 @@ !--------------------------------------------------------------------------------------------------! PROGRAM dbcsr_example_3 - !! DBCSR example 3 + !! DBCSR example 3: !! This example shows how to multiply two dbcsr matrices USE mpi @@ -50,7 +50,7 @@ PROGRAM dbcsr_example_3 !$ ENDIF ! - ! setup the mp environment + ! setup the mpi environment CALL mpi_comm_size(MPI_COMM_WORLD, numnodes, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_size" npdims(:) = 0 @@ -65,7 +65,7 @@ PROGRAM dbcsr_example_3 !*************************************************************************************** ! - ! initialize libdbcsr + ! initialize the DBCSR library CALL dbcsr_init_lib(MPI_COMM_WORLD) ! @@ -185,8 +185,9 @@ PROGRAM dbcsr_example_3 CALL mpi_comm_free(group, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_free" - ! finalize libdbcsr + ! finalize the DBCSR library CALL dbcsr_finalize_lib() + ! ! 
finalize mpi CALL mpi_finalize(ierr) diff --git a/examples/dbcsr_example_3.cpp b/examples/dbcsr_example_3.cpp index 3d94374477a..58a9cc046cf 100644 --- a/examples/dbcsr_example_3.cpp +++ b/examples/dbcsr_example_3.cpp @@ -33,15 +33,19 @@ std::vector random_dist(int dist_size, int nbins) } +// DBCSR example 3 +// This example shows how to multiply two DBCSR matrices int main(int argc, char* argv[]) { + // initialize MPI MPI_Init(&argc, &argv); + // setup the mpi environment int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - // Make 2D grid + // make 2D grid int dims[2] = {0}; MPI_Dims_create(mpi_size, 2, dims); int periods[2] = {1}; @@ -58,20 +62,22 @@ int main(int argc, char* argv[]) << ", (" << coord[0] << ", " << coord[1] << ") in the 2D grid" << std::endl; + // initialize the DBCSR library c_dbcsr_init_lib(MPI_COMM_WORLD, nullptr); - // Total number of blocks + // the matrix will contain nblkrows_total row blocks and nblkcols_total column blocks int nblkrows_total = 4; int nblkcols_total = 4; - // Block sizes + // set the block size for each row and column std::vector row_blk_sizes(nblkrows_total, 2), col_blk_sizes(nblkcols_total, 2); + // set the row and column distributions (here the distribution is set randomly) auto row_dist = random_dist(nblkrows_total, dims[0]); auto col_dist = random_dist(nblkcols_total, dims[1]); + // set the DBCSR distribution object void* dist = nullptr; - c_dbcsr_distribution_new(&dist, group, row_dist.data(), row_dist.size(), col_dist.data(), col_dist.size()); @@ -103,7 +109,12 @@ int main(int argc, char* argv[]) } }; - // create and fill matrix a + // create the DBCSR matrices, i.e. a double precision non symmetric matrix + // with nblkrows_total x nblkcols_total blocks and + // sizes "sum(row_blk_sizes)" x "sum(col_blk_sizes)", distributed as + // specified by the dist object + + // create, fill and finalize matrix a void* matrix_a = nullptr; c_dbcsr_create_new_d(&matrix_a, "this is my matrix a", dist, 'N', row_blk_sizes.data(), row_blk_sizes.size(), @@ -111,7 +122,7 @@ int main(int argc, char* argv[]) fill_matrix(matrix_a); c_dbcsr_finalize(matrix_a); - // create and fill matrix b + // create, fill and finalize matrix b void* matrix_b = nullptr; c_dbcsr_create_new_d(&matrix_b, "this is my matrix b", dist, 'N', row_blk_sizes.data(), row_blk_sizes.size(), @@ -119,7 +130,7 @@ int main(int argc, char* argv[]) fill_matrix(matrix_b); c_dbcsr_finalize(matrix_b); - // create matrix c, empty + // create and finalize matrix c (empty) void* matrix_c = nullptr; c_dbcsr_create_new_d(&matrix_c, "matrix c", dist, 'N', row_blk_sizes.data(), row_blk_sizes.size(), @@ -129,21 +140,25 @@ int main(int argc, char* argv[]) // multiply the matrices c_dbcsr_multiply_d('N', 'N', 1.0, &matrix_a, &matrix_b, 0.0, &matrix_c, nullptr); + // print the matrices c_dbcsr_print(matrix_a); c_dbcsr_print(matrix_b); c_dbcsr_print(matrix_c); + // release the matrices c_dbcsr_release(&matrix_a); c_dbcsr_release(&matrix_b); c_dbcsr_release(&matrix_c); c_dbcsr_distribution_release(&dist); - + // free comm MPI_Comm_free(&group); + // finalize the DBCSR library c_dbcsr_finalize_lib(); + // finalize MPI MPI_Finalize(); return 0; diff --git a/examples/dbcsr_tensor_example.F b/examples/dbcsr_tensor_example_1.F similarity index 99% rename from examples/dbcsr_tensor_example.F rename to examples/dbcsr_tensor_example_1.F index d6b200bed8c..32b41cae1e4 100644 --- a/examples/dbcsr_tensor_example.F +++ b/examples/dbcsr_tensor_example_1.F @@ -7,8 +7,8 
@@ ! SPDX-License-Identifier: GPL-2.0+ ! !--------------------------------------------------------------------------------------------------! -program dbcsr_tensor_example - +program dbcsr_tensor_example_1 + !! Sparse tensor contraction example use mpi use dbcsr_api, only: & dbcsr_type, dbcsr_distribution_type, dbcsr_init_lib, dbcsr_distribution_new, & @@ -180,7 +180,7 @@ program dbcsr_tensor_example beta = beta*scale_exp gamma = gamma*scale_exp - ! iniialize mpi + ! initialize mpi call mpi_init(ierr) if (ierr /= 0) stop "error in mpi_init" diff --git a/examples/dbcsr_tensor_example_1.cpp b/examples/dbcsr_tensor_example_2.cpp similarity index 100% rename from examples/dbcsr_tensor_example_1.cpp rename to examples/dbcsr_tensor_example_2.cpp diff --git a/src/acc/README.md b/src/acc/README.md index 750d8dd5749..3355b527109 100644 --- a/src/acc/README.md +++ b/src/acc/README.md @@ -2,7 +2,14 @@ ## Overview -This folder contains the ISO_C_BINDING based Fortran code of DBCSR's [ACC-backend interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h) and [LIBSMM/ACC-interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h). Further, two stand-alone sample codes are given exercising both interfaces (benchmarks). +This folder contains the ISO_C_BINDING based Fortran code of DBCSR's [ACC-backend interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h) and [LIBSMM/ACC-interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h). It also contains the CUDA (for Nvidia GPUs) and HIP (for AMD GPUs) accelerator backends. + +Further, two stand-alone sample codes are given exercising both interfaces (benchmarks). + +## CUDA and HIP backends + +The code for both the CUDA and HIP backends is unique, and can be found in the `cuda` directory. +We switch from one backend to the other via macros (`__CUDA` and `__HIP`). ## Benchmarks diff --git a/src/acc/libsmm_acc/kernels/README.md b/src/acc/libsmm_acc/kernels/README.md index e49a3348d01..acc9d8f49ae 100644 --- a/src/acc/libsmm_acc/kernels/README.md +++ b/src/acc/libsmm_acc/kernels/README.md @@ -1,19 +1,40 @@ # libsmm_acc/kernels +`libsmm_acc`'s GPU kernels. + ## Directory Organization * [`autotuning_properties.json`](autotuning_properties.json) Properties of the autotuning procedure, read from [DBCSR source code](../libsmm_acc_benchmark.cpp) +* [`gpu_properties.json`](gpu_properties.json) GPU card properties + * [`smm_acc_common.h`](smm_acc_common.h) Functionalities common to kernel CUDA/HIP codes * [`smm_acc_dnt_base.py`](smm_acc_dnt_base.py) Kernel base class - * `smm_acc_dnt_ALGORITHM.py` Kernel class + * `smm_acc_dnt_ALGORITHM.py` Kernel class in python - * `smm_acc_dnt_ALGORITHM.h` Kernel CUDA/HIP code + * `smm_acc_dnt_ALGORITHM.h` Batched Multiply Kernel CUDA/HIP code * [`smm_acc_predict.py`](smm_acc_predict.py) Class and helper functions for parameter prediction procedure -* [`smm_acc_transpose.h`](smm_acc_transpose.h) Transposition CUDA/HIP code +* [`smm_acc_transpose.h`](smm_acc_transpose.h) Transpose CUDA/HIP code + +## Batched Multiplication Kernels + +All kernels have following signature: + +``` +template + +__global__ void +__launch_bounds__(threads, minblocks) +smm_acc_dnt_ALGORITHM +(const int *__restrict__ param_stack, const int stack_size, +const double* __restrict__ a_data, const double* __restrict__ b_data, double* c_data); +``` + +At kernel launch time, the A, B, and C matrices, as well as the product descriptors (the so-called stacks) are all located in global memory on the GPU. 
Each entry in the stack describes one matrix-matrix product: it contains three pointers to the blocks in the A, B, and C matrices. After the kernel has read a stack entry, it fetches the blocks in matrices A and B from global to shared memory, and updates the C matrix with the product of A and B. + +`libsmm_acc` provides 5 different kernels for this operation (tiny, small, medium, largeDB1, largeDB2), which are optimized for different block sizes. Please refer to the documentation inside the respective `.h` files for more details. -* [`gpu_properties.json`](gpu_properties.json) GPU card properties diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a4ca3bc7a4a..b33ba5800ab 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -57,7 +57,7 @@ endforeach () # =================================== DBCSR CORRECTNESS TESTS Define all the # tests here, will be used as the executable name -set(DBCSR_TESTS +set(DBCSR_TESTS_FTN dbcsr_unittest1 dbcsr_unittest2 dbcsr_unittest3 @@ -67,12 +67,9 @@ set(DBCSR_TESTS dbcsr_test_csr_conversions) if (NOT (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")) - set(DBCSR_TESTS_CPP dbcsr_tensor_test_cpp) + set(DBCSR_TESTS_SRCS_CPP dbcsr_tensor_test.cpp) endif () -# Common object files linked to all tests -set(dbcsr_unittest_common_SRCS dbcsr_test_add.F dbcsr_test_multiply.F) - # For each test, set a variable testname_SRCS defining the sources of that test set(dbcsr_unittest1_SRCS dbcsr_unittest1.F) set(dbcsr_unittest2_SRCS dbcsr_unittest2.F) @@ -83,6 +80,15 @@ set(dbcsr_tas_unittest_SRCS dbcsr_tas_unittest.F) set(dbcsr_test_csr_conversions_SRCS dbcsr_test_csr_conversions.F) set(dbcsr_tensor_test_cpp_SRCS dbcsr_tensor_test.cpp) +# Make a list of the source files of fortran tests +set(DBCSR_TESTS_SRCS_FTN) +foreach (dbcsr_test ${DBCSR_TESTS_FTN}) + set(DBCSR_TESTS_SRCS_FTN ${DBCSR_TESTS_SRCS_FTN} ${${dbcsr_test}_SRCS}) +endforeach () + +# Common object files linked to all tests +set(dbcsr_unittest_common_SRCS dbcsr_test_add.F dbcsr_test_multiply.F) + # instead of building a full-blown lib, it would be better to simply build an # OBJECT lib, but we would need cmake 3.12 to be able to specify # target_link_libraries on those to get the proper compile flags @@ -98,7 +104,8 @@ if (APPLE AND BLAS_LIBRARIES MATCHES "Accelerate") endif () target_link_libraries(dbcsr_unittest_common PUBLIC dbcsr) -foreach (dbcsr_test ${DBCSR_TESTS}) +# Compile Fortran tests +foreach (dbcsr_test ${DBCSR_TESTS_FTN}) if (USE_HIP) hip_add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS}) else () @@ -125,17 +132,8 @@ foreach (dbcsr_test ${DBCSR_TESTS}) endforeach () # set the __SHORT_FILE__ per file for dbcsr sources -foreach ( - tests_src - ${DBCSR_PERF_SRCS} - ${dbcsr_unittest_common_SRCS} - ${dbcsr_unittest1_SRCS} - ${dbcsr_unittest2_SRCS} - ${dbcsr_unittest3_SRCS} - ${dbcsr_unittest4_SRCS} - ${dbcsr_tensor_unittest_SRCS} - ${dbcsr_tas_unittest_SRCS} - ${dbcsr_test_csr_conversions_SRCS}) +foreach (tests_src ${DBCSR_PERF_SRCS} ${dbcsr_unittest_common_SRCS} + ${DBCSR_TESTS_SRCS_FTN}) # add_fypp_sources returns a path in the current binary dir get_filename_component(short_file "${tests_src}" NAME) set_source_files_properties( @@ -143,128 +141,135 @@ foreach ( endforeach () if (WITH_C_API) - foreach (dbcsr_test_cpp ${DBCSR_TESTS_CPP}) - add_executable(${dbcsr_test_cpp} ${${dbcsr_test_cpp}_SRCS}) - target_link_libraries(${dbcsr_test_cpp} dbcsr_c MPI::MPI_CXX) + foreach (dbcsr_test_cpp_src ${DBCSR_TESTS_SRCS_CPP}) + get_filename_component(dbcsr_test_cpp_name ${dbcsr_test_cpp_src} NAME_WE) 
+ add_executable(${dbcsr_test_cpp_name} ${dbcsr_test_cpp_src}) + target_link_libraries(${dbcsr_test_cpp_name} dbcsr_c MPI::MPI_CXX) # register unittest executable with CMake if (USE_MPI) separate_arguments(MPIEXEC_PREFLAGS) add_test( - NAME ${dbcsr_test_cpp} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${num_ranks} - ${MPIEXEC_PREFLAGS} ./${dbcsr_test_cpp} ${MPIEXEC_POSTFLAGS}) + NAME ${dbcsr_test_cpp_name} + COMMAND + ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${num_ranks} + ${MPIEXEC_PREFLAGS} ./${dbcsr_test_cpp_name} ${MPIEXEC_POSTFLAGS}) else () - add_test(NAME ${dbcsr_test_cpp} COMMAND ./${dbcsr_test_cpp}) + add_test(NAME ${dbcsr_test_cpp_name} COMMAND ./${dbcsr_test_cpp_name}) endif () if (OpenMP_FOUND) set_tests_properties( - ${dbcsr_test_cpp} PROPERTIES ENVIRONMENT - OMP_NUM_THREADS=${TEST_OMP_THREADS}) + ${dbcsr_test_cpp_name} PROPERTIES ENVIRONMENT + OMP_NUM_THREADS=${TEST_OMP_THREADS}) endif () endforeach () endif () # =================================== GPU BACKEND TESTS (CUDA / HIP) + +# Add custom commands for the test files that need to be generated from a +# template +file(RELATIVE_PATH CURRENT_BINARY_DIR_RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/.. + ${CMAKE_CURRENT_BINARY_DIR}) + +# libsmm_acc_unittest_multiply +add_custom_command( + OUTPUT libsmm_acc_unittest_multiply.cpp + COMMAND + ${Python_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_unittest_multiply.py + --base_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. --out_dir + ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU} + DEPENDS libsmm_acc_unittest_multiply.cpp.template + generate_libsmm_acc_unittest_multiply.py + COMMENT "Generate tests/libsmm_acc_unittest_multiply.cpp") +add_custom_target(generate_libsmm_acc_unittest_multiply_test_cpp + DEPENDS libsmm_acc_unittest_multiply.cpp) + +# libsmm_acc_timer_multiply +add_custom_command( + OUTPUT libsmm_acc_timer_multiply.cpp + COMMAND + ${Python_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_timer_multiply.py --base_dir + ${CMAKE_CURRENT_SOURCE_DIR}/.. --out_dir ${CURRENT_BINARY_DIR_RELATIVE} + --gpu_version=${WITH_GPU} + DEPENDS libsmm_acc_timer_multiply.cpp.template + generate_libsmm_acc_timer_multiply.py + COMMENT "Generate tests/libsmm_acc_unittest_transpose.cpp") +add_custom_target(generate_libsmm_acc_timer_multiply_test_cpp + DEPENDS libsmm_acc_timer_multiply.cpp) + if (USE_CUDA OR USE_HIP) # All libsmm_acc tests - set(LIBSMM_ACC_TESTS_BUILD - libsmm_acc_unittest_multiply libsmm_acc_unittest_transpose - libsmm_acc_timer_multiply) + set(LIBSMM_ACC_TESTS_SRCS + ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_unittest_multiply.cpp + ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_timer_multiply.cpp + libsmm_acc_unittest_transpose.cpp) # Tests that need no additional arguments to be run - set(LIBSMM_ACC_SIMPLE_TESTS libsmm_acc_unittest_multiply - libsmm_acc_unittest_transpose) - - # Add custom commands for the test files that need to be generated from a - # template libsmm_acc_unittest_multiply, libsmm_acc_timer_multiply - file(RELATIVE_PATH CURRENT_BINARY_DIR_RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/.. - ${CMAKE_CURRENT_BINARY_DIR}) - - add_custom_command( - OUTPUT libsmm_acc_unittest_multiply.cpp - COMMAND - ${Python_EXECUTABLE} - ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_unittest_multiply.py - --base_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. 
--out_dir
-      ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU}
-    DEPENDS libsmm_acc_unittest_multiply.cpp.template
-            generate_libsmm_acc_unittest_multiply.py
-    COMMENT "Generate tests/libsmm_acc_unittest_multiply.cpp")
-
-  add_custom_command(
-    OUTPUT libsmm_acc_timer_multiply.cpp
-    COMMAND
-      ${Python_EXECUTABLE}
-      ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_timer_multiply.py
-      --base_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. --out_dir
-      ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU}
-    DEPENDS libsmm_acc_timer_multiply.cpp.template
-            generate_libsmm_acc_timer_multiply.py
-    COMMENT "Generate tests/libsmm_acc_unittest_transpose.cpp")
+  set(LIBSMM_ACC_NOARG_TESTS libsmm_acc_unittest_multiply
+                             libsmm_acc_unittest_transpose)
 
   # Add executables for all libsmm_acc tests
   if (USE_CUDA)
-    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_BUILD})
+    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
+
+      get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE)
-      add_executable(${libsmm_acc_test} ${libsmm_acc_test}.cpp)
-      target_compile_definitions(${libsmm_acc_test} PRIVATE __CUDA)
+      add_executable(${libsmm_acc_test_name} ${libsmm_acc_test})
+      target_compile_definitions(${libsmm_acc_test_name} PRIVATE __CUDA)
       target_include_directories(
-        ${libsmm_acc_test} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        ${libsmm_acc_test_name}
+        PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-      target_link_libraries(${libsmm_acc_test} dbcsr)
+      target_link_libraries(${libsmm_acc_test_name} dbcsr)
       if (OpenMP_FOUND)
-        target_link_libraries(${libsmm_acc_test} OpenMP::OpenMP_CXX)
+        target_link_libraries(${libsmm_acc_test_name} OpenMP::OpenMP_CXX)
       endif ()
     endforeach ()
   else () # i.e. USE_HIP
-    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_BUILD})
-      set_source_files_properties(${libsmm_acc_test}.cpp
+    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
+      set_source_files_properties(${libsmm_acc_test}
                                   PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-    endforeach ()
-    hip_add_executable(
-      libsmm_acc_unittest_multiply
-      ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_unittest_multiply.cpp)
-    target_link_options(libsmm_acc_unittest_multiply PRIVATE ${HIP_ARCH_FLAGS})
-    hip_add_executable(
-      libsmm_acc_timer_multiply
-      ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_timer_multiply.cpp)
-    target_link_options(libsmm_acc_timer_multiply PRIVATE ${HIP_ARCH_FLAGS})
-    hip_add_executable(libsmm_acc_unittest_transpose
-                       libsmm_acc_unittest_transpose.cpp)
-    target_link_options(libsmm_acc_unittest_transpose PRIVATE ${HIP_ARCH_FLAGS})
+      get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE)
+
+      hip_add_executable(${libsmm_acc_test_name} ${libsmm_acc_test})
+      target_link_options(${libsmm_acc_test_name} PRIVATE ${HIP_ARCH_FLAGS})
+
+      target_link_libraries(${libsmm_acc_test_name} dbcsr)
+      target_compile_definitions(${libsmm_acc_test_name} PRIVATE __HIP)
+
+    endforeach ()
 
     # Workaround issue in hip_add_library: explicitely write dependency between
-    # the unit test & the script that generates it
-    add_custom_target(generate_libsmm_acc_unittest_multiply_test_cpp
-                      DEPENDS libsmm_acc_unittest_multiply.cpp)
+    # the test executable and the generated test c++ source file
     add_dependencies(libsmm_acc_unittest_multiply
                      generate_libsmm_acc_unittest_multiply_test_cpp)
-    add_custom_target(generate_libsmm_acc_timer_multiply_test_cpp
-                      DEPENDS libsmm_acc_timer_multiply.cpp)
     add_dependencies(libsmm_acc_timer_multiply
                      generate_libsmm_acc_timer_multiply_test_cpp)
 
-    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_BUILD})
+    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
-      target_compile_definitions(${libsmm_acc_test} PRIVATE __HIP)
-      target_include_directories(${libsmm_acc_test}
+      get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE)
+      target_compile_definitions(${libsmm_acc_test_name} PRIVATE __HIP)
+      target_include_directories(${libsmm_acc_test_name}
                                  PRIVATE ${HIP_PATH}/../include)
-      target_link_libraries(${libsmm_acc_test} dbcsr)
+      target_link_libraries(${libsmm_acc_test_name} dbcsr)
     endforeach ()
   endif ()
 
   # Add tests that do not need additional arguments
-  foreach (libsmm_acc_test ${LIBSMM_ACC_SIMPLE_TESTS})
+  foreach (libsmm_acc_test ${LIBSMM_ACC_NOARG_TESTS})
     add_test(NAME ${libsmm_acc_test} COMMAND ${libsmm_acc_test})
   endforeach ()
@@ -275,3 +280,36 @@ if (USE_CUDA OR USE_HIP)
     COMMAND libsmm_acc_timer_multiply predicted)
 endif ()
+
+# =================================== DOCUMENTATION GENERATION Copy test source
+# files into the build directory so that their documentation can be generated by
+# FORD
+set(DBCSR_TESTS dbcsr_performance_driver.F ${DBCSR_TESTS_SRCS_FTN}
+    ${DBCSR_TESTS_SRCS_CPP} libsmm_acc_unittest_transpose.cpp)
+
+# Make a list of the copy commands
+set(test_copy_commands)
+foreach (test ${DBCSR_TESTS})
+  list(
+    APPEND
+    test_copy_commands
+    COMMAND
+    ${CMAKE_COMMAND}
+    -E
+    copy
+    ${CMAKE_SOURCE_DIR}/tests/${test}
+    ${CMAKE_BINARY_DIR}/tests)
+endforeach ()
+
+add_custom_target(
+  doc_copy_tests
+  COMMENT "Copy tests for documentation generation"
+  COMMAND mkdir -p ${CMAKE_BINARY_DIR}/tests ${test_copy_commands}
+  VERBATIM)
+
+# libsmm_acc_unittest_multiply.cpp and libsmm_acc_timer_multiply.cpp do not need
+# to be copied to the build directory since they are generated at build-time and
+# written to the build directory directly. We just need to make sure that the
+# documentation generation depends on the generation of these tests.
+add_dependencies(doc_copy_tests generate_libsmm_acc_unittest_multiply_test_cpp)
+add_dependencies(doc_copy_tests generate_libsmm_acc_timer_multiply_test_cpp)
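Editor's note: a minimal sketch of exercising the new `doc_copy_tests` target from an already-configured build directory. The target name comes from the hunk above; driving it through the generic `cmake --build` front end (rather than invoking the generator directly) is an assumption about the local setup.

```bash
# Stage the test sources into <build>/tests and trigger generation of the
# build-time .cpp tests, so FORD can document them without a full build.
cmake --build . --target doc_copy_tests
```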
diff --git a/tests/README b/tests/README
deleted file mode 100644
index d3abfa49765..00000000000
--- a/tests/README
+++ /dev/null
@@ -1,32 +0,0 @@
-!----------------------------------------------------------------------------------------------------!
-! Copyright (C) by the DBCSR developers group - All rights reserved !
-! This file is part of the DBCSR library. !
-! !
-! For information on the license, see the LICENSE file. !
-! For further information please visit https://dbcsr.cp2k.org !
-! SPDX-License-Identifier: GPL-2.0+ !
-!----------------------------------------------------------------------------------------------------!
-
-DBCSR Testing and Performance
-
-Performance:
-============
-
-* Building: compile libdbcsr as usual but use the 'dbcsr_performance_driver' target, i.e.:
-
-  > cd $LIBDBCSR_HOME
-  > make -j dbcsr_performance_driver
-
-* Running, examples:
-
-  Modify the input.perf input file and pass it as standard input
-  to the executable generated above, e.g.:
-
-  a) To test pure MPI performance test using [n] nodes:
-
-  > mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log
-
-  b) To test hybrid MPI/OpenMP performance test using [n] nodes, each spanning [t] threads:
-
-  > export OMP_NUM_THREADS=[t]; mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log
-
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000000..a263ba0d7dc
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,39 @@
+!----------------------------------------------------------------------------------------------------!
+! Copyright (C) by the DBCSR developers group - All rights reserved !
+! This file is part of the DBCSR library. !
+! !
+! For information on the license, see the LICENSE file. !
+! For further information please visit https://dbcsr.cp2k.org !
+! SPDX-License-Identifier: GPL-2.0+ !
+!----------------------------------------------------------------------------------------------------!
+
+# DBCSR Testing and Performance
+
+## Correctness tests
+
+- [dbcsr_tas_unittest](dbcsr_tas_unittest.F) : unit test for tall-and-skinny matrices
+- [dbcsr_tensor_test](dbcsr_tensor_test.cpp) : test the tensor contraction (13|2)x(54|21)=(3|45) 31 and other functions
+- [dbcsr_tensor_unittest](dbcsr_tensor_unittest.F) : unit test for tensor functionalities
+- [dbcsr_test_csr_conversions](dbcsr_test_csr_conversions.F) : test DBCSR to CSR conversion with random matrices
+- [dbcsr_unittest_1](dbcsr_unittest1.F) : test matrix operations: add, multiply and multiply-ghost
+- [dbcsr_unittest_2](dbcsr_unittest2.F) : test matrix-multiply with large blocks (block size=100) and rectangular matrices (block size=5)
+
+### GPU-backend correctness tests:
+
+- [dbcsr_unittest_3](dbcsr_unittest3.F) : test matrix-multiply with various block sizes that are run by the libsmm_acc GPU backend if DBCSR is compiled with GPU support
+- [libsmm_acc_unittest_multiply](libsmm_acc_unittest_multiply.cpp.template) : tests all libsmm_acc batch-multiplication kernels
+- [libsmm_acc_unittest_transpose](libsmm_acc_unittest_transpose.cpp) : tests all libsmm_acc transpose kernels
+
+## Performance tests
+
+DBCSR performance tests:
+
+- [dbcsr_performance_driver](dbcsr_performance_driver.F) : performance tester for matrix operations. The input matrices can be described in an input file in order to test different configurations. See below.
+
+### GPU backend performance tests:
+
+- [libsmm_acc_timer_multiply](libsmm_acc_timer_multiply.cpp.template) : times all libsmm_acc batch-multiplication kernels
+
+---
+
+See the [tests' documentation](../docs/guide/2-user-guide/2-tests/index.md).
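Editor's note: the new `tests/README.md` drops the explicit run commands of the old `tests/README` in favour of the linked documentation pages. For quick reference, a sketch of driving the performance tester follows; the commands mirror the deleted README, so the binary name `dbcsr_performance_driver.x` and its `./bin` location are taken from that file and may differ in a CMake build.

```bash
# Pure MPI run on [n] ranks, using the template input file
mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log

# Hybrid MPI/OpenMP run: [n] ranks, each spanning [t] threads
export OMP_NUM_THREADS=[t]
mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log
```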
diff --git a/tests/dbcsr_tas_unittest.F b/tests/dbcsr_tas_unittest.F
index 7c3971fce02..0281967f520 100644
--- a/tests/dbcsr_tas_unittest.F
+++ b/tests/dbcsr_tas_unittest.F
@@ -9,7 +9,7 @@
 
 PROGRAM dbcsr_tas_unittest
-   !! unit testing for tall-and-skinny matrices
+   !! Unit testing for tall-and-skinny matrices
    USE dbcsr_api, ONLY: dbcsr_finalize_lib, &
                         dbcsr_init_lib, &
                         dbcsr_print_statistics
diff --git a/tests/dbcsr_tensor_unittest.F b/tests/dbcsr_tensor_unittest.F
index 91bf8e2e9a3..e36af4c207e 100644
--- a/tests/dbcsr_tensor_unittest.F
+++ b/tests/dbcsr_tensor_unittest.F
@@ -8,7 +8,7 @@
 !----------------------------------------------------------------------------------------------------!
 
 PROGRAM dbcsr_tensor_unittest
-   !! DBCSR tensor unit test.
+   !! DBCSR tensor unit test
    USE dbcsr_api, ONLY: dbcsr_finalize_lib, &
                         dbcsr_init_lib, &
                         dbcsr_type_real_8, &
diff --git a/tests/dbcsr_unittest1.F b/tests/dbcsr_unittest1.F
index ce1c1b5f4a2..da6db071cf3 100644
--- a/tests/dbcsr_unittest1.F
+++ b/tests/dbcsr_unittest1.F
@@ -7,8 +7,9 @@
 ! SPDX-License-Identifier: GPL-2.0+ !
 !----------------------------------------------------------------------------------------------------!
 
-PROGRAM dbcsr_unittest
-   !! Tests for DBCSR operations
+PROGRAM dbcsr_unittest_1
+   !! Tests for DBCSR operations:
+   !! add, multiply and multiply-ghost
 
    USE dbcsr_kinds, ONLY: dp
    USE dbcsr_lib, ONLY: dbcsr_finalize_lib, &
@@ -318,4 +319,4 @@ PROGRAM dbcsr_unittest
 !
 ! finalize libdbcsr errors
 
-END PROGRAM dbcsr_unittest
+END PROGRAM dbcsr_unittest_1
diff --git a/tests/dbcsr_unittest2.F b/tests/dbcsr_unittest2.F
index 7a9e7a35d06..719648d60a0 100644
--- a/tests/dbcsr_unittest2.F
+++ b/tests/dbcsr_unittest2.F
@@ -7,8 +7,10 @@
 ! SPDX-License-Identifier: GPL-2.0+ !
 !----------------------------------------------------------------------------------------------------!
 
-PROGRAM dbcsr_unittest
-   !! Tests for DBCSR operations
+PROGRAM dbcsr_unittest_2
+   !! Tests for DBCSR multiply:
+   !! large blocks (block size=100)
+   !! and rectangular matrices (block size=5)
 
    USE dbcsr_kinds, ONLY: dp
    USE dbcsr_lib, ONLY: dbcsr_finalize_lib, &
@@ -126,4 +128,4 @@ PROGRAM dbcsr_unittest
 
    CALL mp_world_finalize()
 
-END PROGRAM dbcsr_unittest
+END PROGRAM dbcsr_unittest_2
diff --git a/tests/dbcsr_unittest3.F b/tests/dbcsr_unittest3.F
index c82273a14ff..369d639a56e 100644
--- a/tests/dbcsr_unittest3.F
+++ b/tests/dbcsr_unittest3.F
@@ -7,8 +7,10 @@
 ! SPDX-License-Identifier: GPL-2.0+ !
 !----------------------------------------------------------------------------------------------------!
 
-PROGRAM dbcsr_unittest
-   !! Tests for DBCSR's libsmm_acc backend
+PROGRAM dbcsr_unittest_3
+   !! Tests for DBCSR multiply:
+   !! various block sizes that are run by the libsmm_acc GPU backend if
+   !! DBCSR is compiled with GPU support.
 
    USE dbcsr_kinds, ONLY: dp
    USE dbcsr_lib, ONLY: dbcsr_finalize_lib, &
@@ -139,4 +141,4 @@ PROGRAM dbcsr_unittest
 
    CALL mp_world_finalize()
 
-END PROGRAM dbcsr_unittest
+END PROGRAM dbcsr_unittest_3
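Editor's note: with the programs renamed to `dbcsr_unittest_1/2/3`, an illustrative way to run them through CTest from the build directory is sketched below. This assumes the corresponding `add_test()` registrations use matching names; only the libsmm_acc test registration is visible in this patch.

```bash
# Run the renamed DBCSR correctness tests (assumed CTest names)
ctest -R 'dbcsr_unittest' --output-on-failure

# GPU-backend tests registered in the CMake hunk above
ctest -R 'libsmm_acc_unittest' --output-on-failure
```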
diff --git a/tests/input.perf b/tests/input.perf
index 90837cb6509..aafa4e04764 100644
--- a/tests/input.perf
+++ b/tests/input.perf
@@ -1,7 +1,10 @@
-# npcols MPI grid, 0 leaves MPI to find the best grid.
-# Note that the total number of processors must be divisible per npcols
+# Template Input File for DBCSR Performance Driver
+# -----------------------------------------------------------------------
+# npcols MPI grid
+# - 0 leaves MPI to find the best grid.
+# - Note that the total number of processors must be divisible by npcols
 0
-# Use MPI-RMA
+# use MPI-RMA
 F
 # operation
 dbcsr_multiply
@@ -9,7 +12,7 @@ dbcsr_multiply
 1000
 1000
 1000
-# sparsity (A, B, C)
+# sparsity (matrix A, matrix B, matrix C)
 0.0d0
 0.0d0
 0.0d0
@@ -21,22 +24,31 @@
 N
 N
 N
 # data type
+# - 3: double
+# - other types, see "Type definitions" in dbcsr/src/data/dbcsr_data_types.F
 3
 # alpha (real, imag)
 1.0d0
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
-# limits (0 means full size)
-# row
+0.0d0
+# limits
+# - 0 means full size
+# - row
+# -- limRowL (First full row of limiting submatrix)
 0
+# -- limRowU
 0
-# col
+# - col
+# -- limColL (First full col of limiting submatrix)
 0
+# -- limColU
 0
-# k
+# - k
+# -- limKL (First full col of limiting inner product)
 0
+# -- limKU
 0
 # retain sparsity (T/F)
 F
@@ -47,8 +59,17 @@
 F
 1
 1
 # the m blocks (multiplicity, block size, ...)
+# - this configuration, e.g., will generate blocks of
+# - size 5 in the m-dimension
 1
 5
+# - this configuration would generate a block of
+# - size 5 followed by 2 blocks of size 3 in the m-dimension,
+# - followed by a block of size 5, etc. until size M is reached
+# - 1
+# - 5
+# - 2
+# - 3
 # the n blocks (multiplicity, block size, ...)
 1
 5
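Editor's note: the comments added above describe the block-layout stanza by example. The fragment below is an illustration only; the file name `my_blocks.perf` is hypothetical and this is not a complete driver input. It writes out the second configuration described in the comments, i.e. one block of size 5 followed by two blocks of size 3, repeated along the m-dimension until size M is reached.

```bash
# Append only the m-block stanza of a performance-driver input (fragment, not a full file).
cat >> my_blocks.perf <<'EOF'
# the m blocks (multiplicity, block size, ...)
1
5
2
3
EOF
```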
diff --git a/tests/inputs/test_H2O.perf b/tests/inputs/test_H2O.perf
index df09e2bafbc..00e9200e7c2 100644
--- a/tests/inputs/test_H2O.perf
+++ b/tests/inputs/test_H2O.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect1_dense.perf b/tests/inputs/test_rect1_dense.perf
index 3ee8a8b7d63..4f1f73918f3 100644
--- a/tests/inputs/test_rect1_dense.perf
+++ b/tests/inputs/test_rect1_dense.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect1_sparse.perf b/tests/inputs/test_rect1_sparse.perf
index 5c876e63133..d54ba15cd19 100644
--- a/tests/inputs/test_rect1_sparse.perf
+++ b/tests/inputs/test_rect1_sparse.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect2_dense.perf b/tests/inputs/test_rect2_dense.perf
index aacf4b1d336..316a02368e9 100644
--- a/tests/inputs/test_rect2_dense.perf
+++ b/tests/inputs/test_rect2_dense.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect2_sparse.perf b/tests/inputs/test_rect2_sparse.perf
index ad609b4277a..d3b283bf7a4 100644
--- a/tests/inputs/test_rect2_sparse.perf
+++ b/tests/inputs/test_rect2_sparse.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_singleblock.perf b/tests/inputs/test_singleblock.perf
index 7295fb7eac6..fb6158d11e8 100644
--- a/tests/inputs/test_singleblock.perf
+++ b/tests/inputs/test_singleblock.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
@@ -59,4 +59,4 @@ F
 T
 0.1E-10
 0.418186760034529E+06
-0.190157258297048E+06
+0.190157258297048E+06
diff --git a/tests/inputs/test_square_dense.perf b/tests/inputs/test_square_dense.perf
index 008d712a485..9be924b629e 100644
--- a/tests/inputs/test_square_dense.perf
+++ b/tests/inputs/test_square_dense.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_square_sparse.perf b/tests/inputs/test_square_sparse.perf
index 1aebcfcc17b..76e82ed2f60 100644
--- a/tests/inputs/test_square_sparse.perf
+++ b/tests/inputs/test_square_sparse.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_square_sparse_bigblocks.perf b/tests/inputs/test_square_sparse_bigblocks.perf
index 4d3317ef277..b81565c788c 100644
--- a/tests/inputs/test_square_sparse_bigblocks.perf
+++ b/tests/inputs/test_square_sparse_bigblocks.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_square_sparse_rma.perf b/tests/inputs/test_square_sparse_rma.perf
index fb2115ca3bc..4938966ce7c 100644
--- a/tests/inputs/test_square_sparse_rma.perf
+++ b/tests/inputs/test_square_sparse_rma.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tools/docker/Dockerfile.build-env-rocm b/tools/docker/Dockerfile.build-env-rocm
new file mode 100644
index 00000000000..a947e70467f
--- /dev/null
+++ b/tools/docker/Dockerfile.build-env-rocm
@@ -0,0 +1,43 @@
+FROM rocm/dev-ubuntu-18.04:latest
+
+# install compilers, libraries & co
+RUN apt-get update
+RUN apt-get install -y \
+    locales \
+    gfortran \
+    gcc-7 \
+    g++-7 \
+    llvm-7-dev \
+    llvm-7-tools \
+    mpich \
+    libomp-7-dev \
+    libmpich-dev \
+    libopenblas-dev \
+    wget
+
+# install rocm libraries
+RUN wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
+RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
+RUN apt-get update
+RUN apt-get install -y \
+    rocm-dev \
+    rocblas \
+    rocsolver \
+    hipblas
+
+# install git 2.18+
+RUN apt-get install -y software-properties-common
+RUN add-apt-repository ppa:git-core/ppa
+RUN apt-get install -y git
+
+# install ninja
+RUN apt-get install -y wget
+RUN wget https://github.com/Kitware/ninja/releases/download/v1.10.0.gfb670.kitware.jobserver-1/ninja-1.10.0.gfb670.kitware.jobserver-1_x86_64-linux-gnu.tar.gz
+RUN tar -xzvf ninja-1.10.0.gfb670.kitware.jobserver-1_x86_64-linux-gnu.tar.gz
+ENV PATH="/ninja-1.10.0.gfb670.kitware.jobserver-1_x86_64-linux-gnu:${PATH}"
+
+# install cmake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.17.0/cmake-3.17.0-Linux-x86_64.tar.gz
+RUN tar -xzvf cmake-3.17.0-Linux-x86_64.tar.gz
+ENV PATH="/cmake-3.17.0-Linux-x86_64/bin:${PATH}"
+
diff --git a/tools/docker/README.md b/tools/docker/README.md
index 5b6e485d658..1f4093b56fa 100644
--- a/tools/docker/README.md
+++ b/tools/docker/README.md
@@ -31,3 +31,17 @@ If you need to rebuild the image, use:
 $ cd dbcsr/tools/docker
 $ docker build -t dbcsr/build-env-ubuntu-20.04 -f Dockerfile.build-env-ubuntu .
 ```
+
+## ROCm Build Environment
+
+The ROCm build-environment image (`Dockerfile.build-env-rocm`) is based on Ubuntu 18.04 and contains:
+
+* GNU Fortran Compiler
+* OpenBLAS
+* MPICH
+* CMake (recent version)
+* Ninja (recent version)
+* Git 2.18+
+* ROCm
+* ROCm libraries (rocblas, rocsolver, hipblas)
+
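Editor's note: mirroring the rebuild command for the Ubuntu image shown in the context lines above, a minimal sketch for building the new ROCm image locally. Only the Dockerfile name comes from this patch; the `dbcsr/build-env-rocm` image tag is an assumption.

```bash
# Build the ROCm build-environment image (hypothetical tag name)
cd dbcsr/tools/docker
docker build -t dbcsr/build-env-rocm -f Dockerfile.build-env-rocm .
```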