diff --git a/CMakeLists.txt b/CMakeLists.txt index e130191b40d..f7f12b48606 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -373,6 +373,7 @@ endif () include(CompilerConfiguration) include(CheckCompilerSupport) +# subdirectories add_subdirectory(src) include(CTest) @@ -384,4 +385,6 @@ if (WITH_EXAMPLES) add_subdirectory(examples) endif () +add_subdirectory(docs) + include(CustomTargets) diff --git a/DBCSR.md b/DBCSR.md index 5609bcbf21c..d3481b35bd2 100644 --- a/DBCSR.md +++ b/DBCSR.md @@ -15,6 +15,9 @@ predocmark: > media_dir: @CMAKE_SOURCE_DIR@/docs/media md_base_dir: @CMAKE_SOURCE_DIR@ page_dir: @CMAKE_SOURCE_DIR@/docs/guide +src_dir: ./src + ./tests + ./examples output_dir: @CMAKE_BINARY_DIR@/doc docmark_alt: # predocmark_alt: < diff --git a/README.md b/README.md index 78703a96d18..1f6c3d03f10 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ It is MPI and OpenMP parallel and can exploit Nvidia and AMD GPUs via CUDA and H ## How to Install -Follow the [installation guide](docs/guide/2-user-guide/1-installation/1-install.md). +Follow the [installation guide](https://cp2k.github.io/dbcsr/develop/page/2-user-guide/1-installation/index.html). ## Documentation diff --git a/VERSION b/VERSION index 6321062b636..3eef448c4b5 100644 --- a/VERSION +++ b/VERSION @@ -1,6 +1,7 @@ MAJOR = 2 MINOR = 1 -PATCH = 0-rc19 +PATCH = 0 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise # it is considered Development version. -DATE = +DATE = 2020-12-09 + diff --git a/cmake/CustomTargets.cmake b/cmake/CustomTargets.cmake index 93b1db4fcbe..5753b9b7a0a 100644 --- a/cmake/CustomTargets.cmake +++ b/cmake/CustomTargets.cmake @@ -1,3 +1,5 @@ +# ================================================================================================= +# BUILD DISTRIBUTION set(ARCHIVE_NAME "${CMAKE_PROJECT_NAME}-${dbcsr_VERSION}") add_custom_target( dist @@ -9,22 +11,8 @@ add_custom_target( "${CMAKE_BINARY_DIR}/dist/${ARCHIVE_NAME}.tar.gz" WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) -find_program( - FORD_EXE ford - DOC "path to the ford executable (required to generate the documentation)") - -# Copy the FORD project-file into the build directory -set(FORD_PROJECT_FILE "${CMAKE_BINARY_DIR}/DBCSR.md") -configure_file(DBCSR.md "${FORD_PROJECT_FILE}") - -add_custom_target( - doc - COMMENT "Generating API documentation" - COMMAND "${FORD_EXE}" "${FORD_PROJECT_FILE}" - VERBATIM) -add_dependencies(doc fypp) # only depend on the fypp step to avoid building - # everything just for the docs - +# ================================================================================================= +# LCOV - COVERAGE REPORTS GENERATION find_program( LCOV_EXE lcov DOC "path to the lcov executable (required to generate coverage reports)") diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt new file mode 100644 index 00000000000..032b9cf073c --- /dev/null +++ b/docs/CMakeLists.txt @@ -0,0 +1,22 @@ +# ================================================================================================= +# FORD - DOCUMENTATION GENERATION +find_program( + FORD_EXE ford + DOC "path to the ford executable (required to generate the documentation)") + +# Copy the FORD project-file into the build directory +set(FORD_PROJECT_FILE "${CMAKE_BINARY_DIR}/DBCSR.md") +configure_file(${CMAKE_SOURCE_DIR}/DBCSR.md "${FORD_PROJECT_FILE}") + +# Copy the FORD project-file into the build directory +add_custom_target( + doc + COMMENT "Generating API documentation and doc pages" + COMMAND "${FORD_EXE}" "${FORD_PROJECT_FILE}" + VERBATIM) 
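+# The doc target needs the sources staged in the build tree before FORD runs:
+# the copied test/example files (doc_copy_tests / doc_copy_examples) and the
+# fypp-preprocessed Fortran sources.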
+add_dependencies(doc doc_copy_tests) +if (WITH_C_API) + add_dependencies(doc doc_copy_examples) +endif () +add_dependencies(doc fypp) # only depend on the fypp step to avoid building + # everything just for the docs diff --git a/docs/guide/2-user-guide/1-installation/2-cmake-build-recipes.md b/docs/guide/2-user-guide/1-installation/1-cmake-build-recipes.md similarity index 100% rename from docs/guide/2-user-guide/1-installation/2-cmake-build-recipes.md rename to docs/guide/2-user-guide/1-installation/1-cmake-build-recipes.md diff --git a/docs/guide/2-user-guide/1-installation/1-install.md b/docs/guide/2-user-guide/1-installation/1-install.md deleted file mode 100644 index 3cba6942b45..00000000000 --- a/docs/guide/2-user-guide/1-installation/1-install.md +++ /dev/null @@ -1,114 +0,0 @@ -title: Install - -# Install - -## Prerequisites - -You absolutely need: - -* [CMake](https://cmake.org/) (3.12+) -* GNU make or Ninja -* a Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings) -* a BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.) -* a Python version installed (2.7 or 3.6+ have been tested) - -Optionally: - -* [libxsmm](https://github.com/hfp/libxsmm) (1.10+, and `pkg-config`) for Small Matrix Multiplication acceleration -* a LAPACK implementation (reference, OpenBLAS-bundled and MKL have been tested), required when building the tests - -To build `libsmm_acc`, DBCSR's GPU backend, you further need: - -* A GPU-capable compiler, either - * CUDA Toolkit (targets NVIDIA GPUs, minimal version required: 5.5) with cuBLAS - * or HIP compiler (targets NVIDIA or AMD GPUs) and hipBLAS -* a C++ compiler which supports at least C++11 standard - -We test against GNU and Intel compilers on Linux systems, GNU compiler on MacOS systems. See a list of supported compilers [here](./3-supported-compilers.html). - -## Get DBCSR - -Download either a [release tarball](https://github.com/cp2k/dbcsr/releases) or clone the latest version from Git using: - -```bash -git clone --recursive https://github.com/cp2k/dbcsr.git -``` - -## Build - -DBCSR can be compiled in 4 main variants: -* Serial, i.e. no OpenMP and MPI -* OpenMP -* MPI -* OpenMP+MPI -The 4 variants can be combined with the accelerator support. - -Run inside the `dbcsr` directory: - -```bash -mkdir build -cd build -cmake .. -make -``` - - The configuration flags for the CMake command are (default first): - -``` --DUSE_MPI= --DUSE_OPENMP= --DUSE_SMM= --DUSE_CUDA= --DUSE_HIP= --DWITH_C_API= --DWITH_EXAMPLES= --DWITH_GPU= --DWITH_CUDA_PROFILING= --DCMAKE_BUILD_TYPE= --DBUILD_TESTING= --DTEST_MPI_RANKS= --DTEST_OMP_THREADS=<2,N> -``` - -When providing a custom build of `libxsmm`, make sure that its library directory is added to the `PKG_CONFIG_PATH` variable prior -to running `cmake`. An example if `libxsmm` was checked out using Git to your home folder: - -```bash -export PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${HOME}/libxsmm/lib" -``` - -### CMake Build Recipes - -For build recipes on different platforms, make sure to also read the [CMake Build Recipes](./2-cmake-build-recipes.html). 
- -### Using Python in a virtual environment - -If you want to use Python from a virtual environment and your CMake version is < 3.15, specify the desired python interpreter manually using: - -``` - -DPython_EXECUTABLE=/path/to/python -``` - -### Running Tests - -To run the tests, use: - -```bash - make test -``` - -Please, note that if you are using OpenMP builds, then you have to set the environment variable `OMP_NESTED=false`. - -### C/C++ Interface - -If MPI support is enabled (the default), the C API is automatically built. - -### Workaround issue in HIP - -HIP is a relatively new language, and some issues still need to be ironed out. As a workaround to an [issue](https://github.com/ROCm-Developer-Tools/HIP/pull/1543) in HIP's JIT infrastructure, please set the following if you've built HIP from source: - -```bash - export HIP_PATH=/opt/rocm/hip -``` - -before running on an AMD GPU. diff --git a/docs/guide/2-user-guide/1-installation/3-supported-compilers.md b/docs/guide/2-user-guide/1-installation/2-supported-compilers.md similarity index 100% rename from docs/guide/2-user-guide/1-installation/3-supported-compilers.md rename to docs/guide/2-user-guide/1-installation/2-supported-compilers.md diff --git a/docs/guide/2-user-guide/1-installation/4-using-dbcsr-in-a-cmake-project.md b/docs/guide/2-user-guide/1-installation/3-using-dbcsr-in-a-cmake-project.md similarity index 100% rename from docs/guide/2-user-guide/1-installation/4-using-dbcsr-in-a-cmake-project.md rename to docs/guide/2-user-guide/1-installation/3-using-dbcsr-in-a-cmake-project.md diff --git a/docs/guide/2-user-guide/1-installation/4-docker.md b/docs/guide/2-user-guide/1-installation/4-docker.md new file mode 100644 index 00000000000..25d8ccc2b6a --- /dev/null +++ b/docs/guide/2-user-guide/1-installation/4-docker.md @@ -0,0 +1,4 @@ +title: Docker Images + +{!./tools/docker/README.md!} + diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md index 1bc9462715f..c0efd0d406b 100644 --- a/docs/guide/2-user-guide/1-installation/index.md +++ b/docs/guide/2-user-guide/1-installation/index.md @@ -1 +1,104 @@ -title: Installation \ No newline at end of file +title: Install + +# Install + +## Prerequisites + +You absolutely need: + +* [CMake](https://cmake.org/) (3.12+) +* GNU make or Ninja +* a Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings) +* a BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.) +* a Python version installed (2.7 or 3.6+ have been tested) + +Optionally: + +* [libxsmm](https://github.com/hfp/libxsmm) (1.10+, and `pkg-config`) for Small Matrix Multiplication acceleration +* a LAPACK implementation (reference, OpenBLAS-bundled and MKL have been tested), required when building the tests + +To build `libsmm_acc`, DBCSR's GPU backend, you further need: + +* A GPU-capable compiler, either + * CUDA Toolkit (targets NVIDIA GPUs, minimal version required: 5.5) with cuBLAS + * or HIP compiler (targets NVIDIA or AMD GPUs) and hipBLAS (the tested version is ROCm 3.8) +* a C++ compiler which supports at least C++11 standard + +We test against GNU and Intel compilers on Linux systems, GNU compiler on MacOS systems. See a list of supported compilers [here](./3-supported-compilers.html). 
+ +## Get DBCSR + +Download either a [release tarball](https://github.com/cp2k/dbcsr/releases) or clone the latest version from Git using: + +```bash +git clone --recursive https://github.com/cp2k/dbcsr.git +``` + +## Build + +DBCSR can be compiled in 4 main variants: +* Serial, i.e. no OpenMP and MPI +* OpenMP +* MPI +* OpenMP+MPI +The 4 variants can be combined with the accelerator support. + +Run inside the `dbcsr` directory: + +```bash +mkdir build +cd build +cmake .. +make +``` + + The configuration flags for the CMake command are (default first): + +``` +-DUSE_MPI= +-DUSE_OPENMP= +-DUSE_SMM= +-DUSE_CUDA= +-DWITH_CUDA_PROFILING= +-DUSE_HIP= +-DWITH_C_API= +-DWITH_EXAMPLES= +-DWITH_GPU= +-DCMAKE_BUILD_TYPE= +-DBUILD_TESTING= +-DTEST_MPI_RANKS= +-DTEST_OMP_THREADS=<2,N> +``` + +When providing a custom build of `libxsmm`, make sure that its library directory is added to the `PKG_CONFIG_PATH` variable prior +to running `cmake`. An example if `libxsmm` was checked out using Git to your home folder: + +```bash +export PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${HOME}/libxsmm/lib" +``` + +### CMake Build Recipes + +For build recipes on different platforms, make sure to also read the [CMake Build Recipes](./1-cmake-build-recipes.html). + +### Using Python in a virtual environment + +If you want to use Python from a virtual environment and your CMake version is < 3.15, specify the desired python interpreter manually using: + +``` + -DPython_EXECUTABLE=/path/to/python +``` + +### C/C++ Interface + +If MPI support is enabled (the default), the C API is automatically built. + +### Workaround issue in HIP + +HIP is a relatively new language, and some issues still need to be ironed out. As a workaround to an [issue](https://github.com/ROCm-Developer-Tools/HIP/pull/1543) in HIP's JIT infrastructure, please set the following if you've built HIP from source: + +```bash + export HIP_PATH=/opt/rocm/hip +``` + +before running on an AMD GPU. 
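+
+### Example Configuration
+
+As an illustration only (the valid values for each flag are listed above and in the [CMake Build Recipes](./1-cmake-build-recipes.html); the GPU model and SMM backend shown here are assumptions), a hybrid MPI+OpenMP build with libxsmm and CUDA acceleration could be configured as:
+
+```bash
+mkdir build && cd build
+cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_SMM=libxsmm \
+      -DUSE_CUDA=ON -DWITH_GPU=P100 \
+      -DCMAKE_BUILD_TYPE=Release ..
+make
+```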
diff --git a/docs/guide/2-user-guide/2-examples/index.md b/docs/guide/2-user-guide/2-examples/index.md deleted file mode 100644 index 96b610794f3..00000000000 --- a/docs/guide/2-user-guide/2-examples/index.md +++ /dev/null @@ -1,3 +0,0 @@ -title: Examples - -{!examples/README.md!} diff --git a/docs/guide/2-user-guide/2-tests/index.md b/docs/guide/2-user-guide/2-tests/index.md new file mode 100644 index 00000000000..3980073254b --- /dev/null +++ b/docs/guide/2-user-guide/2-tests/index.md @@ -0,0 +1,71 @@ +title: Tests + +# Tests + +## Correctness tests + +- [[dbcsr_unittest_1(program)]] (fortran) : test matrix operations: add, multiply and multiply-ghost +- [[dbcsr_unittest_2(program)]] (fortran) : test matrix-multiply with large blocks (block size=100) and rectangular matrices (block size=5) +- [[dbcsr_test_csr_conversions(program)]] (fortran) : test DBCSR to CSR conversion with random matrices +- [[dbcsr_tas_unittest(program)]] (fortran) : unit test for tall-and-skinny matrices +- [[dbcsr_tensor_unittest(program)]] (fortran) : unit test for tensor functionalities +- [dbcsr_tensor_test](../../../../tests/dbcsr_tensor_test.cpp) (c++) : test the tensor contraction (13|2)x(54|21)=(3|45) 31 and other functions + +### GPU-backend correctness tests: + +- [[dbcsr_unittest_3(program)]] (fortran) : test matrix-multiply with various block sizes that are run by the libsmm_acc GPU backend if DBCSR is compiled with GPU support +- [libsmm_acc_unittest_multiply](../../../../tests/libsmm_acc_unittest_multiply.cpp) (c++) : tests all libsmm_acc transpose kernels +- [libsmm_acc_unittest_transpose](../../../../tests/libsmm_acc_unittest_transpose.cpp) (c++) : tests all libsmm_acc batch-multiplication kernels + +## Performance tests + +DBCSR performance tests: + +- [[dbcsr_performance_driver(program)]] (fortran) : performance tester for matrix operations. The input matrices can be described in an input file in order to test different configurations. See below. + +### GPU backend performance tests: + +- [libsmm_acc_timer_multiply](../../../../tests/libsmm_acc_timer_multiply.cpp) (c++) : time all libsmm_acc batch-multiplication kernels + +## Running Tests + +To run all the tests, use: + +```bash +make test +``` + +Or run individual tests from the `build` directory, as follows: + +```bash +srun -N 1 --ntasks-per-core 2 --ntasks-per-node 12 --cpus-per-task 2 ./tests/dbcsr_unittest_1 +``` + +Note that the tests of libsmm_acc (the GPU-backend) do not use MPI since libsmm_acc only operates on-node. + +Note that if you are using OpenMP builds, then you have to set the environment variable `OMP_NESTED=false`. + +### Input Files for Performance Driver + +The test suite comes with a performance driver ([[dbcsr_performance_driver(program)]]), which evaluates the performance of matrix-matrix multiplication in DBCSR. + +Input matrices can be specified in an input file, passed to the executable as standard input, for example: + +a) To test pure MPI performance test using [n] nodes: + +```bash +mpirun -np [n] ./build/tests/dbcsr_perf tests/input.perf 2>&1 | tee perf.log +``` + +b) To test hybrid MPI/OpenMP performance test using [n] nodes, each spanning [t] threads: + +```bash +export OMP_NUM_THREADS=[t]; mpirun -np [n] ./build/tests/dbcsr_perf tests/input.perf 2>&1 | tee perf.log +``` + +### How to Write Input Files + +Examples of input files can be found in `tests/inputs` for different sizes of matrices and different block sizes. 
+ +You can also write custom input files: for more information, follow the template in `tests/input.perf`. + diff --git a/docs/guide/2-user-guide/3-examples/index.md b/docs/guide/2-user-guide/3-examples/index.md new file mode 100644 index 00000000000..a24a4fa269e --- /dev/null +++ b/docs/guide/2-user-guide/3-examples/index.md @@ -0,0 +1,35 @@ +title: Examples + +# Examples + +- [[dbcsr_example_1(program)]] : how to create a dbcsr matrix (fortran) +- [[dbcsr_example_2(program)]] : how to set a dbcsr matrix (fortran) +- dbcsr_example_3: how to multiply two dbcsr matrices (in fortran: [[dbcsr_example_3(program)]]) and in c++: [dbcsr_example_3](https://github.com/cp2k/dbcsr/blob/develop/examples/dbcsr_example_3.cpp)) +- [[dbcsr_tensor_example_1(program)]] : how to create a dbcsr matrix (fortran) + - the example can be run with different parameters, controlling block size, sparsity, verbosity and more +- [dbcsr_tensor_example_2](https://github.com/cp2k/dbcsr/blob/develop/examples/dbcsr_tensor_example_2.cpp): tensor contraction example (cpp) + - tensor1 x tensor2 = tensor3, (13|2)x(54|21)=(3|45) + +## Build + +Compile the DBCSR library, using `-DUSE_MPI=ON -DWITH_EXAMPLES=ON`. + +The examples require MPI. Furthermore, if you are using threading, MPI_THREAD_FUNNELED mode is required. + +## Run + +You can run the examples, for instance from the `build` directory, as follows: + +```bash +srun -N 1 --ntasks-per-core 2 --ntasks-per-node 12 --cpus-per-task 2 ./examples/dbcsr_example_1 +``` + +### Run tensor examples + +How to run (this example and DBCSR for tensors in general): + +- best performance is obtained by running with mpi and one openmp thread per rank. +- ideally number of mpi ranks should be composed of small prime factors (e.g. powers of 2). +- for sparse data & heterogeneous block sizes, DBCSR should be run on CPUs with libxsmm backend. +- for dense data best performance is obtained by choosing homogeneous block sizes of 64 and by compiling with GPU support. + diff --git a/docs/guide/3-developer-guide/1-tooling/index.md b/docs/guide/3-developer-guide/1-tooling/index.md index d2727bc3752..8470abe8b65 100644 --- a/docs/guide/3-developer-guide/1-tooling/index.md +++ b/docs/guide/3-developer-guide/1-tooling/index.md @@ -2,8 +2,8 @@ title: Tooling # Build System -We support CMake for compilation. See [here](https://cp2k.github.io/dbcsr/page/2-user-guide/1-installation/1-install.html) on how to compile and -[here](https://cp2k.github.io/dbcsr/page/2-user-guide/1-installation/2-cmake-build-recipes.html) for more CMake details. +We support CMake for compilation. See [here](../../2-user-guide/1-installation/index.html) on how to compile and +[here](../../2-user-guide/1-installation/1-cmake-build-recipes.html) for more CMake details. Compilations is based on [Fypp](https://github.com/aradi/fypp) meta-progamming package, which is available as submodule. diff --git a/docs/guide/3-developer-guide/2-documentation/index.md b/docs/guide/3-developer-guide/2-documentation/index.md index 0f708994bce..db7245a89b0 100644 --- a/docs/guide/3-developer-guide/2-documentation/index.md +++ b/docs/guide/3-developer-guide/2-documentation/index.md @@ -2,21 +2,28 @@ title: Documentation # Documentation -## Build the Documentation +## Build -To build the Documentation you need [FORD](https://github.com/Fortran-FOSS-Programmers/ford). +To build the documentation you need [FORD](https://github.com/Fortran-FOSS-Programmers/ford). 
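+If FORD is not installed yet, it can typically be obtained from PyPI (assuming `pip` is available in your Python environment):
+
+```bash
+pip install ford
+```
+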
Afterwards use the `doc` target for the CMake generated Makefile: ```bash mkdir build cd build - cmake .. # will look for the `ford` binary + cmake .. # will look for the `ford` binary make doc ``` +Note that in order to generate the documentation with examples (recommended), the following options should be activated in cmake (these are the options' default values) + +```bash + cmake -DUSE_MPI=ON -DWITH_EXAMPLES=ON .. # these options are default and recommended. + # If set off, the examples' documentation is not generated. +``` + The documentation (HTML format) will be located in `doc/`. To view it, open `doc/index.html` in a browser. -## Add Pages to the Documentation +## Add Pages To add pages to the documentation, write Markdown files and add them to the desired location in `dbcsr/docs/guide`. Note that subfolders of `guide` will only be added to the documentation pages if they contain a file `index.md`. For more information on writing pages, see [Ford's documentation](https://github.com/Fortran-FOSS-Programmers/ford/wiki/Writing-Pages). diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md new file mode 100644 index 00000000000..1301c085f0d --- /dev/null +++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md @@ -0,0 +1,23 @@ +title: Kernel Parameters + +# Kernel Parameters + +## Batched Matrix-Matrix Multiplication Kernel Parameters + +The batched matrix-matrix multiplication kernels are templated on: + +* the characteristic dimensions of the multiplication: `m, n, k` +* between 3-7 kernel parameters from (`M`, `N`, `w`, `v`, `threads`, `grouping`, `minblocks`), depending on the algorithm. + +## Batched Matrix Transpose Kernel Parameters + +The batched transpose kernels are templated on: + +* the characteristic dimensions of the transpose: `m, n` + +## Predictive parameters + +The input features for the predictive models can be 'raw' parameters (left-most-column in the figure below), or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations). + +![libsmm_acc_predictive_modeling_features](|media|/images/libsmm_acc_predictive_modeling_features.png) + diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md index 4faf7651028..23ac88907b5 100644 --- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md +++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/index.md @@ -1 +1,3 @@ -title: Accelerator Backend \ No newline at end of file +title: Accelerator Backend + +{!./src/acc/README.md!} diff --git a/docs/guide/3-developer-guide/4-performance/1-insights.md b/docs/guide/3-developer-guide/4-performance/1-insights.md new file mode 100644 index 00000000000..fc30fd1615c --- /dev/null +++ b/docs/guide/3-developer-guide/4-performance/1-insights.md @@ -0,0 +1,119 @@ +title: Insights + +# Insights into Performance + +## Read Timing & Statistics Reports + +At the end of an output file, a report of DBCSR's statistics and timings can be found. + +### Statistics + +The STATISTICS section of the output file provides some information on matrix-matrix multiplications that were run and their performance characteristics. 
+ +Example: + +``` +------------------------------------------------------------------------------- +- - +- DBCSR STATISTICS - +- - +------------------------------------------------------------------------------- +COUNTER TOTAL BLAS SMM ACC +flops 23 x 23 x 23 687272462200 0.0% 0.0% 100.0% +flops inhomo. stacks 0 0.0% 0.0% 0.0% +flops total 687.272462E+09 0.0% 0.0% 100.0% +flops max/rank 687.272462E+09 0.0% 0.0% 100.0% +matmuls inhomo. stacks 0 0.0% 0.0% 0.0% +matmuls total 28243300 0.0% 0.0% 100.0% +number of processed stacks 1600 0.0% 0.0% 100.0% +average stack size 0.0 0.0 17652.1 +marketing flops 1.076458E+12 +------------------------------------------------------------------------------- +# multiplications 50 +max memory usage/rank 16.650822E+09 +# max total images/rank 1 +# max 3D layers 1 +# MPI messages exchanged 0 +MPI messages size (bytes): + total size 0.000000E+00 + min size 0.000000E+00 + max size 0.000000E+00 + average size 0.000000E+00 +MPI breakdown and total messages size (bytes): + size <= 128 0 0 + 128 < size <= 8192 0 0 + 8192 < size <= 32768 0 0 + 32768 < size <= 131072 0 0 + 131072 < size <= 4194304 0 0 + 4194304 < size <= 16777216 0 0 + 16777216 < size 0 0 +------------------------------------------------------------------------------- +``` + +#### How to Read the Columns + +- `TOTAL`: total flops +- `BLAS`: percentage of flops run on BLAS (this could be CUBLAS or HIPBLAS) +- `SMM`: percentage of flops run on SMM (libsmm or libxsmm, CPU) +- `ACC`: percentage of flops run on ACC (libsmm_acc, DBCSR's GPU-accelerated backend) + +#### How to Read the Rows (Counters) + +Every time "matrix-matrix multiplication" is mentionned in this paragraph, it refers *not* to the sparse multiplication of large matrices, but the multiplication of small dense blocks that the large sparse matrix was decomposed into. + +- `flops 23 x 23 x 23`: indicates that batched matrix-matrix multiplication kernels with matrix dimensions (m, n, k) = (23, 23, 23) was run, and provides info on its flops. If several batched matrix-matrix multiplications of different matrix dimensions (m, n, k) were run, they would appear as subsequent separate rows. +- `flops inhomo. stacks`: flops of so-called "inhomogeneous stacks". These are stacks of batched-matrix-matrix multiplications where not all multiplications contained have the same matrix dimensions (m, n, k). +- `flops total`: total flops for all stacks of matrix-matrix multiplication. +- `flops max/rank`: flops of the MPI rank with the most flops. +- `matmuls inhomo. stacks`: number of matrix-matrix multiplications run in inhomogeneous stacks. +- `matmuls total`: number of matrix-matrix multiplications run in total. +- `number of processed stacks`: number of stacks of batched matrix-matrix multiplication. +- `average stack size`: average over all stacks of the stack size (i.e. the number of matrix-matrix multiplications that a stack contains). + +### Timings + +Example of the statistics section of the output file: + +``` +------------------------------------------------------------------------------- +- - +- T I M I N G - +- - +------------------------------------------------------------------------------- +SUBROUTINE CALLS ASD SELF TIME TOTAL TIME MAXRANK + MAXIMUM AVERAGE MAXIMUM AVERAGE MAXIMUM +dbcsr_performance_driver 1 1.0 0.000 0.000 102.563 102.563 0 +dbcsr_perf_multiply_low 1 2.0 0.002 0.002 102.563 102.563 0 +perf_multiply 1 3.0 0.003 0.003 102.077 102.077 0 +[...] 
+-------------------------------------------------------------------------------
+```
+
+The columns describe:
+
+- `SUBROUTINE`: the name of the fortran subroutine (or c++ function) timed.
+- `CALLS`: number of times the subroutine was called.
+- `ASD`: average stack depth, i.e. the average number of entries on the call stack when this subroutine is called.
+- `SELF TIME`: how much time is spent in the subroutine itself, or in non-timed subroutines called by this subroutine.
+  - `AVERAGE`: the self time averaged over all MPI ranks.
+  - `MAXIMUM`: the maximum self time over all MPI ranks.
+  - `AVERAGE` and `MAXIMUM` can be used to locate load imbalance or synchronization points.
+- `TOTAL TIME`: how much time is spent in the subroutine, including the time spent in timed subroutines.
+  - `AVERAGE`: averaged over all MPI ranks.
+  - `MAXIMUM`: maximum over all MPI ranks.
+  - `AVERAGE` and `MAXIMUM` can be used to locate load imbalance or synchronization points.
+- `MAXRANK`: the MPI rank on which the maximum time was measured.
+
+#### Time spent in Just-In-Time (JIT) Compilation
+
+For performance debugging, and in order to check how much time a program spends doing JIT, look for the functions `jit_kernel_multiply` and `jit_kernel_transpose`.
+
+#### How to Time a Function
+
+By default, the most important subroutines are timed in DBCSR.
+
+If you want to time a subroutine or function that is not timed already, call `timeset` with a routine name and a handle at the beginning of the function, and `timestop` with the same handle at the end of the function.
+
+For examples, just `grep` for `timeset` and `timestop` in the codebase.
+
+This can be done both in fortran code and in c++ code.
diff --git a/docs/guide/3-developer-guide/4-performance/2-just-in-time-compilation.md b/docs/guide/3-developer-guide/4-performance/2-just-in-time-compilation.md
new file mode 100644
index 00000000000..dec9dbb9d14
--- /dev/null
+++ b/docs/guide/3-developer-guide/4-performance/2-just-in-time-compilation.md
@@ -0,0 +1,14 @@
+title: Just-In-Time Compilation
+
+# Just-In-Time (JIT) Compilation in libsmm_acc
+
+DBCSR's GPU backend, libsmm_acc, uses heavily templated CUDA/HIP kernels for its batched multiplication and transpose.
+
+If DBCSR were to compile kernels for all possible `m, n, k`s (or, in the case of the transpose, for all possible `m, n`s) ahead-of-time (AOT), this would bloat the library and make compilation much longer.
+Instead, kernels are JIT-ed on the fly, at runtime, as they are requested by the user. `libsmm_acc`'s JIT infrastructure is based on the CUDA library [NVRTC](https://docs.nvidia.com/cuda/nvrtc/), a runtime compilation library for CUDA C++ (see the sketch at the end of this page).
+
+On NVIDIA's P100, the overhead of JIT has been found to be around 400 ms for one kernel - a negligible overhead for typical DBCSR (and CP2K) runs.
+On AMD GPUs, however, the overhead has been found to be several seconds, a real hindrance to performance.
+
+For performance debugging, and in order to check how much time a program spends doing JIT, look for the functions `jit_kernel_multiply` and `jit_kernel_transpose` in the [timings report](./1-insights.html) at the end of the output file.
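+
+To illustrate the mechanism only (this is a generic, self-contained sketch and not `libsmm_acc`'s actual implementation; the kernel string, names and compile option are made up), a kernel held as a source string can be compiled with NVRTC and loaded through the CUDA driver API roughly as follows:
+
+```cpp
+#include <cuda.h>
+#include <nvrtc.h>
+#include <vector>
+
+// Kernel source held as a string; in a real JIT scheme the string would be
+// generated at runtime for the requested block sizes.
+static const char* kernel_src =
+    "extern \"C\" __global__ void scale(double* x, double a, int n) {\n"
+    "  int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
+    "  if (i < n) x[i] *= a;\n"
+    "}\n";
+
+// Compile the string to PTX and load it into a CUfunction.
+// Error checking omitted for brevity; assumes a CUDA context already exists.
+CUfunction jit_compile_scale() {
+  nvrtcProgram prog;
+  nvrtcCreateProgram(&prog, kernel_src, "scale.cu", 0, nullptr, nullptr);
+  const char* opts[] = {"--gpu-architecture=compute_60"};  // assumption: P100
+  nvrtcCompileProgram(prog, 1, opts);
+  size_t ptx_size;
+  nvrtcGetPTXSize(prog, &ptx_size);
+  std::vector<char> ptx(ptx_size);
+  nvrtcGetPTX(prog, ptx.data());
+  nvrtcDestroyProgram(&prog);
+  CUmodule module;
+  CUfunction kernel;
+  cuModuleLoadDataEx(&module, ptx.data(), 0, nullptr, nullptr);
+  cuModuleGetFunction(&kernel, module, "scale");
+  return kernel;  // launched later with cuLaunchKernel
+}
+```
+
+In such a scheme the compiled handle is typically cached, so the compilation overhead quoted above is paid only the first time a given kernel is requested.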
+ diff --git a/docs/guide/3-developer-guide/4-performance/index.md b/docs/guide/3-developer-guide/4-performance/index.md new file mode 100644 index 00000000000..2de023cdac7 --- /dev/null +++ b/docs/guide/3-developer-guide/4-performance/index.md @@ -0,0 +1 @@ +title: Performance diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 39de98431a1..167516e9516 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,8 +1,10 @@ -set(DBCSR_PROGRAM_SRCS dbcsr_example_1.F dbcsr_example_2.F dbcsr_example_3.F - dbcsr_tensor_example.F) +set(DBCSR_PROGRAM_SRCS_FTN dbcsr_example_1.F dbcsr_example_2.F + dbcsr_example_3.F dbcsr_tensor_example_1.F) -# register each program source file as executable -foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS}) +set(DBCSR_PROGRAM_SRCS_CPP dbcsr_example_3.cpp dbcsr_tensor_example_2.cpp) + +# Compile Fortran examples +foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_FTN}) get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE) if (USE_HIP) hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src}) @@ -17,23 +19,49 @@ foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS}) Fortran) endforeach () +# Compile C++ examples if (WITH_C_API) - if (USE_HIP) - hip_add_executable(dbcsr_example_3_cpp dbcsr_example_3.cpp) - hip_add_executable(dbcsr_tensor_example_1_cpp dbcsr_tensor_example_1.cpp) - else () - add_executable(dbcsr_example_3_cpp dbcsr_example_3.cpp) - add_executable(dbcsr_tensor_example_1_cpp dbcsr_tensor_example_1.cpp) - endif () - target_link_libraries(dbcsr_example_3_cpp dbcsr_c MPI::MPI_CXX) - target_link_libraries(dbcsr_tensor_example_1_cpp dbcsr_c MPI::MPI_CXX) + foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_CPP}) + get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE) + set(dbcsr_program_name ${dbcsr_program_name}_cpp) + if (USE_HIP) + hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src}) + else () + add_executable(${dbcsr_program_name} ${dbcsr_program_src}) + endif () + target_link_libraries(${dbcsr_program_name} dbcsr_c MPI::MPI_CXX) - if (CMAKE_CXX_COMPILER_ID STREQUAL "Cray") - # for recent Cray compiler versions CMake doesn't know - target_compile_options(dbcsr_example_3_cpp PRIVATE "-hstd=c++14") - target_compile_options(dbcsr_tensor_example_1_cpp PRIVATE "-hstd=c++14") - else () - target_compile_features(dbcsr_example_3_cpp PRIVATE cxx_std_14) - target_compile_features(dbcsr_tensor_example_1_cpp PRIVATE cxx_std_14) - endif () + if (CMAKE_CXX_COMPILER_ID STREQUAL "Cray") + # for recent Cray compiler versions CMake doesn't know + target_compile_options(${dbcsr_program_name} PRIVATE "-hstd=c++14") + else () + target_compile_features(${dbcsr_program_name} PRIVATE cxx_std_14) + endif () + endforeach () endif () + +# =================================== DOCUMENTATION GENERATION Copy example +# source files into the build directory so that their documentation can be +# generated by FORD + +set(DBCSR_PROGRAM_SRCS ${DBCSR_PROGRAM_SRCS_FTN} ${DBCSR_PROGRAM_SRCS_CPP}) + +# Make a list of the copy commands +set(example_copy_commands) +foreach (example ${DBCSR_PROGRAM_SRCS}) + list( + APPEND + example_copy_commands + COMMAND + ${CMAKE_COMMAND} + -E + copy + ${CMAKE_SOURCE_DIR}/examples/${example} + ${CMAKE_BINARY_DIR}/examples) +endforeach () + +add_custom_target( + doc_copy_examples + COMMENT "Copy examples for documentation generation" + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/examples ${example_copy_commands} + VERBATIM) diff --git a/examples/README.md b/examples/README.md index 
f7c8ba746a1..b8634b6fb37 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,9 +1,34 @@ -# DBCSR Examples +# Examples -These examples require to be executed with MPI. -Furthermore, MPI_THREAD_FUNNELED mode is required -if you are using threading. -Make sure you compile DBCSR accordingly. +- [`dbcsr_example_1`](dbcsr_example_1.F): how to create a dbcsr matrix (fortran) +- [`dbcsr_example_2`](dbcsr_example_2.F): how to set a dbcsr matrix (fortran) +- `dbcsr_example_3`: how to multiply two dbcsr matrices ([fortran](dbcsr_example_3.F) and [cpp](dbcsr_example_3.cpp)) +- [`dbcsr_tensor_example_1`](dbcsr_tensor_example_1.F): how to create a dbcsr matrix (fortran) + - the example can be run with different parameters, controlling block size, sparsity, verbosity and more +- [`dbcsr_tensor_example_2`](dbcsr_tensor_example_2.cpp): tensor contraction example (cpp) + - tensor1 x tensor2 = tensor3, (13|2)x(54|21)=(3|45) + +See the [examples' documentation](../docs/guide/2-user-guide/2-examples/index.md). + +## Build + +Compile the DBCSR library, using `-DUSE_MPI=ON -DWITH_EXAMPLES=ON`. + +The examples require MPI. Furthermore, if you are using threading, MPI_THREAD_FUNNELED mode is required. + +## Run + +You can run the examples, for instance from the `build` directory, as follows: + +```bash +srun -N 1 --ntasks-per-core 2 --ntasks-per-node 12 --cpus-per-task 2 ./examples/dbcsr_example_1 +``` + +### Run tensor examples + +How to run (this example and DBCSR for tensors in general): +* best performance is obtained by running with mpi and one openmp thread per rank. +* ideally number of mpi ranks should be composed of small prime factors (e.g. powers of 2). +* for sparse data & heterogeneous block sizes, DBCSR should be run on CPUs with libxsmm backend. +* for dense data best performance is obtained by choosing homogeneous block sizes of 64 and by compiling with GPU support. -1) Compile and install DBCSR, following the procedure described in the DBCSR - installation guide, and using `-DWITH_EXAMPLES=ON`. diff --git a/examples/dbcsr_example_1.F b/examples/dbcsr_example_1.F index fe3bd3f7a44..837850001cc 100644 --- a/examples/dbcsr_example_1.F +++ b/examples/dbcsr_example_1.F @@ -8,8 +8,8 @@ !--------------------------------------------------------------------------------------------------! PROGRAM dbcsr_example_1 - !! DBCSR example 1 - !! This example shows how to create a dbcsr matrix + !! DBCSR example 1: + !! This example shows how to create a DBCSR matrix USE mpi USE dbcsr_api, ONLY: & @@ -46,7 +46,7 @@ PROGRAM dbcsr_example_1 !$ ENDIF ! - ! setup the mp environment + ! setup the mpi environment CALL mpi_comm_size(MPI_COMM_WORLD, numnodes, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_size" npdims(:) = 0 @@ -61,7 +61,7 @@ PROGRAM dbcsr_example_1 !*************************************************************************************** ! - ! initialize libdbcsr + ! initialize the DBCSR library CALL dbcsr_init_lib(MPI_COMM_WORLD) ! @@ -81,11 +81,11 @@ PROGRAM dbcsr_example_1 CALL random_dist(col_dist, nblkcols_total, npdims(2)) ! - ! set the dbcsr distribution object + ! set the DBCSR distribution object CALL dbcsr_distribution_new(dist, group=group, row_dist=row_dist, col_dist=col_dist, reuse_arrays=.TRUE.) ! - ! create the dbcsr matrix, i.e. a double precision non symmetric matrix + ! create the DBCSR matrix, i.e. a double precision non symmetric matrix ! with nblkrows_total x nblkcols_total blocks and ! sizes "sum(row_blk_sizes)" x "sum(col_blk_sizes)", distributed as ! 
specified by the dist object @@ -99,7 +99,7 @@ PROGRAM dbcsr_example_1 reuse_arrays=.TRUE.) ! - ! finalize the dbcsr matrix + ! finalize the DBCSR matrix CALL dbcsr_finalize(matrix_a) ! @@ -121,8 +121,9 @@ PROGRAM dbcsr_example_1 CALL mpi_comm_free(group, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_free" - ! finalize libdbcsr + ! finalize the DBCSR library CALL dbcsr_finalize_lib() + ! ! finalize mpi CALL mpi_finalize(ierr) diff --git a/examples/dbcsr_example_2.F b/examples/dbcsr_example_2.F index 1945f1e3c4b..4fa1c2a61a2 100644 --- a/examples/dbcsr_example_2.F +++ b/examples/dbcsr_example_2.F @@ -8,8 +8,8 @@ !--------------------------------------------------------------------------------------------------! PROGRAM dbcsr_example_2 - !! DBCSR example 2 - !! This example shows how to set a dbcsr matrix + !! DBCSR example 2: + !! This example shows how to set a DBCSR matrix USE mpi USE dbcsr_api, ONLY: & @@ -49,7 +49,7 @@ PROGRAM dbcsr_example_2 !$ ENDIF ! - ! setup the mp environment + ! setup the mpi environment CALL mpi_comm_size(MPI_COMM_WORLD, numnodes, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_size" npdims(:) = 0 @@ -64,7 +64,7 @@ PROGRAM dbcsr_example_2 !*************************************************************************************** ! - ! initialize libdbcsr + ! initialize the DBCSR library CALL dbcsr_init_lib(MPI_COMM_WORLD) ! @@ -84,11 +84,11 @@ PROGRAM dbcsr_example_2 CALL random_dist(col_dist, nblkcols_total, npdims(2)) ! - ! set the dbcsr distribution object + ! set the DBCSR distribution object CALL dbcsr_distribution_new(dist, group=group, row_dist=row_dist, col_dist=col_dist, reuse_arrays=.TRUE.) ! - ! create the dbcsr matrix, i.e. a double precision non symmetric matrix + ! create the DBCSR matrix, i.e. a double precision non symmetric matrix ! with nblkrows_total x nblkcols_total blocks and ! sizes "sum(row_blk_sizes)" x "sum(col_blk_sizes)", distributed as ! specified by the dist object @@ -139,7 +139,7 @@ PROGRAM dbcsr_example_2 DEALLOCATE (values) ! - ! finalize the dbcsr matrix + ! finalize the DBCSR matrix CALL dbcsr_finalize(matrix_a) ! @@ -160,8 +160,9 @@ PROGRAM dbcsr_example_2 CALL mpi_comm_free(group, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_free" - ! finalize libdbcsr + ! finalize the DBCSR library CALL dbcsr_finalize_lib() + ! ! finalize mpi CALL mpi_finalize(ierr) diff --git a/examples/dbcsr_example_3.F b/examples/dbcsr_example_3.F index 62f886397a9..f7acda3aaf7 100644 --- a/examples/dbcsr_example_3.F +++ b/examples/dbcsr_example_3.F @@ -8,7 +8,7 @@ !--------------------------------------------------------------------------------------------------! PROGRAM dbcsr_example_3 - !! DBCSR example 3 + !! DBCSR example 3: !! This example shows how to multiply two dbcsr matrices USE mpi @@ -50,7 +50,7 @@ PROGRAM dbcsr_example_3 !$ ENDIF ! - ! setup the mp environment + ! setup the mpi environment CALL mpi_comm_size(MPI_COMM_WORLD, numnodes, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_size" npdims(:) = 0 @@ -65,7 +65,7 @@ PROGRAM dbcsr_example_3 !*************************************************************************************** ! - ! initialize libdbcsr + ! initialize the DBCSR library CALL dbcsr_init_lib(MPI_COMM_WORLD) ! @@ -185,8 +185,9 @@ PROGRAM dbcsr_example_3 CALL mpi_comm_free(group, ierr) IF (ierr /= 0) STOP "Error in MPI_Comm_free" - ! finalize libdbcsr + ! finalize the DBCSR library CALL dbcsr_finalize_lib() + ! ! 
finalize mpi CALL mpi_finalize(ierr) diff --git a/examples/dbcsr_example_3.cpp b/examples/dbcsr_example_3.cpp index 3d94374477a..58a9cc046cf 100644 --- a/examples/dbcsr_example_3.cpp +++ b/examples/dbcsr_example_3.cpp @@ -33,15 +33,19 @@ std::vector random_dist(int dist_size, int nbins) } +// DBCSR example 3 +// This example shows how to multiply two DBCSR matrices int main(int argc, char* argv[]) { + // initialize MPI MPI_Init(&argc, &argv); + // setup the mpi environment int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - // Make 2D grid + // make 2D grid int dims[2] = {0}; MPI_Dims_create(mpi_size, 2, dims); int periods[2] = {1}; @@ -58,20 +62,22 @@ int main(int argc, char* argv[]) << ", (" << coord[0] << ", " << coord[1] << ") in the 2D grid" << std::endl; + // initialize the DBCSR library c_dbcsr_init_lib(MPI_COMM_WORLD, nullptr); - // Total number of blocks + // the matrix will contain nblkrows_total row blocks and nblkcols_total column blocks int nblkrows_total = 4; int nblkcols_total = 4; - // Block sizes + // set the block size for each row and column std::vector row_blk_sizes(nblkrows_total, 2), col_blk_sizes(nblkcols_total, 2); + // set the row and column distributions (here the distribution is set randomly) auto row_dist = random_dist(nblkrows_total, dims[0]); auto col_dist = random_dist(nblkcols_total, dims[1]); + // set the DBCSR distribution object void* dist = nullptr; - c_dbcsr_distribution_new(&dist, group, row_dist.data(), row_dist.size(), col_dist.data(), col_dist.size()); @@ -103,7 +109,12 @@ int main(int argc, char* argv[]) } }; - // create and fill matrix a + // create the DBCSR matrices, i.e. a double precision non symmetric matrix + // with nblkrows_total x nblkcols_total blocks and + // sizes "sum(row_blk_sizes)" x "sum(col_blk_sizes)", distributed as + // specified by the dist object + + // create, fill and finalize matrix a void* matrix_a = nullptr; c_dbcsr_create_new_d(&matrix_a, "this is my matrix a", dist, 'N', row_blk_sizes.data(), row_blk_sizes.size(), @@ -111,7 +122,7 @@ int main(int argc, char* argv[]) fill_matrix(matrix_a); c_dbcsr_finalize(matrix_a); - // create and fill matrix b + // create, fill and finalize matrix b void* matrix_b = nullptr; c_dbcsr_create_new_d(&matrix_b, "this is my matrix b", dist, 'N', row_blk_sizes.data(), row_blk_sizes.size(), @@ -119,7 +130,7 @@ int main(int argc, char* argv[]) fill_matrix(matrix_b); c_dbcsr_finalize(matrix_b); - // create matrix c, empty + // create and finalize matrix c (empty) void* matrix_c = nullptr; c_dbcsr_create_new_d(&matrix_c, "matrix c", dist, 'N', row_blk_sizes.data(), row_blk_sizes.size(), @@ -129,21 +140,25 @@ int main(int argc, char* argv[]) // multiply the matrices c_dbcsr_multiply_d('N', 'N', 1.0, &matrix_a, &matrix_b, 0.0, &matrix_c, nullptr); + // print the matrices c_dbcsr_print(matrix_a); c_dbcsr_print(matrix_b); c_dbcsr_print(matrix_c); + // release the matrices c_dbcsr_release(&matrix_a); c_dbcsr_release(&matrix_b); c_dbcsr_release(&matrix_c); c_dbcsr_distribution_release(&dist); - + // free comm MPI_Comm_free(&group); + // finalize the DBCSR library c_dbcsr_finalize_lib(); + // finalize MPI MPI_Finalize(); return 0; diff --git a/examples/dbcsr_tensor_example.F b/examples/dbcsr_tensor_example_1.F similarity index 99% rename from examples/dbcsr_tensor_example.F rename to examples/dbcsr_tensor_example_1.F index d6b200bed8c..32b41cae1e4 100644 --- a/examples/dbcsr_tensor_example.F +++ b/examples/dbcsr_tensor_example_1.F @@ -7,8 +7,8 
@@ ! SPDX-License-Identifier: GPL-2.0+ ! !--------------------------------------------------------------------------------------------------! -program dbcsr_tensor_example - +program dbcsr_tensor_example_1 + !! Sparse tensor contraction example use mpi use dbcsr_api, only: & dbcsr_type, dbcsr_distribution_type, dbcsr_init_lib, dbcsr_distribution_new, & @@ -180,7 +180,7 @@ program dbcsr_tensor_example beta = beta*scale_exp gamma = gamma*scale_exp - ! iniialize mpi + ! initialize mpi call mpi_init(ierr) if (ierr /= 0) stop "error in mpi_init" diff --git a/examples/dbcsr_tensor_example_1.cpp b/examples/dbcsr_tensor_example_2.cpp similarity index 100% rename from examples/dbcsr_tensor_example_1.cpp rename to examples/dbcsr_tensor_example_2.cpp diff --git a/src/acc/README.md b/src/acc/README.md index 750d8dd5749..3355b527109 100644 --- a/src/acc/README.md +++ b/src/acc/README.md @@ -2,7 +2,14 @@ ## Overview -This folder contains the ISO_C_BINDING based Fortran code of DBCSR's [ACC-backend interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h) and [LIBSMM/ACC-interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h). Further, two stand-alone sample codes are given exercising both interfaces (benchmarks). +This folder contains the ISO_C_BINDING based Fortran code of DBCSR's [ACC-backend interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h) and [LIBSMM/ACC-interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h). It also contains the CUDA (for Nvidia GPUs) and HIP (for AMD GPUs) accelerator backends. + +Further, two stand-alone sample codes are given exercising both interfaces (benchmarks). + +## CUDA and HIP backends + +The code for both the CUDA and HIP backends is unique, and can be found in the `cuda` directory. +We switch from one backend to the other via macros (`__CUDA` and `__HIP`). ## Benchmarks diff --git a/src/acc/libsmm_acc/kernels/README.md b/src/acc/libsmm_acc/kernels/README.md index e49a3348d01..acc9d8f49ae 100644 --- a/src/acc/libsmm_acc/kernels/README.md +++ b/src/acc/libsmm_acc/kernels/README.md @@ -1,19 +1,40 @@ # libsmm_acc/kernels +`libsmm_acc`'s GPU kernels. + ## Directory Organization * [`autotuning_properties.json`](autotuning_properties.json) Properties of the autotuning procedure, read from [DBCSR source code](../libsmm_acc_benchmark.cpp) +* [`gpu_properties.json`](gpu_properties.json) GPU card properties + * [`smm_acc_common.h`](smm_acc_common.h) Functionalities common to kernel CUDA/HIP codes * [`smm_acc_dnt_base.py`](smm_acc_dnt_base.py) Kernel base class - * `smm_acc_dnt_ALGORITHM.py` Kernel class + * `smm_acc_dnt_ALGORITHM.py` Kernel class in python - * `smm_acc_dnt_ALGORITHM.h` Kernel CUDA/HIP code + * `smm_acc_dnt_ALGORITHM.h` Batched Multiply Kernel CUDA/HIP code * [`smm_acc_predict.py`](smm_acc_predict.py) Class and helper functions for parameter prediction procedure -* [`smm_acc_transpose.h`](smm_acc_transpose.h) Transposition CUDA/HIP code +* [`smm_acc_transpose.h`](smm_acc_transpose.h) Transpose CUDA/HIP code + +## Batched Multiplication Kernels + +All kernels have following signature: + +``` +template + +__global__ void +__launch_bounds__(threads, minblocks) +smm_acc_dnt_ALGORITHM +(const int *__restrict__ param_stack, const int stack_size, +const double* __restrict__ a_data, const double* __restrict__ b_data, double* c_data); +``` + +At kernel launch time, the A, B, and C matrices, as well as the product descriptors (the so-called stacks) are all located in global memory on the GPU. 
Each entry in the stack describes one matrix-matrix product: it contains three pointers to the blocks in the A, B, and C matrices. After the kernel has read a stack entry, it fetches the blocks in matrices A and B from global to shared memory, and updates the C matrix with the product of A and B. + +`libsmm_acc` provides 5 different kernels for this operation (tiny, small, medium, largeDB1, largeDB2), which are optimized for different block sizes. Please refer to the documentation inside the respective `.h` files for more details. -* [`gpu_properties.json`](gpu_properties.json) GPU card properties diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a4ca3bc7a4a..b33ba5800ab 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -57,7 +57,7 @@ endforeach () # =================================== DBCSR CORRECTNESS TESTS Define all the # tests here, will be used as the executable name -set(DBCSR_TESTS +set(DBCSR_TESTS_FTN dbcsr_unittest1 dbcsr_unittest2 dbcsr_unittest3 @@ -67,12 +67,9 @@ set(DBCSR_TESTS dbcsr_test_csr_conversions) if (NOT (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")) - set(DBCSR_TESTS_CPP dbcsr_tensor_test_cpp) + set(DBCSR_TESTS_SRCS_CPP dbcsr_tensor_test.cpp) endif () -# Common object files linked to all tests -set(dbcsr_unittest_common_SRCS dbcsr_test_add.F dbcsr_test_multiply.F) - # For each test, set a variable testname_SRCS defining the sources of that test set(dbcsr_unittest1_SRCS dbcsr_unittest1.F) set(dbcsr_unittest2_SRCS dbcsr_unittest2.F) @@ -83,6 +80,15 @@ set(dbcsr_tas_unittest_SRCS dbcsr_tas_unittest.F) set(dbcsr_test_csr_conversions_SRCS dbcsr_test_csr_conversions.F) set(dbcsr_tensor_test_cpp_SRCS dbcsr_tensor_test.cpp) +# Make a list of the source files of fortran tests +set(DBCSR_TESTS_SRCS_FTN) +foreach (dbcsr_test ${DBCSR_TESTS_FTN}) + set(DBCSR_TESTS_SRCS_FTN ${DBCSR_TESTS_SRCS_FTN} ${${dbcsr_test}_SRCS}) +endforeach () + +# Common object files linked to all tests +set(dbcsr_unittest_common_SRCS dbcsr_test_add.F dbcsr_test_multiply.F) + # instead of building a full-blown lib, it would be better to simply build an # OBJECT lib, but we would need cmake 3.12 to be able to specify # target_link_libraries on those to get the proper compile flags @@ -98,7 +104,8 @@ if (APPLE AND BLAS_LIBRARIES MATCHES "Accelerate") endif () target_link_libraries(dbcsr_unittest_common PUBLIC dbcsr) -foreach (dbcsr_test ${DBCSR_TESTS}) +# Compile Fortran tests +foreach (dbcsr_test ${DBCSR_TESTS_FTN}) if (USE_HIP) hip_add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS}) else () @@ -125,17 +132,8 @@ foreach (dbcsr_test ${DBCSR_TESTS}) endforeach () # set the __SHORT_FILE__ per file for dbcsr sources -foreach ( - tests_src - ${DBCSR_PERF_SRCS} - ${dbcsr_unittest_common_SRCS} - ${dbcsr_unittest1_SRCS} - ${dbcsr_unittest2_SRCS} - ${dbcsr_unittest3_SRCS} - ${dbcsr_unittest4_SRCS} - ${dbcsr_tensor_unittest_SRCS} - ${dbcsr_tas_unittest_SRCS} - ${dbcsr_test_csr_conversions_SRCS}) +foreach (tests_src ${DBCSR_PERF_SRCS} ${dbcsr_unittest_common_SRCS} + ${DBCSR_TESTS_SRCS_FTN}) # add_fypp_sources returns a path in the current binary dir get_filename_component(short_file "${tests_src}" NAME) set_source_files_properties( @@ -143,128 +141,135 @@ foreach ( endforeach () if (WITH_C_API) - foreach (dbcsr_test_cpp ${DBCSR_TESTS_CPP}) - add_executable(${dbcsr_test_cpp} ${${dbcsr_test_cpp}_SRCS}) - target_link_libraries(${dbcsr_test_cpp} dbcsr_c MPI::MPI_CXX) + foreach (dbcsr_test_cpp_src ${DBCSR_TESTS_SRCS_CPP}) + get_filename_component(dbcsr_test_cpp_name ${dbcsr_test_cpp_src} NAME_WE) 
+ add_executable(${dbcsr_test_cpp_name} ${dbcsr_test_cpp_src}) + target_link_libraries(${dbcsr_test_cpp_name} dbcsr_c MPI::MPI_CXX) # register unittest executable with CMake if (USE_MPI) separate_arguments(MPIEXEC_PREFLAGS) add_test( - NAME ${dbcsr_test_cpp} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${num_ranks} - ${MPIEXEC_PREFLAGS} ./${dbcsr_test_cpp} ${MPIEXEC_POSTFLAGS}) + NAME ${dbcsr_test_cpp_name} + COMMAND + ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${num_ranks} + ${MPIEXEC_PREFLAGS} ./${dbcsr_test_cpp_name} ${MPIEXEC_POSTFLAGS}) else () - add_test(NAME ${dbcsr_test_cpp} COMMAND ./${dbcsr_test_cpp}) + add_test(NAME ${dbcsr_test_cpp_name} COMMAND ./${dbcsr_test_cpp_name}) endif () if (OpenMP_FOUND) set_tests_properties( - ${dbcsr_test_cpp} PROPERTIES ENVIRONMENT - OMP_NUM_THREADS=${TEST_OMP_THREADS}) + ${dbcsr_test_cpp_name} PROPERTIES ENVIRONMENT + OMP_NUM_THREADS=${TEST_OMP_THREADS}) endif () endforeach () endif () # =================================== GPU BACKEND TESTS (CUDA / HIP) + +# Add custom commands for the test files that need to be generated from a +# template +file(RELATIVE_PATH CURRENT_BINARY_DIR_RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/.. + ${CMAKE_CURRENT_BINARY_DIR}) + +# libsmm_acc_unittest_multiply +add_custom_command( + OUTPUT libsmm_acc_unittest_multiply.cpp + COMMAND + ${Python_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_unittest_multiply.py + --base_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. --out_dir + ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU} + DEPENDS libsmm_acc_unittest_multiply.cpp.template + generate_libsmm_acc_unittest_multiply.py + COMMENT "Generate tests/libsmm_acc_unittest_multiply.cpp") +add_custom_target(generate_libsmm_acc_unittest_multiply_test_cpp + DEPENDS libsmm_acc_unittest_multiply.cpp) + +# libsmm_acc_timer_multiply +add_custom_command( + OUTPUT libsmm_acc_timer_multiply.cpp + COMMAND + ${Python_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_timer_multiply.py --base_dir + ${CMAKE_CURRENT_SOURCE_DIR}/.. --out_dir ${CURRENT_BINARY_DIR_RELATIVE} + --gpu_version=${WITH_GPU} + DEPENDS libsmm_acc_timer_multiply.cpp.template + generate_libsmm_acc_timer_multiply.py + COMMENT "Generate tests/libsmm_acc_unittest_transpose.cpp") +add_custom_target(generate_libsmm_acc_timer_multiply_test_cpp + DEPENDS libsmm_acc_timer_multiply.cpp) + if (USE_CUDA OR USE_HIP) # All libsmm_acc tests - set(LIBSMM_ACC_TESTS_BUILD - libsmm_acc_unittest_multiply libsmm_acc_unittest_transpose - libsmm_acc_timer_multiply) + set(LIBSMM_ACC_TESTS_SRCS + ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_unittest_multiply.cpp + ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_timer_multiply.cpp + libsmm_acc_unittest_transpose.cpp) # Tests that need no additional arguments to be run - set(LIBSMM_ACC_SIMPLE_TESTS libsmm_acc_unittest_multiply - libsmm_acc_unittest_transpose) - - # Add custom commands for the test files that need to be generated from a - # template libsmm_acc_unittest_multiply, libsmm_acc_timer_multiply - file(RELATIVE_PATH CURRENT_BINARY_DIR_RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/.. - ${CMAKE_CURRENT_BINARY_DIR}) - - add_custom_command( - OUTPUT libsmm_acc_unittest_multiply.cpp - COMMAND - ${Python_EXECUTABLE} - ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_unittest_multiply.py - --base_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. 
--out_dir
-      ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU}
-    DEPENDS libsmm_acc_unittest_multiply.cpp.template
-            generate_libsmm_acc_unittest_multiply.py
-    COMMENT "Generate tests/libsmm_acc_unittest_multiply.cpp")
-
-  add_custom_command(
-    OUTPUT libsmm_acc_timer_multiply.cpp
-    COMMAND
-      ${Python_EXECUTABLE}
-      ${CMAKE_CURRENT_SOURCE_DIR}/generate_libsmm_acc_timer_multiply.py
-      --base_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. --out_dir
-      ${CURRENT_BINARY_DIR_RELATIVE} --gpu_version=${WITH_GPU}
-    DEPENDS libsmm_acc_timer_multiply.cpp.template
-            generate_libsmm_acc_timer_multiply.py
-    COMMENT "Generate tests/libsmm_acc_unittest_transpose.cpp")
+  set(LIBSMM_ACC_NOARG_TESTS libsmm_acc_unittest_multiply
+                             libsmm_acc_unittest_transpose)
 
   # Add executables for all libsmm_acc tests
   if (USE_CUDA)
-    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_BUILD})
+    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
+
+      get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE)
-      add_executable(${libsmm_acc_test} ${libsmm_acc_test}.cpp)
-      target_compile_definitions(${libsmm_acc_test} PRIVATE __CUDA)
+      add_executable(${libsmm_acc_test_name} ${libsmm_acc_test})
+      target_compile_definitions(${libsmm_acc_test_name} PRIVATE __CUDA)
       target_include_directories(
-        ${libsmm_acc_test} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        ${libsmm_acc_test_name}
+        PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-      target_link_libraries(${libsmm_acc_test} dbcsr)
+      target_link_libraries(${libsmm_acc_test_name} dbcsr)
       if (OpenMP_FOUND)
-        target_link_libraries(${libsmm_acc_test} OpenMP::OpenMP_CXX)
+        target_link_libraries(${libsmm_acc_test_name} OpenMP::OpenMP_CXX)
       endif ()
     endforeach ()
   else () # i.e. USE_HIP
-    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_BUILD})
-      set_source_files_properties(${libsmm_acc_test}.cpp
+    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
+      set_source_files_properties(${libsmm_acc_test}
                                   PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-    endforeach ()
-    hip_add_executable(
-      libsmm_acc_unittest_multiply
-      ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_unittest_multiply.cpp)
-    target_link_options(libsmm_acc_unittest_multiply PRIVATE ${HIP_ARCH_FLAGS})
-    hip_add_executable(
-      libsmm_acc_timer_multiply
-      ${CMAKE_CURRENT_BINARY_DIR}/libsmm_acc_timer_multiply.cpp)
-    target_link_options(libsmm_acc_timer_multiply PRIVATE ${HIP_ARCH_FLAGS})
-    hip_add_executable(libsmm_acc_unittest_transpose
-                       libsmm_acc_unittest_transpose.cpp)
-    target_link_options(libsmm_acc_unittest_transpose PRIVATE ${HIP_ARCH_FLAGS})
+      get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE)
+
+      hip_add_executable(${libsmm_acc_test_name} ${libsmm_acc_test})
+      target_link_options(${libsmm_acc_test_name} PRIVATE ${HIP_ARCH_FLAGS})
+
+      target_link_libraries(${libsmm_acc_test_name} dbcsr)
+      target_compile_definitions(${libsmm_acc_test_name} PRIVATE __HIP)
+
+    endforeach ()
 
     # Workaround issue in hip_add_library: explicitely write dependency between
-    # the unit test & the script that generates it
-    add_custom_target(generate_libsmm_acc_unittest_multiply_test_cpp
-                      DEPENDS libsmm_acc_unittest_multiply.cpp)
+    # the test executable and the generated test c++ source file
     add_dependencies(libsmm_acc_unittest_multiply
                      generate_libsmm_acc_unittest_multiply_test_cpp)
-    add_custom_target(generate_libsmm_acc_timer_multiply_test_cpp
-                      DEPENDS libsmm_acc_timer_multiply.cpp)
     add_dependencies(libsmm_acc_timer_multiply
                      generate_libsmm_acc_timer_multiply_test_cpp)
 
-    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_BUILD})
+    foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
-      target_compile_definitions(${libsmm_acc_test} PRIVATE __HIP)
-      target_include_directories(${libsmm_acc_test}
+      get_filename_component(libsmm_acc_test_name ${libsmm_acc_test} NAME_WE)
+      target_compile_definitions(${libsmm_acc_test_name} PRIVATE __HIP)
+      target_include_directories(${libsmm_acc_test_name}
                                  PRIVATE ${HIP_PATH}/../include)
-      target_link_libraries(${libsmm_acc_test} dbcsr)
+      target_link_libraries(${libsmm_acc_test_name} dbcsr)
     endforeach ()
   endif ()
 
   # Add tests that do not need additional arguments
-  foreach (libsmm_acc_test ${LIBSMM_ACC_SIMPLE_TESTS})
+  foreach (libsmm_acc_test ${LIBSMM_ACC_NOARG_TESTS})
     add_test(NAME ${libsmm_acc_test} COMMAND ${libsmm_acc_test})
   endforeach ()
@@ -275,3 +280,36 @@ if (USE_CUDA OR USE_HIP)
     COMMAND libsmm_acc_timer_multiply predicted)
 endif ()
+
+# =================================== DOCUMENTATION GENERATION Copy test source
+# files into the build directory so that their documentation can be generated by
+# FORD
+set(DBCSR_TESTS dbcsr_performance_driver.F ${DBCSR_TESTS_SRCS_FTN}
+    ${DBCSR_TESTS_SRCS_CPP} libsmm_acc_unittest_transpose.cpp)
+
+# Make a list of the copy commands
+set(test_copy_commands)
+foreach (test ${DBCSR_TESTS})
+  list(
+    APPEND
+    test_copy_commands
+    COMMAND
+    ${CMAKE_COMMAND}
+    -E
+    copy
+    ${CMAKE_SOURCE_DIR}/tests/${test}
+    ${CMAKE_BINARY_DIR}/tests)
+endforeach ()
+
+add_custom_target(
+  doc_copy_tests
+  COMMENT "Copy tests for documentation generation"
+  COMMAND mkdir -p ${CMAKE_BINARY_DIR}/tests ${test_copy_commands}
+  VERBATIM)
+
+# libsmm_acc_unittest_multiply.cpp and libsmm_acc_timer_multiply.cpp do not need
+# to be copied to the build directory since they are generated at build-time and
+# written to the build directory directly. We just need to make sure that the
+# documentation generation depends on the generation of these tests.
+add_dependencies(doc_copy_tests generate_libsmm_acc_unittest_multiply_test_cpp)
+add_dependencies(doc_copy_tests generate_libsmm_acc_timer_multiply_test_cpp)
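Editor's note: a minimal sketch of exercising the new `doc_copy_tests` target from an already-configured build directory. The target name comes from the hunk above; driving it through the generic `cmake --build` front end (rather than invoking the generator directly) is an assumption about the local setup.

```bash
# Stage the test sources into <build>/tests and trigger generation of the
# build-time .cpp tests, so FORD can document them without a full build.
cmake --build . --target doc_copy_tests
```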
diff --git a/tests/README b/tests/README
deleted file mode 100644
index d3abfa49765..00000000000
--- a/tests/README
+++ /dev/null
@@ -1,32 +0,0 @@
-!----------------------------------------------------------------------------------------------------!
-! Copyright (C) by the DBCSR developers group - All rights reserved !
-! This file is part of the DBCSR library. !
-! !
-! For information on the license, see the LICENSE file. !
-! For further information please visit https://dbcsr.cp2k.org !
-! SPDX-License-Identifier: GPL-2.0+ !
-!----------------------------------------------------------------------------------------------------!
-
-DBCSR Testing and Performance
-
-Performance:
-============
-
-* Building: compile libdbcsr as usual but use the 'dbcsr_performance_driver' target, i.e.:
-
-  > cd $LIBDBCSR_HOME
-  > make -j dbcsr_performance_driver
-
-* Running, examples:
-
-  Modify the input.perf input file and pass it as standard input
-  to the executable generated above, e.g.:
-
-  a) To test pure MPI performance test using [n] nodes:
-
-  > mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log
-
-  b) To test hybrid MPI/OpenMP performance test using [n] nodes, each spanning [t] threads:
-
-  > export OMP_NUM_THREADS=[t]; mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log
-
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000000..a263ba0d7dc
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,39 @@
+!----------------------------------------------------------------------------------------------------!
+! Copyright (C) by the DBCSR developers group - All rights reserved !
+! This file is part of the DBCSR library. !
+! !
+! For information on the license, see the LICENSE file. !
+! For further information please visit https://dbcsr.cp2k.org !
+! SPDX-License-Identifier: GPL-2.0+ !
+!----------------------------------------------------------------------------------------------------!
+
+# DBCSR Testing and Performance
+
+## Correctness tests
+
+- [dbcsr_tas_unittest](dbcsr_tas_unittest.F) : unit test for tall-and-skinny matrices
+- [dbcsr_tensor_test](dbcsr_tensor_test.cpp) : test the tensor contraction (13|2)x(54|21)=(3|45) 31 and other functions
+- [dbcsr_tensor_unittest](dbcsr_tensor_unittest.F) : unit test for tensor functionalities
+- [dbcsr_test_csr_conversions](dbcsr_test_csr_conversions.F) : test DBCSR to CSR conversion with random matrices
+- [dbcsr_unittest_1](dbcsr_unittest1.F) : test matrix operations: add, multiply and multiply-ghost
+- [dbcsr_unittest_2](dbcsr_unittest2.F) : test matrix-multiply with large blocks (block size=100) and rectangular matrices (block size=5)
+
+### GPU-backend correctness tests:
+
+- [dbcsr_unittest_3](dbcsr_unittest3.F) : test matrix-multiply with various block sizes that are run by the libsmm_acc GPU backend if DBCSR is compiled with GPU support
+- [libsmm_acc_unittest_multiply](libsmm_acc_unittest_multiply.cpp.template) : tests all libsmm_acc batch-multiplication kernels
+- [libsmm_acc_unittest_transpose](libsmm_acc_unittest_transpose.cpp) : tests all libsmm_acc transpose kernels
+
+## Performance tests
+
+DBCSR performance tests:
+
+- [dbcsr_performance_driver](dbcsr_performance_driver.F) : performance tester for matrix operations. The input matrices can be described in an input file in order to test different configurations. See below.
+
+### GPU backend performance tests:
+
+- [libsmm_acc_timer_multiply](libsmm_acc_timer_multiply.cpp.template) : times all libsmm_acc batch-multiplication kernels
+
+---
+
+See the [tests' documentation](../docs/guide/2-user-guide/2-tests/index.md).
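Editor's note: the new `tests/README.md` drops the explicit run commands of the old `tests/README` in favour of the linked documentation pages. For quick reference, a sketch of driving the performance tester follows; the commands mirror the deleted README, so the binary name `dbcsr_performance_driver.x` and its `./bin` location are taken from that file and may differ in a CMake build.

```bash
# Pure MPI run on [n] ranks, using the template input file
mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log

# Hybrid MPI/OpenMP run: [n] ranks, each spanning [t] threads
export OMP_NUM_THREADS=[t]
mpirun -np [n] ./bin/dbcsr_performance_driver.x tests/input.perf 2>&1 | tee perf.log
```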
diff --git a/tests/dbcsr_tas_unittest.F b/tests/dbcsr_tas_unittest.F
index 7c3971fce02..0281967f520 100644
--- a/tests/dbcsr_tas_unittest.F
+++ b/tests/dbcsr_tas_unittest.F
@@ -9,7 +9,7 @@
 
 PROGRAM dbcsr_tas_unittest
-   !! unit testing for tall-and-skinny matrices
+   !! Unit testing for tall-and-skinny matrices
    USE dbcsr_api, ONLY: dbcsr_finalize_lib, &
                         dbcsr_init_lib, &
                         dbcsr_print_statistics
diff --git a/tests/dbcsr_tensor_unittest.F b/tests/dbcsr_tensor_unittest.F
index 91bf8e2e9a3..e36af4c207e 100644
--- a/tests/dbcsr_tensor_unittest.F
+++ b/tests/dbcsr_tensor_unittest.F
@@ -8,7 +8,7 @@
 !----------------------------------------------------------------------------------------------------!
 
 PROGRAM dbcsr_tensor_unittest
-   !! DBCSR tensor unit test.
+   !! DBCSR tensor unit test
    USE dbcsr_api, ONLY: dbcsr_finalize_lib, &
                         dbcsr_init_lib, &
                         dbcsr_type_real_8, &
diff --git a/tests/dbcsr_unittest1.F b/tests/dbcsr_unittest1.F
index ce1c1b5f4a2..da6db071cf3 100644
--- a/tests/dbcsr_unittest1.F
+++ b/tests/dbcsr_unittest1.F
@@ -7,8 +7,9 @@
 ! SPDX-License-Identifier: GPL-2.0+ !
 !----------------------------------------------------------------------------------------------------!
 
-PROGRAM dbcsr_unittest
-   !! Tests for DBCSR operations
+PROGRAM dbcsr_unittest_1
+   !! Tests for DBCSR operations:
+   !! add, multiply and multiply-ghost
 
    USE dbcsr_kinds, ONLY: dp
    USE dbcsr_lib, ONLY: dbcsr_finalize_lib, &
@@ -318,4 +319,4 @@ PROGRAM dbcsr_unittest
 !
 ! finalize libdbcsr errors
 
-END PROGRAM dbcsr_unittest
+END PROGRAM dbcsr_unittest_1
diff --git a/tests/dbcsr_unittest2.F b/tests/dbcsr_unittest2.F
index 7a9e7a35d06..719648d60a0 100644
--- a/tests/dbcsr_unittest2.F
+++ b/tests/dbcsr_unittest2.F
@@ -7,8 +7,10 @@
 ! SPDX-License-Identifier: GPL-2.0+ !
 !----------------------------------------------------------------------------------------------------!
 
-PROGRAM dbcsr_unittest
-   !! Tests for DBCSR operations
+PROGRAM dbcsr_unittest_2
+   !! Tests for DBCSR multiply:
+   !! large blocks (block size=100)
+   !! and rectangular matrices (block size=5)
 
    USE dbcsr_kinds, ONLY: dp
    USE dbcsr_lib, ONLY: dbcsr_finalize_lib, &
@@ -126,4 +128,4 @@ PROGRAM dbcsr_unittest
 
    CALL mp_world_finalize()
 
-END PROGRAM dbcsr_unittest
+END PROGRAM dbcsr_unittest_2
diff --git a/tests/dbcsr_unittest3.F b/tests/dbcsr_unittest3.F
index c82273a14ff..369d639a56e 100644
--- a/tests/dbcsr_unittest3.F
+++ b/tests/dbcsr_unittest3.F
@@ -7,8 +7,10 @@
 ! SPDX-License-Identifier: GPL-2.0+ !
 !----------------------------------------------------------------------------------------------------!
 
-PROGRAM dbcsr_unittest
-   !! Tests for DBCSR's libsmm_acc backend
+PROGRAM dbcsr_unittest_3
+   !! Tests for DBCSR multiply:
+   !! various block sizes that are run by the libsmm_acc GPU backend if
+   !! DBCSR is compiled with GPU support.
 
    USE dbcsr_kinds, ONLY: dp
    USE dbcsr_lib, ONLY: dbcsr_finalize_lib, &
@@ -139,4 +141,4 @@ PROGRAM dbcsr_unittest
 
    CALL mp_world_finalize()
 
-END PROGRAM dbcsr_unittest
+END PROGRAM dbcsr_unittest_3
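Editor's note: with the programs renamed to `dbcsr_unittest_1/2/3`, an illustrative way to run them through CTest from the build directory is sketched below. This assumes the corresponding `add_test()` registrations use matching names; only the libsmm_acc test registration is visible in this patch.

```bash
# Run the renamed DBCSR correctness tests (assumed CTest names)
ctest -R 'dbcsr_unittest' --output-on-failure

# GPU-backend tests registered in the CMake hunk above
ctest -R 'libsmm_acc_unittest' --output-on-failure
```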
diff --git a/tests/input.perf b/tests/input.perf
index 90837cb6509..aafa4e04764 100644
--- a/tests/input.perf
+++ b/tests/input.perf
@@ -1,7 +1,10 @@
-# npcols MPI grid, 0 leaves MPI to find the best grid.
-# Note that the total number of processors must be divisible per npcols
+# Template Input File for DBCSR Performance Driver
+# -----------------------------------------------------------------------
+# npcols MPI grid
+# - 0 leaves MPI to find the best grid.
+# - Note that the total number of processors must be divisible by npcols
 0
-# Use MPI-RMA
+# use MPI-RMA
 F
 # operation
 dbcsr_multiply
@@ -9,7 +12,7 @@ dbcsr_multiply
 1000
 1000
 1000
-# sparsity (A, B, C)
+# sparsity (matrix A, matrix B, matrix C)
 0.0d0
 0.0d0
 0.0d0
@@ -21,22 +24,31 @@
 N
 N
 N
 # data type
+# - 3: double
+# - other types, see "Type definitions" in dbcsr/src/data/dbcsr_data_types.F
 3
 # alpha (real, imag)
 1.0d0
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
-# limits (0 means full size)
-# row
+0.0d0
+# limits
+# - 0 means full size
+# - row
+# -- limRowL (First full row of limiting submatrix)
 0
+# -- limRowU
 0
-# col
+# - col
+# -- limColL (First full col of limiting submatrix)
 0
+# -- limColU
 0
-# k
+# - k
+# -- limKL (First full col of limiting inner product)
 0
+# -- limKU
 0
 # retain sparsity (T/F)
 F
@@ -47,8 +59,17 @@
 F
 1
 1
 # the m blocks (multiplicity, block size, ...)
+# - this configuration, e.g., will generate blocks of
+# - size 5 in the m-dimension
 1
 5
+# - this configuration would generate a block of
+# - size 5 followed by 2 blocks of size 3 in the m-dimension,
+# - followed by a block of size 5, etc. until size M is reached
+# - 1
+# - 5
+# - 2
+# - 3
 # the n blocks (multiplicity, block size, ...)
 1
 5
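Editor's note: the comments added above describe the block-layout stanza by example. The fragment below is an illustration only; the file name `my_blocks.perf` is hypothetical and this is not a complete driver input. It writes out the second configuration described in the comments, i.e. one block of size 5 followed by two blocks of size 3, repeated along the m-dimension until size M is reached.

```bash
# Append only the m-block stanza of a performance-driver input (fragment, not a full file).
cat >> my_blocks.perf <<'EOF'
# the m blocks (multiplicity, block size, ...)
1
5
2
3
EOF
```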
diff --git a/tests/inputs/test_H2O.perf b/tests/inputs/test_H2O.perf
index df09e2bafbc..00e9200e7c2 100644
--- a/tests/inputs/test_H2O.perf
+++ b/tests/inputs/test_H2O.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect1_dense.perf b/tests/inputs/test_rect1_dense.perf
index 3ee8a8b7d63..4f1f73918f3 100644
--- a/tests/inputs/test_rect1_dense.perf
+++ b/tests/inputs/test_rect1_dense.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect1_sparse.perf b/tests/inputs/test_rect1_sparse.perf
index 5c876e63133..d54ba15cd19 100644
--- a/tests/inputs/test_rect1_sparse.perf
+++ b/tests/inputs/test_rect1_sparse.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect2_dense.perf b/tests/inputs/test_rect2_dense.perf
index aacf4b1d336..316a02368e9 100644
--- a/tests/inputs/test_rect2_dense.perf
+++ b/tests/inputs/test_rect2_dense.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_rect2_sparse.perf b/tests/inputs/test_rect2_sparse.perf
index ad609b4277a..d3b283bf7a4 100644
--- a/tests/inputs/test_rect2_sparse.perf
+++ b/tests/inputs/test_rect2_sparse.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_singleblock.perf b/tests/inputs/test_singleblock.perf
index 7295fb7eac6..fb6158d11e8 100644
--- a/tests/inputs/test_singleblock.perf
+++ b/tests/inputs/test_singleblock.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
@@ -59,4 +59,4 @@ F
 T
 0.1E-10
 0.418186760034529E+06
-0.190157258297048E+06
+0.190157258297048E+06
diff --git a/tests/inputs/test_square_dense.perf b/tests/inputs/test_square_dense.perf
index 008d712a485..9be924b629e 100644
--- a/tests/inputs/test_square_dense.perf
+++ b/tests/inputs/test_square_dense.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_square_sparse.perf b/tests/inputs/test_square_sparse.perf
index 1aebcfcc17b..76e82ed2f60 100644
--- a/tests/inputs/test_square_sparse.perf
+++ b/tests/inputs/test_square_sparse.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_square_sparse_bigblocks.perf b/tests/inputs/test_square_sparse_bigblocks.perf
index 4d3317ef277..b81565c788c 100644
--- a/tests/inputs/test_square_sparse_bigblocks.perf
+++ b/tests/inputs/test_square_sparse_bigblocks.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tests/inputs/test_square_sparse_rma.perf b/tests/inputs/test_square_sparse_rma.perf
index fb2115ca3bc..4938966ce7c 100644
--- a/tests/inputs/test_square_sparse_rma.perf
+++ b/tests/inputs/test_square_sparse_rma.perf
@@ -27,7 +27,7 @@ N
 0.0d0
 # beta (real, imag)
 1.0d0
-0.0d0
+0.0d0
 # limits (0 means full size)
 # row
 0
diff --git a/tools/docker/Dockerfile.build-env-rocm b/tools/docker/Dockerfile.build-env-rocm
new file mode 100644
index 00000000000..a947e70467f
--- /dev/null
+++ b/tools/docker/Dockerfile.build-env-rocm
@@ -0,0 +1,43 @@
+FROM rocm/dev-ubuntu-18.04:latest
+
+# install compilers, libraries & co
+RUN apt-get update
+RUN apt-get install -y \
+    locales \
+    gfortran \
+    gcc-7 \
+    g++-7 \
+    llvm-7-dev \
+    llvm-7-tools \
+    mpich \
+    libomp-7-dev \
+    libmpich-dev \
+    libopenblas-dev \
+    wget
+
+# install rocm libraries
+RUN wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
+RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list
+RUN apt-get update
+RUN apt-get install -y \
+    rocm-dev \
+    rocblas \
+    rocsolver \
+    hipblas
+
+# install git 2.18+
+RUN apt-get install -y software-properties-common
+RUN add-apt-repository ppa:git-core/ppa
+RUN apt-get install -y git
+
+# install ninja
+RUN apt-get install -y wget
+RUN wget https://github.com/Kitware/ninja/releases/download/v1.10.0.gfb670.kitware.jobserver-1/ninja-1.10.0.gfb670.kitware.jobserver-1_x86_64-linux-gnu.tar.gz
+RUN tar -xzvf ninja-1.10.0.gfb670.kitware.jobserver-1_x86_64-linux-gnu.tar.gz
+ENV PATH="/ninja-1.10.0.gfb670.kitware.jobserver-1_x86_64-linux-gnu:${PATH}"
+
+# install cmake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.17.0/cmake-3.17.0-Linux-x86_64.tar.gz
+RUN tar -xzvf cmake-3.17.0-Linux-x86_64.tar.gz
+ENV PATH="/cmake-3.17.0-Linux-x86_64/bin:${PATH}"
+
diff --git a/tools/docker/README.md b/tools/docker/README.md
index 5b6e485d658..1f4093b56fa 100644
--- a/tools/docker/README.md
+++ b/tools/docker/README.md
@@ -31,3 +31,17 @@ If you need to rebuild the image, use:
 $ cd dbcsr/tools/docker
 $ docker build -t dbcsr/build-env-ubuntu-20.04 -f Dockerfile.build-env-ubuntu .
 ```
+
+## ROCm Build Environment
+
+The ROCm build-environment image (`Dockerfile.build-env-rocm`) is based on Ubuntu 18.04 and contains:
+
+* GNU Fortran Compiler
+* OpenBLAS
+* MPICH
+* CMake (recent version)
+* Ninja (recent version)
+* Git 2.18+
+* ROCm
+* ROCm libraries (rocblas, rocsolver, hipblas)
+
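Editor's note: mirroring the rebuild command for the Ubuntu image shown in the context lines above, a minimal sketch for building the new ROCm image locally. Only the Dockerfile name comes from this patch; the `dbcsr/build-env-rocm` image tag is an assumption.

```bash
# Build the ROCm build-environment image (hypothetical tag name)
cd dbcsr/tools/docker
docker build -t dbcsr/build-env-rocm -f Dockerfile.build-env-rocm .
```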