From 1ce24276930473ec487dff94045b0acd1ae5027c Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 14:25:14 +0100
Subject: [PATCH 01/22] OpenCL-BE/LIBSMM: verbose output and documentation.
 Improved auto-tuning scripts. Minor fixes after #419.

* Introduced (runtime-)verbosity level. Print device name (non-zero verbosity).
* Fixed issue (https://github.com/cp2k/dbcsr/pull/419#issuecomment-772793542).
* Renamed ACC_OPENCL_VERBOSE to ACC_OPENCL_DEBUG.
* ACC benchmark drivers: inform if no device was found.
* Improved documentation and documented ACC_OPENCL_VERBOSE.
* Introduced verbose output (time needed for kernel compilation, etc).
* tune_multiply.py: option to only rely on primary objective.
* tune_multiply.py: catch CTRL-C and save configuration.
* tune_multiply.sh: relay result code of failing script.
* tune_multiply.sh: continuation with wrapper script.
---
 src/acc/acc_bench_smm.c             | 16 +++++---
 src/acc/acc_bench_trans.c           | 10 ++++-
 src/acc/opencl/README.md            |  5 ++-
 src/acc/opencl/acc_opencl.c         | 35 ++++++++++++----
 src/acc/opencl/acc_opencl.h         |  9 +++--
 src/acc/opencl/acc_opencl_event.c   |  4 +-
 src/acc/opencl/acc_opencl_stream.c  |  4 +-
 src/acc/opencl/smm/README.md        | 19 ++++++---
 src/acc/opencl/smm/opencl_libsmm.c  | 31 ++++++++++++---
 src/acc/opencl/smm/tune_multiply.py | 62 ++++++++++++++++++++++-------
 src/acc/opencl/smm/tune_multiply.sh | 13 +++++-
 11 files changed, 159 insertions(+), 49 deletions(-)

diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c
index 560f6b59e5e..793bac0c2f9 100644
--- a/src/acc/acc_bench_smm.c
+++ b/src/acc/acc_bench_smm.c
@@ -106,6 +106,8 @@ int main(int argc, char* argv[])
   printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "",
     nrepeat, stack_size, m, n, k, nc, na, nb);
   CHECK(c_dbcsr_acc_init(), &result);
+  /* note: libsmm_acc_init() may imply acc_init() */
+  CHECK(libsmm_acc_init(), &result);
   CHECK(c_dbcsr_acc_get_ndevices(&ndevices), &result);
   if (0 < ndevices) {
 #if defined(_DEBUG)
@@ -113,8 +115,9 @@ int main(int argc, char* argv[])
 #endif
   }
   else {
-#if defined(_DEBUG)
-    fprintf(stderr, "Error: no device found!\n");
+    fprintf(stderr, "No ACC-device found!\n");
+#if !defined(__CUDA)
+    CHECK(libsmm_acc_finalize(), NULL);
 #endif
     CHECK(c_dbcsr_acc_finalize(), NULL);
     return result;
@@ -165,14 +168,14 @@ int main(int argc, char* argv[])
     CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
       DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result);
   }
-#if defined(USE_LIBXSMM)
+# if defined(USE_LIBXSMM)
   CHECK(c_dbcsr_acc_stream_sync(stream), &result);
   start = libxsmm_timer_tick();
-#endif
+# endif
   /* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */
   CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
     DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
-#if defined(USE_LIBXSMM)
+# if defined(USE_LIBXSMM)
   CHECK(c_dbcsr_acc_stream_sync(stream), &result);
   transpose = libxsmm_timer_duration(start, libxsmm_timer_tick());
 # endif
@@ -282,6 +285,9 @@ int main(int argc, char* argv[])
   CHECK(c_dbcsr_acc_dev_mem_deallocate(bmat_dev), NULL);
   CHECK(c_dbcsr_acc_dev_mem_deallocate(cmat_dev), NULL);
   CHECK(c_dbcsr_acc_stream_destroy(stream), NULL);
+#if !defined(__CUDA)
+  CHECK(libsmm_acc_finalize(), NULL);
+#endif
   CHECK(c_dbcsr_acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     fprintf(stderr, "FAILED\n");
diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c
index b10bb0b3d82..ccac1c8d292 100644
--- a/src/acc/acc_bench_trans.c
+++ b/src/acc/acc_bench_trans.c
@@ -91,6 +91,8 @@ int main(int argc, char* argv[])
   assert(m <= (mn / n) && 0 == (mn % n));
   printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
   CHECK(c_dbcsr_acc_init(), &result);
+  /* note: libsmm_acc_init() may imply acc_init() */
+  CHECK(libsmm_acc_init(), &result);
   CHECK(c_dbcsr_acc_get_ndevices(&ndevices), &result);
   if (0 < ndevices) {
 #if defined(_DEBUG)
@@ -98,8 +100,9 @@ int main(int argc, char* argv[])
 #endif
   }
   else {
-#if defined(_DEBUG)
-    fprintf(stderr, "Error: no device found!\n");
+    fprintf(stderr, "No ACC-device found!\n");
+#if !defined(__CUDA)
+    CHECK(libsmm_acc_finalize(), NULL);
 #endif
     CHECK(c_dbcsr_acc_finalize(), NULL);
     return result;
@@ -210,6 +213,9 @@ int main(int argc, char* argv[])
   CHECK(c_dbcsr_acc_dev_mem_deallocate(stack_dev), NULL);
   CHECK(c_dbcsr_acc_dev_mem_deallocate(mat_dev), NULL);
   CHECK(c_dbcsr_acc_stream_destroy(stream), NULL);
+#if !defined(__CUDA)
+  CHECK(libsmm_acc_finalize(), NULL);
+#endif
   CHECK(c_dbcsr_acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     fprintf(stderr, "FAILED\n");
diff --git a/src/acc/opencl/README.md b/src/acc/opencl/README.md
index 47791410edc..9148c0b9a25 100644
--- a/src/acc/opencl/README.md
+++ b/src/acc/opencl/README.md
@@ -8,7 +8,7 @@ The OpenCL backend implements the [ACC interface](https://github.com/cp2k/dbcsr/
 
 ### Compile-time Settings
 
-Compile-time settings are (implicitly) documented and can be adjusted by editing [acc_opencl.h](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/acc_opencl.h) (adjusting the build-line as per `-D` is possible as well but less convenient). For example, `ACC_OPENCL_STREAM_PRIORITIES` is enabled by default (and further confirmed at runtime/build-time) but can be disabled, or `ACC_OPENCL_VERBOSE` (which is disabled by default) can be enabled for debug purpose. More sensitive/private compile-time settings may be available within particular translation units like in `acc_opencl_mem.c`.
+Compile-time settings are (implicitly) documented and can be adjusted by editing [acc_opencl.h](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/acc_opencl.h) (adjusting the build-line as per `-D` is possible as well but less convenient). For example, `ACC_OPENCL_STREAM_PRIORITIES` is enabled by default (and further confirmed at runtime/build-time) but can be disabled, or `ACC_OPENCL_DEBUG` (which is disabled by default) can be enabled for debug purpose. More sensitive/private compile-time settings may be available within particular translation units like in `acc_opencl_mem.c`.
 
 An application of compile-time settings (and perhaps a valuable contribution) might be to call a GPU library in OpenCL-based LIBSMM. In such case, Shared Virtual Memory support (SVM) in OpenCL comes handy and can be enabled per `ACC_OPENCL_SVM`. The latter allows then to simply take the raw pointer out of an `cl_mem` object, and pass it into such library/function (which in turn can work across language borders, etc.).
 
@@ -19,6 +19,9 @@ Runtime settings are made by the means of environment variables (implemented in
 * `ACC_OPENCL_VENDOR`: character string matching the vendor of the OpenCL device in an case-insensitive fashion, e.g., "intel".
 * `ACC_OPENCL_DEVTYPE`: character string matching the device-kind like "cpu", "gpu", or another kind if neither CPU or GPU.
 * `ACC_OPENCL_DEVICE`: non-negative integer number to select a device from the (internally enumerated) list of devices.
+* `ACC_OPENCL_VERBOSE`: verbosity level (integer).
+    * `ACC_OPENCL_VERBOSE=1`: outputs (stderr) the number of devices found and the name of the selected device.
+    * `ACC_OPENCL_VERBOSE=2`: outputs (stderr) the duration needed to generate a requested kernel.
 
 The OpenCL backend enumerates and orders devices primarily by device-kind (GPU, CPU, and others in that order) and by memory capacity (secondary criterion). Device IDs are zero-based as per ACC interface (and less than what is permitted/returned by `acc_get_ndevices`).
 
diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index 88553be672f..fa18c94f770 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -131,7 +131,11 @@ int c_dbcsr_acc_init(void)
 {
 #if defined(_OPENMP)
   /* initialization/finalization is not meant to be thread-safe */
-  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+  int result = ((0 == omp_in_parallel()
+# if /*WORKAROUND*/defined(__DBCSR_ACC)
+    || 0/*master*/ == omp_get_thread_num()
+# endif
+    ) ? EXIT_SUCCESS : EXIT_FAILURE);
 #else
   int result = EXIT_SUCCESS;
 #endif
@@ -177,7 +181,6 @@ int c_dbcsr_acc_init(void)
       if (device_id < acc_opencl_ndevices) {
         if (NULL != env_device_vendor && '\0' != *env_device_vendor) {
           for (i = 0; i < (cl_uint)acc_opencl_ndevices;) {
-            buffer[0] = '\0';
             if (CL_SUCCESS == clGetDeviceInfo(acc_opencl_devices[i],
               CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
             {
@@ -216,7 +219,9 @@ int c_dbcsr_acc_init(void)
           }
         }
         if (EXIT_SUCCESS == result) {
+          const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE");
           cl_device_id active_device;
+          acc_opencl_options.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
           result = c_dbcsr_acc_opencl_set_active_device(device_id, &active_device);
 #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
           if (EXIT_SUCCESS == result) {
@@ -284,7 +289,11 @@ int c_dbcsr_acc_finalize(void)
 {
 #if defined(_OPENMP)
   /* initialization/finalization is not meant to be thread-safe */
-  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+  int result = ((0 == omp_in_parallel()
+# if /*WORKAROUND*/defined(__DBCSR_ACC)
+    || 0/*master*/ == omp_get_thread_num()
+# endif
+    ) ? EXIT_SUCCESS : EXIT_FAILURE);
 #else
   int result = EXIT_SUCCESS;
 #endif
@@ -325,7 +334,6 @@ void c_dbcsr_acc_clear_errors(void)
 int c_dbcsr_acc_get_ndevices(int* ndevices)
 {
   int result;
-
 #if defined(__DBCSR_ACC)
   /* DBCSR calls acc_get_ndevices before calling acc_init(). */
   result = c_dbcsr_acc_init();
@@ -375,7 +383,6 @@ int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char* vendor)
   char buffer[ACC_OPENCL_BUFFERSIZE];
   int result = EXIT_SUCCESS;
   assert(NULL != device && NULL != vendor);
-  buffer[0] = '\0';
   ACC_OPENCL_CHECK(clGetDeviceInfo(device,
     CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL),
     "retrieve device vendor", result);
@@ -477,8 +484,20 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
         ACC_OPENCL_CHECK(result, "create context", result);
       }
     }
-    if (NULL != device) {
-      *device = (EXIT_SUCCESS == result ? active_id : NULL);
+    if (EXIT_SUCCESS == result) {
+      if (NULL != device) *device = active_id;
+      if (0 != acc_opencl_options.verbosity) {
+        char buffer[ACC_OPENCL_BUFFERSIZE];
+        if (CL_SUCCESS == clGetDeviceInfo(active_id,
+          CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
+        {
+          fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\"\n",
+            acc_opencl_ndevices, device_id, buffer);
+        }
+      }
+    }
+    else {
+      if (NULL != device) *device = NULL;
     }
   }
   ACC_OPENCL_RETURN(result);
@@ -546,7 +565,7 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel,
 int c_dbcsr_acc_opencl_kernel(const char* source, const char* build_options,
   const char* kernel_name, cl_kernel* kernel)
 {
-  char buffer[ACC_OPENCL_BUFFERSIZE] = "\0";
+  char buffer[ACC_OPENCL_BUFFERSIZE] = "";
   cl_int result;
   assert(NULL != kernel);
   if (NULL != acc_opencl_context) {
diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
index 6c8f5214a14..9ad3954923c 100644
--- a/src/acc/opencl/acc_opencl.h
+++ b/src/acc/opencl/acc_opencl.h
@@ -86,8 +86,8 @@
 #if !defined(ACC_OPENCL_MEM_ASYNC) && 1
 # define ACC_OPENCL_MEM_ASYNC
 #endif
-#if !defined(ACC_OPENCL_VERBOSE) && 0
-# define ACC_OPENCL_VERBOSE
+#if !defined(ACC_OPENCL_DEBUG) && 0
+# define ACC_OPENCL_DEBUG
 #endif
 #if !defined(ACC_OPENCL_SVM) && 0
 # if defined(CL_VERSION_2_0)
@@ -189,9 +189,12 @@ extern "C" {
 
 /** Settings depending on OpenCL vendor or standard level (discovered/setup in acc_init). */
 typedef struct acc_opencl_options_t {
-  /** Asynchronous memory operations may crash for some OpenCL implementations. */
+  /** Asynchronous memory operations (may crash for some OpenCL implementations). */
   cl_bool async_memops;
+  /** Runtime SVM support (needs ACC_OPENCL_SVM at compile-time). */
   cl_bool svm_interop;
+  /** Runtime verbosity (output on stderr). */
+  cl_int verbosity;
 } acc_opencl_options_t;
 
 extern acc_opencl_options_t acc_opencl_options;
diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c
index fd59a69611d..0dd92e4bdc6 100644
--- a/src/acc/opencl/acc_opencl_event.c
+++ b/src/acc/opencl/acc_opencl_event.c
@@ -107,7 +107,7 @@ int c_dbcsr_acc_event_query(void* event, acc_bool_t* has_occurred)
   }
   assert(NULL != has_occurred);
   *has_occurred = (CL_COMPLETE == status || 0 > status);
-#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG)
   fprintf(stderr, "c_dbcsr_acc_event_query(%p, %i)\n", event, *has_occurred);
 #endif
   ACC_OPENCL_RETURN(result);
@@ -118,7 +118,7 @@ int c_dbcsr_acc_event_synchronize(void* event)
 { /* Waits on the host-side. */
   int result = EXIT_SUCCESS;
   assert(NULL != event);
-#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG)
   fprintf(stderr, "c_dbcsr_acc_event_synchronize(%p)\n", event);
 #endif
   ACC_OPENCL_CHECK(clWaitForEvents(1, ACC_OPENCL_EVENT(event)),
diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c
index be8825523b0..fe1fbccd5f2 100644
--- a/src/acc/opencl/acc_opencl_stream.c
+++ b/src/acc/opencl/acc_opencl_stream.c
@@ -165,7 +165,7 @@ int c_dbcsr_acc_stream_sync(void* stream)
 { /* Blocks the host-thread. */
   int result = EXIT_SUCCESS;
   assert(NULL != stream);
-#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG)
   fprintf(stderr, "c_dbcsr_acc_stream_sync(%p)\n", stream);
 #endif
   ACC_OPENCL_CHECK(clFinish(*ACC_OPENCL_STREAM(stream)),
@@ -178,7 +178,7 @@ int c_dbcsr_acc_stream_wait_event(void* stream, void* event)
 { /* Wait for an event (device-side). */
   int result = EXIT_SUCCESS;
   assert(NULL != stream && NULL != event);
-#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG)
   fprintf(stderr, "c_dbcsr_acc_stream_wait_event(%p, %p)\n", stream, event);
 #endif
 #if defined(ACC_OPENCL_STREAM_SYNCFLUSH)
diff --git a/src/acc/opencl/smm/README.md b/src/acc/opencl/smm/README.md
index 1536bd14c04..a3ff7a223d4 100644
--- a/src/acc/opencl/smm/README.md
+++ b/src/acc/opencl/smm/README.md
@@ -14,7 +14,16 @@ The `OPENCL_LIBSMM_DEBUG` compile-time setting enables side-by-side validation o
 
 ### Runtime Settings
 
-Runtime settings are made by the means of environment variables (implemented in `opencl_libsmm.c`). There are two categories (for the two major functions) like matrix transpose (`OPENCL_LIBSMM_TRANS_*`) and matrix multiplication (`OPENCL_LIBSMM_SMM_*`). For tranposing matrices:
+Runtime settings are made by the means of environment variables (implemented in `opencl_libsmm.c`). There are two categories (for the two major functions) like matrix transpose (`OPENCL_LIBSMM_TRANS_*`) and matrix multiplication (`OPENCL_LIBSMM_SMM_*`). Common settings are (see OpenCL backend documentation for more details):
+
+* `ACC_OPENCL_VENDOR`: character string matching the vendor of the OpenCL device in an case-insensitive fashion, e.g., "intel".
+* `ACC_OPENCL_DEVTYPE`: character string matching the device-kind like "cpu", "gpu", or another kind if neither CPU or GPU.
+* `ACC_OPENCL_DEVICE`: non-negative integer number to select a device from the (internally enumerated) list of devices.
+* `ACC_OPENCL_VERBOSE`: verbosity level (integer).
+    * `ACC_OPENCL_VERBOSE=1`: outputs (stderr) the number of devices found and the name of the selected device.
+    * `ACC_OPENCL_VERBOSE=2`: outputs (stderr) the duration needed to generate a requested kernel.
+
+For tranposing matrices:
 
 * `OPENCL_LIBSMM_TRANS_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler.
 * `OPENCL_LIBSMM_TRANS_INPLACE`: Boolean value (zero or non-zero integer) for inplace matrix transpose not relying on local memory.
@@ -28,13 +37,13 @@ For multiplying matrices:
 * `OPENCL_LIBSMM_SMM_BLOCK_M`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction.
 * `OPENCL_LIBSMM_SMM_BLOCK_N`: non-negative integer number (less/equal than the N-extent) denoting the blocksize in N-direction.
 
-**NOTE**: above runtime settings may be non-smooth in the sense of enabling a distinct code-path depending on a specific value, e.g., `OPENCL_LIBSMM_SMM_BATCHSIZE=1`.
+**NOTE**: LIBSMM's tunable runtime settings may be non-smooth in the sense of enabling a distinct code-path depending on a specific value, e.g., `OPENCL_LIBSMM_SMM_BATCHSIZE=1` vs. `OPENCL_LIBSMM_SMM_BATCHSIZE=2`.
 
 ## Auto Tuning
 
 Auto tuning code for performance is a practical way to find the "best" setting for parameterized code (e.g., GPU kernels). Introducing effective parameters is a prerequisite, and exploring the (potentially) high-dimensional parameter space in an efficient way is an art. It is desirable to have reasonable defaults even without auto-tuning the parameters. It would be even better to avoid auto-tuning if best performance was possible right away, i.e., if auto-tuning is not able to find better settings.
 
-For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages for instance the [acc_bench_smm](index.html) benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with subject being tuned. To build the benchmarks:
+For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages for instance the `acc_bench_smm` benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with subject being tuned. To build the benchmarks:
 
 ```bash
 cd src/acc/opencl
@@ -66,7 +75,7 @@ The OpenTuner script implements multiple objectives ("cost"), primarily "accurac
 [    67s]    INFO opentuner.search.plugin.DisplayPlugin: tests=53, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation
 ```
 
-The script finally writes a JSON-file with a filename like `tune_multiply-float-12x12x12-60gflops.json` which is encoding the benchmark (multiply), the precision (float), the kernel (12x12x12), and the achieved performance (60gflops). Tuninig starts from an internal default that is supposed to match LIBSMM's internal default parameters. However, tuning can be (re-)started with specific parameters (e.g., `-bs 64`, `-bm 13`, `-bn 1` for `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` respectively).
+The script finally writes a JSON-file with a filename like `tune_multiply-float-12x12x12-60gflops.json` which is encoding the benchmark (multiply), the precision (float), the kernel (12x12x12), and the achieved performance (60gflops). The script handles SIGINT (like Ctrl-C), and output is still written despite of not terminating normally (can abused to tune interactively). Tuninig starts from an internal default that is supposed to match LIBSMM's internal default parameters. However, tuning can be (re-)started with specific parameters (e.g., `-bs 64`, `-bm 13`, `-bn 1` for `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` respectively).
 
 ## Optimized Kernels
 
@@ -114,4 +123,4 @@ cd src/acc/opencl/smm
 ./tune_multiply.sh 300  8 1  4 10 15, 6 7 8, 23
 ```
 
-The script `tune_multiply.sh` is tuning 1444 kernels by default (`./acc_bench_smm 300 8 1` taking approximately 15 hours per part).
+The script `tune_multiply.sh` is tuning 1444 kernels by default (`./acc_bench_smm 300 8 1` taking approximately 15 hours per part). If the process is interrupted earlier (per SIGINT or Ctrl-C), the execution terminates for all requested kernels (triplet specification) unless an environment variable `CONTINUE=1` is set (proceeds to the next kernel).
diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c
index f7878d1b1fe..87b680ec6ec 100644
--- a/src/acc/opencl/smm/opencl_libsmm.c
+++ b/src/acc/opencl/smm/opencl_libsmm.c
@@ -128,8 +128,12 @@ int opencl_libsmm_read_params(char* parambuf,
 int libsmm_acc_init(void)
 {
 #if defined(_OPENMP)
-    /* initialization/finalization is not meant to be thread-safe */
-  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+  /* initialization/finalization is not meant to be thread-safe */
+  int result = ((0 == omp_in_parallel()
+# if /*WORKAROUND*/defined(__DBCSR_ACC)
+    || 0/*master*/ == omp_get_thread_num()
+# endif
+    ) ? EXIT_SUCCESS : EXIT_FAILURE);
 #else
   int result = EXIT_SUCCESS;
 #endif
@@ -204,9 +208,13 @@ int libsmm_acc_finalize(void)
    * However, libsmm_acc_finalize is indirectly called (acc_finalize) inside of a
    * parallel region (not just the master thread).
    */
-#if defined(_OPENMP) && /*WORKAROUND*/!defined(__DBCSR_ACC)
+#if defined(_OPENMP)
   /* initialization/finalization is not meant to be thread-safe */
-  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+  int result = ((0 == omp_in_parallel()
+# if /*WORKAROUND*/defined(__DBCSR_ACC)
+    || 0/*master*/ == omp_get_thread_num()
+# endif
+    ) ? EXIT_SUCCESS : EXIT_FAILURE);
 #else
   int result = EXIT_SUCCESS;
 #endif
@@ -252,6 +260,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
     ) &&
     0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim)
   {
+    const libxsmm_timer_tickint start = libxsmm_timer_tick();
     opencl_libsmm_trans_t* config;
     opencl_libsmm_transkey_t key;
     LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
@@ -314,6 +323,11 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
                   new_config.wgsize = (size_t)wgsize;
                   config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_REGISTER(&key, sizeof(key),
                     sizeof(new_config), &new_config);
+                  if (1 < acc_opencl_options.verbosity || 0 > acc_opencl_options.verbosity) {
+                    const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
+                    fprintf(stderr, "INFO ACC/OpenCL: %ix%i transpose-kernel generated in %.1f ms\n",
+                      m, n, 1000.0 * duration);
+                  }
                 }
                 else result = EXIT_FAILURE;
               }
@@ -452,6 +466,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
     0 < n_max && n_max <= max_kernel_dim &&
     0 < k_max && k_max <= max_kernel_dim)
   {
+    const libxsmm_timer_tickint start = libxsmm_timer_tick();
     opencl_libsmm_smm_t* config;
     opencl_libsmm_smmkey_t key;
     LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
@@ -572,7 +587,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
                   assert(0 < wgsize && 0 < max_wgsize);
                   /* check planned WG-size against kernel-specific WG-size */
                   if (wgsize <= max_wgsize) {
-                    if (NULL == config) {
+                    const int default_params = (NULL == config ? 1 : 0);
+                    if (default_params) {
                       config = (opencl_libsmm_smm_t*)OPENCL_LIBSMM_REGISTER(
                         &key, sizeof(key), sizeof(new_config), &new_config);
                     }
@@ -580,6 +596,11 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
                       config->wgsize = (size_t)wgsize;
                       config->bs = bs; config->bm = bm; config->bn = bn;
                       config->kernel = new_config.kernel;
+                      if (1 < acc_opencl_options.verbosity || 0 > acc_opencl_options.verbosity) {
+                        const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
+                        fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i %sSMM-kernel generated in %.1f ms\n",
+                          m_max, n_max, k_max, default_params ? "" : "tuned ", 1000.0 * duration);
+                      }
                     }
                     else { /* failed to register config */
                       result = EXIT_FAILURE;
diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py
index 5d871655797..06722d3f582 100755
--- a/src/acc/opencl/smm/tune_multiply.py
+++ b/src/acc/opencl/smm/tune_multiply.py
@@ -17,6 +17,7 @@
 from opentuner import MeasurementInterface
 from opentuner import IntegerParameter
 from opentuner import Result
+from signal import signal, SIGINT
 import json
 import glob
 import sys
@@ -60,29 +61,34 @@ def manipulator(self):
         manipulator.add_parameter(IntegerParameter("BS", 1, self.args.mb))
         manipulator.add_parameter(IntegerParameter("BM", 1, self.args.m))
         manipulator.add_parameter(IntegerParameter("BN", 1, self.args.n))
+        # register signal handler (CTRL-C)
+        signal(SIGINT, self.handle_sigint)
         return manipulator
 
     def seed_configurations(self):
         return [{"BS": self.args.bs, "BM": self.args.bm, "BN": self.args.bn}]
 
     def objective(self):
-        return opentuner.search.objective.MaximizeAccuracyMinimizeSize()
+        if not self.args.primary:
+            return opentuner.search.objective.MaximizeAccuracyMinimizeSize()
+        else:
+            return opentuner.search.objective.MaximizeAccuracy()
 
     def run(self, desired_result, input, limit):
         """
         Compile and run a given configuration then
         return performance
         """
-        cfg = desired_result.configuration.data
+        config = desired_result.configuration.data
         run_cmd = (
             "OMP_PROC_BIND=TRUE CHECK="
             + str(self.args.check)
             + " OPENCL_LIBSMM_SMM_BATCHSIZE="
-            + str(cfg["BS"])
+            + str(config["BS"])
             + " OPENCL_LIBSMM_SMM_BLOCK_M="
-            + str(cfg["BM"])
+            + str(config["BM"])
             + " OPENCL_LIBSMM_SMM_BLOCK_N="
-            + str(cfg["BN"])
+            + str(config["BN"])
             + " "
             + self.exepath
             + "/"
@@ -106,9 +112,12 @@ def run(self, desired_result, input, limit):
         if (match is not None) and match.group(1) and match.group(3):
             mseconds = float(match.group(1))
             gflops = float(match.group(3))
-            self.gflops = max(self.gflops, gflops)
+            if self.gflops < gflops:
+                # keep best configuration in case of an early exit
+                self.config = desired_result.configuration
+                self.gflops = gflops
             kernelreq = round(
-                (100.0 * cfg["BM"] * cfg["BN"]) / (self.args.m * self.args.n)
+                (100.0 * config["BM"] * config["BN"]) / (self.args.m * self.args.n)
             )
             # gflops are reported as "accuracy" (console output)
             return Result(time=mseconds, accuracy=gflops, size=kernelreq)
@@ -140,14 +149,15 @@ def save_final_config(self, configuration):
                 + ofilename
             )
             # extend result for easier reuse later
-            configuration.data["GFLOPS"] = self.gflops
-            configuration.data["TYPEID"] = self.typeid
-            configuration.data["M"] = self.args.m
-            configuration.data["N"] = self.args.n
-            configuration.data["K"] = self.args.k
-            # self.manipulator().save_to_file(configuration.data, ofilename)
+            config = configuration.data
+            config["GFLOPS"] = self.gflops
+            config["TYPEID"] = self.typeid
+            config["M"] = self.args.m
+            config["N"] = self.args.n
+            config["K"] = self.args.k
+            # self.manipulator().save_to_file(config, ofilename)
             with open(ofilename, "w") as ofile:
-                json.dump(configuration.data, ofile)
+                json.dump(config, ofile)
                 ofile.write("\n")  # append newline at EOF
             # merge all JSONs into a single CSV file
             if self.args.csvfile:
@@ -172,7 +182,7 @@ def save_final_config(self, configuration):
                                     ifilename = merged[key][-1]
                                     merged[key] = value
                                 print(
-                                    "Superfluous "
+                                    "Worse result "
                                     + ifilename
                                     + " ignored when merging CSV file"
                                 )
@@ -204,6 +214,20 @@ def save_final_config(self, configuration):
                         + self.args.csvfile
                     )
 
+    def handle_sigint(self, signum, frame):
+        """handles SIGINT or CTRL-C"""
+        print(
+            "\nWARNING: tuning "
+            + str(self.args.m)
+            + "x"
+            + str(self.args.n)
+            + "x"
+            + str(self.args.k)
+            + "-kernel was interrupted."
+        )
+        self.save_final_config(self.config)
+        exit(1)
+
 
 if __name__ == "__main__":
     argparser = opentuner.default_argparser()
@@ -279,4 +303,12 @@ def save_final_config(self, configuration):
         dest="check",
         help="Validate kernel (epsilon)",
     )
+    argparser.add_argument(
+        "-p",
+        "--primary-objective",
+        action="store_true",
+        default=False,
+        dest="primary",
+        help="Primary objective only",
+    )
     SmmTuner.main(argparser.parse_args())
diff --git a/src/acc/opencl/smm/tune_multiply.sh b/src/acc/opencl/smm/tune_multiply.sh
index 939b452af5d..0bc34130a40 100755
--- a/src/acc/opencl/smm/tune_multiply.sh
+++ b/src/acc/opencl/smm/tune_multiply.sh
@@ -92,7 +92,7 @@ if [ "${SED}" ] && [ "${LS}" ] && [ "${RM}" ] && [ "${WC}" ]; then
   fi
   NJSONS=$(${LS} -1 ./*.json 2>/dev/null | ${WC} -l)
   if [ "0" != "${NJSONS}" ]; then
-    echo "There are already ${NJSONS} (unrelated?) JSON-files found."
+    echo "Already found ${NJSONS} (unrelated?) JSON-files."
   fi
   SLEEP=$(command -v sleep)
   if [ "${DELAY}" ] && [ "${SLEEP}" ]; then
@@ -109,6 +109,17 @@ if [ "${SED}" ] && [ "${LS}" ] && [ "${RM}" ] && [ "${WC}" ]; then
       # avoid mixing database of previous results into new session
       ${RM} -rf "${HERE}/opentuner.db"
       eval "${HERE}/tune_multiply.py ${TRIPLET} --no-dups ${LIMIT}"
+      RESULT=$?
+      # environment var. CONTINUE allows to proceed with next kernel
+      # even if tune_multiply.py returned non-zero exit code
+      if [[ ("0" != "${RESULT}") && \
+            ("${CONTINUE}" = "" \
+          || "${CONTINUE}" = "0" \
+          || "${CONTINUE}" = "no" \
+          || "${CONTINUE}" = "false") ]];
+      then
+        exit ${RESULT}
+      fi
     fi
     N=$((N+1))
   done

From 69a0e850a1f09b77d03f9390ebbd83179303a03f Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 14:41:01 +0100
Subject: [PATCH 02/22] Fixed Makefile used to build
 acc_bench_trans/acc_bench_smm with CUDA (accommodate changes from #419).

---
 src/acc/cuda/Makefile | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile
index 3f938a1066a..292a48b4a58 100644
--- a/src/acc/cuda/Makefile
+++ b/src/acc/cuda/Makefile
@@ -1,6 +1,6 @@
 INCACC := $(wildcard *.h*) ../acc.h
 SRCACC := $(wildcard *.cpp)
-OBJACC := $(SRCACC:.cpp=.o) acc_cublas.o
+OBJACC := $(SRCACC:.cpp=.o)
 
 GPUSMM := $(wildcard ../libsmm_acc/kernels/*.h*)
 INCSMM := $(wildcard ../libsmm_acc/*.h*) ../acc_libsmm.h \
@@ -130,10 +130,7 @@ test: ../dbcsr_acc_test
 ../libsmm_acc/smm_acc_kernels.h: $(GPUSMM) Makefile ../libsmm_acc/generate_kernels.py ../libsmm_acc/parameters/parameters_$(WITH_GPU).json
 	@cd ../libsmm_acc && $(PYTHON) ../libsmm_acc/generate_kernels.py ../libsmm_acc/kernels
 
-acc_cublas.o: acc_cublas.cu Makefile
-	$(NVCC) $(addprefix -Xcompiler $(NULL),$(CXXFLAGS)) -c $< -o $@
-
-../dbcsr_acc.a: $(OBJACC) acc_cublas.o ../libsmm_acc/libsmm_acc_init.o
+../dbcsr_acc.a: $(OBJACC) ../libsmm_acc/libsmm_acc_init.o
 	$(AR) -rs $@ $^
 
 ../dbcsr_acc_smm.a: $(OBJSMM)
@@ -153,7 +150,7 @@ acc_bench_trans.o: ../acc_bench_trans.c Makefile
 	$(CXX) $^ $(LDFLAGS) -o $@
 
 dbcsr_acc_test.o: ../../../tests/dbcsr_acc_test.c Makefile
-	$(CC) $(CFLAGS) -c $< -o $@
+	$(CC) $(CFLAGS) -I../.. -c $< -o $@
 ../dbcsr_acc_test: dbcsr_acc_test.o ../dbcsr_acc_smm.a ../dbcsr_acc.a
 	$(CXX) $^ $(LDFLAGS) -o $@
 

From 3bf854c290bfa3e23d5891bbff93cedf537494fd Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 15:12:17 +0100
Subject: [PATCH 03/22] Disabled ACC_OPENCL_THREADLOCAL_CONTEXT since DBCSR
 calls init/finalize in a parallel region which makes this code ineffective.

---
 src/acc/opencl/acc_opencl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
index 9ad3954923c..cc15cb6340f 100644
--- a/src/acc/opencl/acc_opencl.h
+++ b/src/acc/opencl/acc_opencl.h
@@ -71,7 +71,7 @@
 # define ACC_OPENCL_EVENT(A) ((cl_event*)(A))
 #endif
 
-#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && 1
+#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && 0
 # define ACC_OPENCL_THREADLOCAL_CONTEXT
 #endif
 #if !defined(ACC_OPENCL_STREAM_PRIORITIES) && 1

From f14eee43b928e17b91d76fea515a4405e34721f6 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 15:40:08 +0100
Subject: [PATCH 04/22] Updated LIBXSMM prior to v1.17.

---
 .ci/daint.cscs.ch/ocl.build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
index 52c153ee929..b674f9044e3 100755
--- a/.ci/daint.cscs.ch/ocl.build.sh
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -24,7 +24,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
   git clone https://github.com/hfp/libxsmm.git
 fi
 cd "${HOME}/libxsmm"
-git checkout 02d6ab213a35d5fc2f6454c3b465598b0c086c17
+git checkout 05cab50ec6f11a86c15c0ed511c5a9066c613dfb
 make -j
 cd ..
 

From 9598017402808c8cc75e23c0bd1fb576d3163397 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 15:40:56 +0100
Subject: [PATCH 05/22] Attempt to runtime-test OpenCL BE/LIBSMM.

---
 .ci/daint.cscs.ch/Jenkinsfile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.ci/daint.cscs.ch/Jenkinsfile b/.ci/daint.cscs.ch/Jenkinsfile
index 73a0e486146..22ef331b6a8 100644
--- a/.ci/daint.cscs.ch/Jenkinsfile
+++ b/.ci/daint.cscs.ch/Jenkinsfile
@@ -66,11 +66,11 @@ pipeline {
                                 run_batch("0:15:00", "ocl", "build")
                             }
                         }
-//                        stage('test') {
-//                            steps {
-//                                run_batch("1:00:00", "ocl", "test")
-//                            }
-//                        }
+                        stage('test') {
+                            steps {
+                                run_batch("1:00:00", "ocl", "test")
+                            }
+                        }
                     }
                 }
                 stage("Intel") {

From 6721baa232da39b9dce3114f06cd1b498f3134ff Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 16:43:32 +0100
Subject: [PATCH 06/22] Reduced console output to potentially improve runtime
 of (CI-)tests.

---
 src/acc/libsmm_acc/libsmm_acc_benchmark.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp b/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp
index 9f8dade4cd9..e76d4e94a65 100644
--- a/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp
+++ b/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp
@@ -350,9 +350,12 @@ int libsmm_acc_benchmark(libsmm_acc_benchmark_t* h,
            best_gflops = gflops;
            best_kernel = ikern;
        }
-    } else {
+    }
+#if !defined(NDEBUG)
+    else {
        printf("%sOK %s\n", msg_prefix, descr);
     }
+#endif
  }
 
  if(h->mode == tune){
@@ -427,10 +430,12 @@ int libsmm_acc_benchmark_transpose_(int n_stack, int* stack, int* d_stack,
  if(sumGPU != sumCPU){
      printf("%sERROR %s checksum_diff: %g\n", msg_prefix, descr, sumGPU-sumCPU);
      error_counter++;
- } else {
+ }
+#if !defined(NDEBUG)
+ else {
      printf("%sOK %s\n", msg_prefix, descr);
  }
-
+#endif
  return error_counter;
 
 }

From ee12e07ebd18c50c77b479448563d8d161059e55 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 19:58:08 +0100
Subject: [PATCH 07/22] Increased timeout from 15m to 20m.

---
 .ci/daint.cscs.ch/cray.test.sh  | 2 +-
 .ci/daint.cscs.ch/gnu.test.sh   | 2 +-
 .ci/daint.cscs.ch/intel.test.sh | 2 +-
 .ci/daint.cscs.ch/ocl.test.sh   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.ci/daint.cscs.ch/cray.test.sh b/.ci/daint.cscs.ch/cray.test.sh
index ee5df55151b..f88544849bb 100755
--- a/.ci/daint.cscs.ch/cray.test.sh
+++ b/.ci/daint.cscs.ch/cray.test.sh
@@ -32,4 +32,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 env |& tee -a "${STAGE_NAME}.out"
 
 ulimit -s 256000
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/gnu.test.sh b/.ci/daint.cscs.ch/gnu.test.sh
index 2ed9c13b050..1f1d43316ec 100755
--- a/.ci/daint.cscs.ch/gnu.test.sh
+++ b/.ci/daint.cscs.ch/gnu.test.sh
@@ -32,4 +32,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 # document the current environment
 env |& tee -a "${STAGE_NAME}.out"
 
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/intel.test.sh b/.ci/daint.cscs.ch/intel.test.sh
index 0e3497a7cf0..b1ea3ccdcc3 100755
--- a/.ci/daint.cscs.ch/intel.test.sh
+++ b/.ci/daint.cscs.ch/intel.test.sh
@@ -32,4 +32,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 # document the current environment
 env |& tee -a "${STAGE_NAME}.out"
 
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/ocl.test.sh b/.ci/daint.cscs.ch/ocl.test.sh
index c7a180bd22e..045c5055ed1 100755
--- a/.ci/daint.cscs.ch/ocl.test.sh
+++ b/.ci/daint.cscs.ch/ocl.test.sh
@@ -33,4 +33,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 # document the current environment
 env |& tee -a "${STAGE_NAME}.out"
 
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"

From 8baf7ae4818ddf382534959cb32b150449f29328 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 20:19:22 +0100
Subject: [PATCH 08/22] Fetch all commits before referring to some SHA.

---
 .ci/daint.cscs.ch/ocl.build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
index b674f9044e3..a326a93886c 100755
--- a/.ci/daint.cscs.ch/ocl.build.sh
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -24,6 +24,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
   git clone https://github.com/hfp/libxsmm.git
 fi
 cd "${HOME}/libxsmm"
+git fetch
 git checkout 05cab50ec6f11a86c15c0ed511c5a9066c613dfb
 make -j
 cd ..

From 2b9335fc73dff8f0ca15ba74d59e575e6f9bb1a4 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Thu, 4 Feb 2021 21:38:16 +0100
Subject: [PATCH 09/22] Revert "Attempt to runtime-test OpenCL BE/LIBSMM."

This reverts commit 9598017402808c8cc75e23c0bd1fb576d3163397.
---
 .ci/daint.cscs.ch/Jenkinsfile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.ci/daint.cscs.ch/Jenkinsfile b/.ci/daint.cscs.ch/Jenkinsfile
index 22ef331b6a8..73a0e486146 100644
--- a/.ci/daint.cscs.ch/Jenkinsfile
+++ b/.ci/daint.cscs.ch/Jenkinsfile
@@ -66,11 +66,11 @@ pipeline {
                                 run_batch("0:15:00", "ocl", "build")
                             }
                         }
-                        stage('test') {
-                            steps {
-                                run_batch("1:00:00", "ocl", "test")
-                            }
-                        }
+//                        stage('test') {
+//                            steps {
+//                                run_batch("1:00:00", "ocl", "test")
+//                            }
+//                        }
                     }
                 }
                 stage("Intel") {

From 20f9d25dbdb29e92650c0edb601664c62bcab09a Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 08:42:19 +0100
Subject: [PATCH 10/22] Revised enabling ACC_OPENCL_THREADLOCAL_CONTEXT.

---
 src/acc/opencl/acc_opencl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
index cc15cb6340f..000013219b7 100644
--- a/src/acc/opencl/acc_opencl.h
+++ b/src/acc/opencl/acc_opencl.h
@@ -71,7 +71,7 @@
 # define ACC_OPENCL_EVENT(A) ((cl_event*)(A))
 #endif
 
-#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && 0
+#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && /*WORKAROUND*/!defined(__DBCSR_ACC)
 # define ACC_OPENCL_THREADLOCAL_CONTEXT
 #endif
 #if !defined(ACC_OPENCL_STREAM_PRIORITIES) && 1

From a26e779b5bb90d4f72f113a22323e7b41d2afd71 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 08:42:35 +0100
Subject: [PATCH 11/22] Repeated note about combining auto-tuned parameters for
 SP and DP in one application.

---
 src/acc/opencl/smm/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/acc/opencl/smm/README.md b/src/acc/opencl/smm/README.md
index a3ff7a223d4..90af6c979e4 100644
--- a/src/acc/opencl/smm/README.md
+++ b/src/acc/opencl/smm/README.md
@@ -43,7 +43,11 @@ For multiplying matrices:
 
 Auto tuning code for performance is a practical way to find the "best" setting for parameterized code (e.g., GPU kernels). Introducing effective parameters is a prerequisite, and exploring the (potentially) high-dimensional parameter space in an efficient way is an art. It is desirable to have reasonable defaults even without auto-tuning the parameters. It would be even better to avoid auto-tuning if best performance was possible right away, i.e., if auto-tuning is not able to find better settings.
 
-For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages for instance the `acc_bench_smm` benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with subject being tuned. To build the benchmarks:
+For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages the `acc_bench_smm` benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with the subject being tuned.
+
+**NOTE**: To toggle between tuning single-precision (SP) and double-precision (DP), the `ELEM_TYPE` in [acc_bench_smm.c](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_bench_smm.c#L22) can be edited. Auto-tuned parameters for SP and DP can both be embedded into the final application and are picked up correctly at runtime.
+
+To build the benchmarks:
 
 ```bash
 cd src/acc/opencl

From cb91474abc4429addd780e00c5a758772e7a3f7d Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 08:53:07 +0100
Subject: [PATCH 12/22] Only print device name if the device changed (and avoid
 duplicated verbose output).

---
 src/acc/opencl/acc_opencl.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index fa18c94f770..541edb9897e 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -481,23 +481,24 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
           acc_opencl_context = clCreateContext(0 != properties[0] ? properties : NULL,
             1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result);
         }
-        ACC_OPENCL_CHECK(result, "create context", result);
-      }
-    }
-    if (EXIT_SUCCESS == result) {
-      if (NULL != device) *device = active_id;
-      if (0 != acc_opencl_options.verbosity) {
-        char buffer[ACC_OPENCL_BUFFERSIZE];
-        if (CL_SUCCESS == clGetDeviceInfo(active_id,
-          CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
-        {
-          fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\"\n",
-            acc_opencl_ndevices, device_id, buffer);
+        if (EXIT_SUCCESS == result) {
+          if (0 != acc_opencl_options.verbosity) {
+            char buffer[ACC_OPENCL_BUFFERSIZE];
+            if (CL_SUCCESS == clGetDeviceInfo(active_id,
+              CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
+            {
+              fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\"\n",
+                acc_opencl_ndevices, device_id, buffer);
+            }
+          }
+        }
+        else {
+          ACC_OPENCL_ERROR("create context", result);
         }
       }
     }
-    else {
-      if (NULL != device) *device = NULL;
+    if (NULL != device) {
+      *device = (EXIT_SUCCESS == result ? active_id : NULL);
     }
   }
   ACC_OPENCL_RETURN(result);

From 9221d1b7737621f53d1ff0a65c275a5bf60d20d5 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 08:56:34 +0100
Subject: [PATCH 13/22] Removed tabs from source file (minor/unrelated change).

---
 src/acc/cuda/acc_cublas.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/acc/cuda/acc_cublas.cpp b/src/acc/cuda/acc_cublas.cpp
index 6a923e0d0f5..4d51611fab3 100644
--- a/src/acc/cuda/acc_cublas.cpp
+++ b/src/acc/cuda/acc_cublas.cpp
@@ -49,10 +49,10 @@ int acc_blas_dgemm(ACC_BLAS(Handle_t) *handle, char transa, char transb,
   ACC_BLAS_CALL(SetStream, (*handle, *stream));
 
   ACC_BLAS_CALL(Dgemm, (*handle, cTransa, cTransb,
-				    m, n, k,
-				    &alpha, &a_data[a_offset], lda,
-				    &b_data[ b_offset], ldb,
-				    &beta, &c_data[ c_offset], lda));
+                m, n, k,
+                &alpha, &a_data[a_offset], lda,
+                &b_data[ b_offset], ldb,
+                &beta, &c_data[ c_offset], lda));
 
   return(0);
 }

From 8ac6f1d38df5e043385c6d8038c2e49b3eb05c65 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 13:08:59 +0100
Subject: [PATCH 14/22] More prefixes in follow-up of #419 (c_dbcsr_).

---
 src/acc/opencl/acc_opencl.c        | 128 ++++++++++++++---------------
 src/acc/opencl/acc_opencl.h        |  12 +--
 src/acc/opencl/acc_opencl_event.c  |   2 +-
 src/acc/opencl/acc_opencl_mem.c    |  38 ++++-----
 src/acc/opencl/acc_opencl_stream.c |  10 +--
 src/acc/opencl/smm/opencl_libsmm.c |   4 +-
 6 files changed, 97 insertions(+), 97 deletions(-)

diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index 541edb9897e..ee75ebef04e 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -37,14 +37,14 @@
 extern "C" {
 #endif
 
-acc_opencl_options_t acc_opencl_options;
-int acc_opencl_ndevices;
-cl_device_id acc_opencl_devices[ACC_OPENCL_DEVICES_MAXCOUNT];
-cl_context acc_opencl_context;
+c_dbcsr_acc_opencl_options_t c_dbcsr_acc_opencl_options;
+int c_dbcsr_acc_opencl_ndevices;
+cl_device_id c_dbcsr_acc_opencl_devices[ACC_OPENCL_DEVICES_MAXCOUNT];
+cl_context c_dbcsr_acc_opencl_context;
 
 #if !defined(NDEBUG)
-void acc_opencl_notify(const char* /*errinfo*/, const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/);
-void acc_opencl_notify(const char* errinfo, const void* private_info, size_t cb, void* user_data)
+void c_dbcsr_acc_opencl_notify(const char* /*errinfo*/, const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/);
+void c_dbcsr_acc_opencl_notify(const char* errinfo, const void* private_info, size_t cb, void* user_data)
 {
   ACC_OPENCL_UNUSED(private_info); ACC_OPENCL_UNUSED(cb); ACC_OPENCL_UNUSED(user_data);
   fprintf(stderr, "ERROR ACC/OpenCL: %s\n", errinfo);
@@ -139,7 +139,7 @@ int c_dbcsr_acc_init(void)
 #else
   int result = EXIT_SUCCESS;
 #endif
-  if (0 == acc_opencl_ndevices) { /* avoid to initialize multiple times */
+  if (0 == c_dbcsr_acc_opencl_ndevices) { /* avoid to initialize multiple times */
     const char *const disable = getenv("ACC_OPENCL_DISABLE");
     if (NULL == disable || '0' == *disable) {
       cl_platform_id platforms[ACC_OPENCL_DEVICES_MAXCOUNT];
@@ -160,35 +160,35 @@ int c_dbcsr_acc_init(void)
         else if (NULL != c_dbcsr_acc_opencl_stristr(env_device_type, "cpu")) type = CL_DEVICE_TYPE_CPU;
         else type = CL_DEVICE_TYPE_ACCELERATOR;
       }
-      acc_opencl_ndevices = 0;
+      c_dbcsr_acc_opencl_ndevices = 0;
       for (i = 0; i < nplatforms; ++i) {
         if (EXIT_SUCCESS == result
           && CL_SUCCESS == clGetDeviceIDs(platforms[i], type, 0, NULL, &ndevices))
         {
-          const int n = (acc_opencl_ndevices + ndevices) < ACC_OPENCL_DEVICES_MAXCOUNT
-            ? (int)ndevices : (ACC_OPENCL_DEVICES_MAXCOUNT - acc_opencl_ndevices);
+          const int n = (c_dbcsr_acc_opencl_ndevices + ndevices) < ACC_OPENCL_DEVICES_MAXCOUNT
+            ? (int)ndevices : (ACC_OPENCL_DEVICES_MAXCOUNT - c_dbcsr_acc_opencl_ndevices);
           if (CL_SUCCESS == clGetDeviceIDs(platforms[i], type,
-            n, acc_opencl_devices + acc_opencl_ndevices, NULL))
+            n, c_dbcsr_acc_opencl_devices + c_dbcsr_acc_opencl_ndevices, NULL))
           {
-            acc_opencl_ndevices += n;
+            c_dbcsr_acc_opencl_ndevices += n;
           }
           else {
             ACC_OPENCL_ERROR("retrieve device ids", result);
           }
         }
       }
-      assert(NULL == acc_opencl_context);
-      if (device_id < acc_opencl_ndevices) {
+      assert(NULL == c_dbcsr_acc_opencl_context);
+      if (device_id < c_dbcsr_acc_opencl_ndevices) {
         if (NULL != env_device_vendor && '\0' != *env_device_vendor) {
-          for (i = 0; i < (cl_uint)acc_opencl_ndevices;) {
-            if (CL_SUCCESS == clGetDeviceInfo(acc_opencl_devices[i],
+          for (i = 0; i < (cl_uint)c_dbcsr_acc_opencl_ndevices;) {
+            if (CL_SUCCESS == clGetDeviceInfo(c_dbcsr_acc_opencl_devices[i],
               CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
             {
               if (NULL == c_dbcsr_acc_opencl_stristr(buffer, env_device_vendor)) {
-                --acc_opencl_ndevices;
-                if (i < (cl_uint)acc_opencl_ndevices) { /* keep relative order of IDs */
-                  memmove(acc_opencl_devices + i, acc_opencl_devices + i + 1,
-                    sizeof(cl_device_id) * (acc_opencl_ndevices - i));
+                --c_dbcsr_acc_opencl_ndevices;
+                if (i < (cl_uint)c_dbcsr_acc_opencl_ndevices) { /* keep relative order of IDs */
+                  memmove(c_dbcsr_acc_opencl_devices + i, c_dbcsr_acc_opencl_devices + i + 1,
+                    sizeof(cl_device_id) * (c_dbcsr_acc_opencl_ndevices - i));
                 }
               }
               else ++i;
@@ -200,15 +200,15 @@ int c_dbcsr_acc_init(void)
           }
         }
       }
-      if (device_id < acc_opencl_ndevices) {
-        if (EXIT_SUCCESS == result && 1 < acc_opencl_ndevices) {
-          /* reorder devices according to acc_opencl_order_devices */
-          qsort(acc_opencl_devices, acc_opencl_ndevices,
+      if (device_id < c_dbcsr_acc_opencl_ndevices) {
+        if (EXIT_SUCCESS == result && 1 < c_dbcsr_acc_opencl_ndevices) {
+          /* reorder devices according to c_dbcsr_acc_opencl_order_devices */
+          qsort(c_dbcsr_acc_opencl_devices, c_dbcsr_acc_opencl_ndevices,
             sizeof(cl_device_id), c_dbcsr_acc_opencl_order_devices);
           /* preselect default device */
           if (NULL == env_device_id || '\0' == *env_device_id) {
-            for (i = 0; i < (cl_uint)acc_opencl_ndevices; ++i) {
-              ACC_OPENCL_CHECK(clGetDeviceInfo(acc_opencl_devices[i],
+            for (i = 0; i < (cl_uint)c_dbcsr_acc_opencl_ndevices; ++i) {
+              ACC_OPENCL_CHECK(clGetDeviceInfo(c_dbcsr_acc_opencl_devices[i],
                 CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL),
                 "retrieve device type", result);
               if (CL_DEVICE_TYPE_DEFAULT & type) {
@@ -221,19 +221,19 @@ int c_dbcsr_acc_init(void)
         if (EXIT_SUCCESS == result) {
           const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE");
           cl_device_id active_device;
-          acc_opencl_options.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
+          c_dbcsr_acc_opencl_options.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
           result = c_dbcsr_acc_opencl_set_active_device(device_id, &active_device);
 #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
           if (EXIT_SUCCESS == result) {
-            const cl_context context = acc_opencl_context;
+            const cl_context context = c_dbcsr_acc_opencl_context;
 #           pragma omp parallel
-            if (context != acc_opencl_context) {
+            if (context != c_dbcsr_acc_opencl_context) {
               if (CL_SUCCESS == clRetainContext(context)) {
-                acc_opencl_context = context;
+                c_dbcsr_acc_opencl_context = context;
               }
               else {
                 ACC_OPENCL_ERROR("retain context", result);
-                acc_opencl_context = NULL;
+                c_dbcsr_acc_opencl_context = NULL;
               }
             }
           }
@@ -243,32 +243,32 @@ int c_dbcsr_acc_init(void)
             const char *const env = getenv("ACC_OPENCL_ASYNC_MEMOPS");
             if (NULL == env) {
               const int confirmation = c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia");
-              acc_opencl_options.async_memops = (EXIT_SUCCESS != confirmation);
+              c_dbcsr_acc_opencl_options.async_memops = (EXIT_SUCCESS != confirmation);
             }
-            else acc_opencl_options.async_memops = (0 != atoi(env));
+            else c_dbcsr_acc_opencl_options.async_memops = (0 != atoi(env));
           }
           else
 #endif
-          acc_opencl_options.async_memops = CL_FALSE;
+          c_dbcsr_acc_opencl_options.async_memops = CL_FALSE;
 #if defined(ACC_OPENCL_SVM)
           if (EXIT_SUCCESS == result) {
             const char *const env = getenv("ACC_OPENCL_SVM");
             int level_major = 0;
-            acc_opencl_options.svm_interop = (NULL == env || 0 != atoi(env)) &&
-              (EXIT_SUCCESS == acc_opencl_device_level(active_device,
+            c_dbcsr_acc_opencl_options.svm_interop = (NULL == env || 0 != atoi(env)) &&
+              (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_level(active_device,
                 &level_major, NULL/*level_minor*/) && 2 <= level_major);
           }
           else
 #endif
-          acc_opencl_options.svm_interop = CL_FALSE;
+          c_dbcsr_acc_opencl_options.svm_interop = CL_FALSE;
         }
       }
       else { /* mark as initialized */
-        acc_opencl_ndevices = -1;
+        c_dbcsr_acc_opencl_ndevices = -1;
       }
     }
     else { /* mark as initialized */
-      acc_opencl_ndevices = -1;
+      c_dbcsr_acc_opencl_ndevices = -1;
     }
 #if defined(__DBCSR_ACC)
     /* DBCSR shall call acc_init as well as libsmm_acc_init (since both interfaces are used).
@@ -297,20 +297,20 @@ int c_dbcsr_acc_finalize(void)
 #else
   int result = EXIT_SUCCESS;
 #endif
-  if (NULL != acc_opencl_context) {
-    const cl_context context = acc_opencl_context;
-    assert(0 < acc_opencl_ndevices);
+  if (NULL != c_dbcsr_acc_opencl_context) {
+    const cl_context context = c_dbcsr_acc_opencl_context;
+    assert(0 < c_dbcsr_acc_opencl_ndevices);
 #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
 #   pragma omp parallel
-    if (context != acc_opencl_context) {
-      ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context),
+    if (context != c_dbcsr_acc_opencl_context) {
+      ACC_OPENCL_CHECK(clReleaseContext(c_dbcsr_acc_opencl_context),
         "release context", result);
-      acc_opencl_context = NULL;
+      c_dbcsr_acc_opencl_context = NULL;
     }
 #endif
     ACC_OPENCL_CHECK(clReleaseContext(context),
       "release context", result);
-    acc_opencl_context = NULL;
+    c_dbcsr_acc_opencl_context = NULL;
 #if defined(__DBCSR_ACC)
     /* DBCSR may call acc_init() as well as libsmm_acc_init() since both interface are used.
      * libsmm_acc_init may privately call acc_init (as it depends on the ACC interface).
@@ -340,8 +340,8 @@ int c_dbcsr_acc_get_ndevices(int* ndevices)
   if (EXIT_SUCCESS == result)
 #endif
   {
-    if (NULL != ndevices && 0 != acc_opencl_ndevices) {
-      *ndevices = (0 < acc_opencl_ndevices ? acc_opencl_ndevices : 0);
+    if (NULL != ndevices && 0 != c_dbcsr_acc_opencl_ndevices) {
+      *ndevices = (0 < c_dbcsr_acc_opencl_ndevices ? c_dbcsr_acc_opencl_ndevices : 0);
       result = EXIT_SUCCESS;
     }
     else {
@@ -360,13 +360,13 @@ int c_dbcsr_acc_opencl_device(void* stream, cl_device_id* device)
     ACC_OPENCL_CHECK(clGetCommandQueueInfo(*ACC_OPENCL_STREAM(stream), CL_QUEUE_DEVICE,
       sizeof(cl_device_id), device, NULL), "retrieve device from queue", result);
   }
-  else if (NULL != acc_opencl_context) {
+  else if (NULL != c_dbcsr_acc_opencl_context) {
 #if !defined(NDEBUG)
     size_t n = sizeof(cl_device_id);
-    ACC_OPENCL_CHECK(clGetContextInfo(acc_opencl_context, CL_CONTEXT_DEVICES,
+    ACC_OPENCL_CHECK(clGetContextInfo(c_dbcsr_acc_opencl_context, CL_CONTEXT_DEVICES,
       sizeof(cl_device_id), device, &n), "retrieve id of active device", result);
 #else
-    ACC_OPENCL_CHECK(clGetContextInfo(acc_opencl_context, CL_CONTEXT_DEVICES,
+    ACC_OPENCL_CHECK(clGetContextInfo(c_dbcsr_acc_opencl_context, CL_CONTEXT_DEVICES,
       sizeof(cl_device_id), device, NULL), "retrieve id of active device", result);
 #endif
     assert(EXIT_SUCCESS != result || sizeof(cl_device_id) == n/*single-device context*/);
@@ -449,16 +449,16 @@ int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char *const extname
 
 int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
 {
-  cl_int result = (((0 <= device_id && device_id < acc_opencl_ndevices) ||
+  cl_int result = (((0 <= device_id && device_id < c_dbcsr_acc_opencl_ndevices) ||
     /* allow successful completion if no device was found */
-    0 > acc_opencl_ndevices) ? EXIT_SUCCESS : EXIT_FAILURE);
-  if (0 < acc_opencl_ndevices) {
-    const cl_device_id active_id = acc_opencl_devices[device_id];
+    0 > c_dbcsr_acc_opencl_ndevices) ? EXIT_SUCCESS : EXIT_FAILURE);
+  if (0 < c_dbcsr_acc_opencl_ndevices) {
+    const cl_device_id active_id = c_dbcsr_acc_opencl_devices[device_id];
     cl_device_id current_id = NULL;
     if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &current_id);
     if (EXIT_SUCCESS == result && active_id != current_id) {
-      if (NULL != acc_opencl_context) {
-        ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context),
+      if (NULL != c_dbcsr_acc_opencl_context) {
+        ACC_OPENCL_CHECK(clReleaseContext(c_dbcsr_acc_opencl_context),
           "release context", result);
       }
       if (EXIT_SUCCESS == result) {
@@ -470,25 +470,25 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
 #if defined(NDEBUG)
         void (*const notify)(const char*, const void*, size_t, void*) = NULL;
 #else
-        void (*const notify)(const char*, const void*, size_t, void*) = acc_opencl_notify;
+        void (*const notify)(const char*, const void*, size_t, void*) = c_dbcsr_acc_opencl_notify;
 #endif
-        acc_opencl_context = clCreateContext(properties,
+        c_dbcsr_acc_opencl_context = clCreateContext(properties,
           1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result);
         if (CL_INVALID_VALUE == result) { /* retry */
           const size_t n = sizeof(properties) / sizeof(*properties);
           assert(3 <= n);
           properties[n-3] = 0;
-          acc_opencl_context = clCreateContext(0 != properties[0] ? properties : NULL,
+          c_dbcsr_acc_opencl_context = clCreateContext(0 != properties[0] ? properties : NULL,
             1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result);
         }
         if (EXIT_SUCCESS == result) {
-          if (0 != acc_opencl_options.verbosity) {
+          if (0 != c_dbcsr_acc_opencl_options.verbosity) {
             char buffer[ACC_OPENCL_BUFFERSIZE];
             if (CL_SUCCESS == clGetDeviceInfo(active_id,
               CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
             {
               fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\"\n",
-                acc_opencl_ndevices, device_id, buffer);
+                c_dbcsr_acc_opencl_ndevices, device_id, buffer);
             }
           }
         }
@@ -569,9 +569,9 @@ int c_dbcsr_acc_opencl_kernel(const char* source, const char* build_options,
   char buffer[ACC_OPENCL_BUFFERSIZE] = "";
   cl_int result;
   assert(NULL != kernel);
-  if (NULL != acc_opencl_context) {
+  if (NULL != c_dbcsr_acc_opencl_context) {
     const cl_program program = clCreateProgramWithSource(
-      acc_opencl_context, 1/*nlines*/, &source, NULL, &result);
+      c_dbcsr_acc_opencl_context, 1/*nlines*/, &source, NULL, &result);
     if (NULL != program) {
       cl_device_id active_id = NULL;
       assert(CL_SUCCESS == result);
diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
index 000013219b7..c38445f63a0 100644
--- a/src/acc/opencl/acc_opencl.h
+++ b/src/acc/opencl/acc_opencl.h
@@ -188,23 +188,23 @@ extern "C" {
 #endif
 
 /** Settings depending on OpenCL vendor or standard level (discovered/setup in acc_init). */
-typedef struct acc_opencl_options_t {
+typedef struct c_dbcsr_acc_opencl_options_t {
   /** Asynchronous memory operations (may crash for some OpenCL implementations). */
   cl_bool async_memops;
   /** Runtime SVM support (needs ACC_OPENCL_SVM at compile-time). */
   cl_bool svm_interop;
   /** Runtime verbosity (output on stderr). */
   cl_int verbosity;
-} acc_opencl_options_t;
+} c_dbcsr_acc_opencl_options_t;
 
-extern acc_opencl_options_t acc_opencl_options;
+extern c_dbcsr_acc_opencl_options_t c_dbcsr_acc_opencl_options;
 
 /* non-zero if library is initialized, zero devices is signaled by nagative value */
-extern int acc_opencl_ndevices;
+extern int c_dbcsr_acc_opencl_ndevices;
 /* allow a context per each OpenMP thread */
-extern cl_context acc_opencl_context;
+extern cl_context c_dbcsr_acc_opencl_context;
 #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
-# pragma omp threadprivate(acc_opencl_context)
+# pragma omp threadprivate(c_dbcsr_acc_opencl_context)
 #endif
 
 typedef struct c_dbcsr_acc_opencl_info_hostptr_t {
diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c
index 0dd92e4bdc6..5792b40acde 100644
--- a/src/acc/opencl/acc_opencl_event.c
+++ b/src/acc/opencl/acc_opencl_event.c
@@ -29,7 +29,7 @@ extern "C" {
 int c_dbcsr_acc_event_create(void** event_p)
 {
   cl_int result = EXIT_SUCCESS;
-  const cl_event event = clCreateUserEvent(acc_opencl_context, &result);
+  const cl_event event = clCreateUserEvent(c_dbcsr_acc_opencl_context, &result);
   assert(NULL != event_p);
   if (NULL != event) {
     cl_int status = CL_COMPLETE;
diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c
index 17c904121bc..5e77c41bdb4 100644
--- a/src/acc/opencl/acc_opencl_mem.c
+++ b/src/acc/opencl/acc_opencl_mem.c
@@ -63,7 +63,7 @@ c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory)
 void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory)
 {
   void* result = NULL;
-  assert(acc_opencl_options.svm_interop);
+  assert(c_dbcsr_acc_opencl_options.svm_interop);
   if (NULL != memory && CL_SUCCESS != clGetMemObjectInfo(memory, CL_MEM_HOST_PTR, sizeof(void*), &result, NULL)) {
     assert(NULL == result);
   }
@@ -79,15 +79,15 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
   const size_t size = nbytes + alignment + size_meminfo - 1;
   const cl_mem buffer = (
 #if defined(ACC_OPENCL_SVM)
-    acc_opencl_options.svm_interop ? clCreateBuffer(acc_opencl_context, CL_MEM_USE_HOST_PTR, size,
-      clSVMAlloc(acc_opencl_context, CL_MEM_READ_WRITE, size, sizeof(void*)/*minimal alignment*/), &result) :
+    c_dbcsr_acc_opencl_options.svm_interop ? clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_USE_HOST_PTR, size,
+      clSVMAlloc(c_dbcsr_acc_opencl_context, CL_MEM_READ_WRITE, size, sizeof(void*)/*minimal alignment*/), &result) :
 #endif
-    clCreateBuffer(acc_opencl_context, CL_MEM_ALLOC_HOST_PTR, size, NULL/*host_ptr*/, &result));
+    clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_ALLOC_HOST_PTR, size, NULL/*host_ptr*/, &result));
   assert(NULL != host_mem && NULL != stream);
   if (NULL != buffer) {
     const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
     const uintptr_t address = (uintptr_t)clEnqueueMapBuffer(queue, buffer,
-      !acc_opencl_options.async_memops, CL_MAP_READ | CL_MAP_WRITE,
+      !c_dbcsr_acc_opencl_options.async_memops, CL_MAP_READ | CL_MAP_WRITE,
       0/*offset*/, size, 0, NULL, NULL, &result);
     if (0 != address) {
       const uintptr_t aligned = ACC_OPENCL_UP2(address + size_meminfo, alignment);
@@ -145,7 +145,7 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream)
       ACC_OPENCL_CHECK(clReleaseMemObject(info.buffer),
         "release host memory buffer", result);
 #if defined(ACC_OPENCL_SVM)
-      if (acc_opencl_options.svm_interop) clSVMFree(acc_opencl_context, info.mapped);
+      if (c_dbcsr_acc_opencl_options.svm_interop) clSVMFree(c_dbcsr_acc_opencl_context, info.mapped);
 #endif
     }
   }
@@ -158,10 +158,10 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes)
   cl_int result;
   const cl_mem buffer = (
 #if defined(ACC_OPENCL_SVM)
-    acc_opencl_options.svm_interop ? clCreateBuffer(acc_opencl_context, CL_MEM_USE_HOST_PTR, nbytes,
-      clSVMAlloc(acc_opencl_context, CL_MEM_READ_WRITE, nbytes, 0/*default alignment*/), &result) :
+    c_dbcsr_acc_opencl_options.svm_interop ? clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_USE_HOST_PTR, nbytes,
+      clSVMAlloc(c_dbcsr_acc_opencl_context, CL_MEM_READ_WRITE, nbytes, 0/*default alignment*/), &result) :
 #endif
-    clCreateBuffer(acc_opencl_context, CL_MEM_READ_WRITE, nbytes, NULL/*host_ptr*/, &result));
+    clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_READ_WRITE, nbytes, NULL/*host_ptr*/, &result));
   assert(NULL != dev_mem);
   if (NULL != buffer) {
 #if defined(ACC_OPENCL_MEM_NOALLOC)
@@ -175,12 +175,12 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes)
     }
     else {
 #if defined(ACC_OPENCL_SVM)
-      void *const ptr = (acc_opencl_options.svm_interop
+      void *const ptr = (c_dbcsr_acc_opencl_options.svm_interop
         ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL);
 #endif
       clReleaseMemObject(buffer);
 #if defined(ACC_OPENCL_SVM)
-      /*if (NULL != ptr)*/ clSVMFree(acc_opencl_context, ptr);
+      /*if (NULL != ptr)*/ clSVMFree(c_dbcsr_acc_opencl_context, ptr);
 #endif
       result = EXIT_FAILURE;
     }
@@ -201,7 +201,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem)
   if (NULL != dev_mem) {
     const cl_mem buffer = *ACC_OPENCL_MEM(dev_mem);
 #if defined(ACC_OPENCL_SVM)
-    void *const ptr = (acc_opencl_options.svm_interop
+    void *const ptr = (c_dbcsr_acc_opencl_options.svm_interop
       ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL);
 #endif
     ACC_OPENCL_CHECK(clReleaseMemObject(buffer),
@@ -212,7 +212,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem)
     free(dev_mem);
 #endif
 #if defined(ACC_OPENCL_SVM)
-    /*if (NULL != ptr)*/ clSVMFree(acc_opencl_context, ptr);
+    /*if (NULL != ptr)*/ clSVMFree(c_dbcsr_acc_opencl_context, ptr);
 #endif
   }
   ACC_OPENCL_RETURN(result);
@@ -238,7 +238,7 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
   assert((NULL != host_mem || 0 == nbytes) && (NULL != dev_mem || 0 == nbytes) && NULL != stream);
   if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) {
     ACC_OPENCL_CHECK(clEnqueueWriteBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem),
-      !acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL),
+      !c_dbcsr_acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL),
       "enqueue h2d copy", result);
   }
   ACC_OPENCL_RETURN(result);
@@ -251,7 +251,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v
   assert((NULL != dev_mem || 0 == nbytes) && (NULL != host_mem || 0 == nbytes) && NULL != stream);
   if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) {
     ACC_OPENCL_CHECK(clEnqueueReadBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem),
-      !acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL),
+      !c_dbcsr_acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL),
       "enqueue d2h copy", result);
   }
   ACC_OPENCL_RETURN(result);
@@ -333,9 +333,9 @@ int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t
     cl_ulong cl_size_total = 0;
     ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE,
       sizeof(cl_ulong), &cl_size_total, NULL), "retrieve amount of device memory", result);
-    assert(0 < acc_opencl_ndevices);
-    size_total /= acc_opencl_ndevices;
-    size_free  /= acc_opencl_ndevices;
+    assert(0 < c_dbcsr_acc_opencl_ndevices);
+    size_total /= c_dbcsr_acc_opencl_ndevices;
+    size_free  /= c_dbcsr_acc_opencl_ndevices;
     if (EXIT_SUCCESS == result) {
       if (cl_size_total < size_total) size_total = cl_size_total;
       if (size_total < size_free) size_free = size_total;
@@ -353,7 +353,7 @@ int c_dbcsr_acc_dev_mem_info(size_t* mem_free, size_t* mem_total)
 {
   int result = EXIT_SUCCESS;
   cl_device_id active_id = NULL;
-  if (NULL != acc_opencl_context) {
+  if (NULL != c_dbcsr_acc_opencl_context) {
     result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id);
   }
   if (EXIT_SUCCESS == result) {
diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c
index fe1fbccd5f2..98f9c1bb84e 100644
--- a/src/acc/opencl/acc_opencl_stream.c
+++ b/src/acc/opencl/acc_opencl_stream.c
@@ -41,11 +41,11 @@ int c_dbcsr_acc_opencl_stream_create(cl_command_queue* stream_p, const char* nam
 {
   cl_int result = EXIT_SUCCESS;
   assert(NULL != stream_p);
-  if (NULL != acc_opencl_context) {
+  if (NULL != c_dbcsr_acc_opencl_context) {
     cl_device_id device_id = NULL;
     result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &device_id);
     if (EXIT_SUCCESS == result) {
-      *stream_p = ACC_OPENCL_CREATE_COMMAND_QUEUE(acc_opencl_context, device_id, properties, &result);
+      *stream_p = ACC_OPENCL_CREATE_COMMAND_QUEUE(c_dbcsr_acc_opencl_context, device_id, properties, &result);
     }
     else {
       ACC_OPENCL_ERROR("create command queue", result);
@@ -58,7 +58,7 @@ int c_dbcsr_acc_opencl_stream_create(cl_command_queue* stream_p, const char* nam
 int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority)
 {
   cl_int result = EXIT_SUCCESS;
-  if (NULL != acc_opencl_context) {
+  if (NULL != c_dbcsr_acc_opencl_context) {
     cl_command_queue queue = NULL;
 #if !defined(ACC_OPENCL_STREAM_PRIORITIES) || !defined(CL_QUEUE_PRIORITY_KHR)
     ACC_OPENCL_UNUSED(priority);
@@ -124,12 +124,12 @@ int c_dbcsr_acc_stream_destroy(void* stream)
 int c_dbcsr_acc_stream_priority_range(int* least, int* greatest)
 {
   int result = ((NULL != least || NULL != greatest) ? EXIT_SUCCESS : EXIT_FAILURE);
-  if (NULL != acc_opencl_context) {
+  if (NULL != c_dbcsr_acc_opencl_context) {
 #if defined(ACC_OPENCL_STREAM_PRIORITIES) && defined(CL_QUEUE_PRIORITY_KHR)
     char buffer[ACC_OPENCL_BUFFERSIZE];
     cl_platform_id platform = NULL;
     cl_device_id active_id = NULL;
-    assert(0 < acc_opencl_ndevices);
+    assert(0 < c_dbcsr_acc_opencl_ndevices);
     if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id);
     ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM,
       sizeof(cl_platform_id), &platform, NULL),
diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c
index 87b680ec6ec..2fe1ab87487 100644
--- a/src/acc/opencl/smm/opencl_libsmm.c
+++ b/src/acc/opencl/smm/opencl_libsmm.c
@@ -323,7 +323,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
                   new_config.wgsize = (size_t)wgsize;
                   config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_REGISTER(&key, sizeof(key),
                     sizeof(new_config), &new_config);
-                  if (1 < acc_opencl_options.verbosity || 0 > acc_opencl_options.verbosity) {
+                  if (1 < c_dbcsr_acc_opencl_options.verbosity || 0 > c_dbcsr_acc_opencl_options.verbosity) {
                     const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
                     fprintf(stderr, "INFO ACC/OpenCL: %ix%i transpose-kernel generated in %.1f ms\n",
                       m, n, 1000.0 * duration);
@@ -596,7 +596,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
                       config->wgsize = (size_t)wgsize;
                       config->bs = bs; config->bm = bm; config->bn = bn;
                       config->kernel = new_config.kernel;
-                      if (1 < acc_opencl_options.verbosity || 0 > acc_opencl_options.verbosity) {
+                      if (1 < c_dbcsr_acc_opencl_options.verbosity || 0 > c_dbcsr_acc_opencl_options.verbosity) {
                         const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
                         fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i %sSMM-kernel generated in %.1f ms\n",
                           m_max, n_max, k_max, default_params ? "" : "tuned ", 1000.0 * duration);

From b58a37b5c678ff994d328fbadb4aeaf4819584b9 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 15:33:40 +0100
Subject: [PATCH 15/22] Supply platform when forming context.

---
 src/acc/opencl/acc_opencl.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index ee75ebef04e..3154d04225c 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -456,22 +456,28 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
     const cl_device_id active_id = c_dbcsr_acc_opencl_devices[device_id];
     cl_device_id current_id = NULL;
     if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &current_id);
-    if (EXIT_SUCCESS == result && active_id != current_id) {
+    if (active_id != current_id) {
+      cl_platform_id platform = NULL;
+      ACC_OPENCL_CHECK(clGetDeviceInfo(active_id,
+        CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL),
+        "query device platform", result);
       if (NULL != c_dbcsr_acc_opencl_context) {
         ACC_OPENCL_CHECK(clReleaseContext(c_dbcsr_acc_opencl_context),
           "release context", result);
       }
       if (EXIT_SUCCESS == result) {
-        cl_context_properties properties[] = {
-          /* insert other properties in front of below property */
-          CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE, /* TODO */
-          0 /* end of properties */
-        };
 #if defined(NDEBUG)
         void (*const notify)(const char*, const void*, size_t, void*) = NULL;
 #else
         void (*const notify)(const char*, const void*, size_t, void*) = c_dbcsr_acc_opencl_notify;
 #endif
+        cl_context_properties properties[] = {
+          CL_CONTEXT_PLATFORM, 0/*placeholder*/,
+          /* insert other properties in front of below property */
+          CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE, /* TODO */
+          0 /* end of properties */
+        };
+        properties[1] = (long)platform;
         c_dbcsr_acc_opencl_context = clCreateContext(properties,
           1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result);
         if (CL_INVALID_VALUE == result) { /* retry */

From 6f3c910b772b90a626c6662ddf3d8899f57e0c65 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 15:51:52 +0100
Subject: [PATCH 16/22] Code cleanup.

---
 src/acc/opencl/acc_opencl.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index 3154d04225c..026dcb73040 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -383,8 +383,8 @@ int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char* vendor)
   char buffer[ACC_OPENCL_BUFFERSIZE];
   int result = EXIT_SUCCESS;
   assert(NULL != device && NULL != vendor);
-  ACC_OPENCL_CHECK(clGetDeviceInfo(device,
-    CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL),
+  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_VENDOR,
+    ACC_OPENCL_BUFFERSIZE, buffer, NULL),
     "retrieve device vendor", result);
   if (EXIT_SUCCESS == result) {
     return (NULL != c_dbcsr_acc_opencl_stristr(buffer, vendor)
@@ -458,8 +458,8 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
     if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &current_id);
     if (active_id != current_id) {
       cl_platform_id platform = NULL;
-      ACC_OPENCL_CHECK(clGetDeviceInfo(active_id,
-        CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL),
+      ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM,
+        sizeof(cl_platform_id), &platform, NULL),
         "query device platform", result);
       if (NULL != c_dbcsr_acc_opencl_context) {
         ACC_OPENCL_CHECK(clReleaseContext(c_dbcsr_acc_opencl_context),
@@ -490,8 +490,8 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
         if (EXIT_SUCCESS == result) {
           if (0 != c_dbcsr_acc_opencl_options.verbosity) {
             char buffer[ACC_OPENCL_BUFFERSIZE];
-            if (CL_SUCCESS == clGetDeviceInfo(active_id,
-              CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
+            if (CL_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_NAME,
+              ACC_OPENCL_BUFFERSIZE, buffer, NULL))
             {
               fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\"\n",
                 c_dbcsr_acc_opencl_ndevices, device_id, buffer);
@@ -546,7 +546,8 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel,
     if (NULL != max_value) {
       size_t value = 0;
       ACC_OPENCL_CHECK(clGetDeviceInfo(device,
-        CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value, NULL),
+        CL_DEVICE_MAX_WORK_GROUP_SIZE,
+        sizeof(size_t), &value, NULL),
         "query maximum WG-size of device", result);
       assert(value <= INT_MAX);
       *max_value = (int)value;

From b5cb129fe418873144b1e5a044b7d7624effe252 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 16:19:20 +0100
Subject: [PATCH 17/22] Try to avoid MPS issue (temporarily) testing with only
 one rank. Sync up with #428.

---
 .ci/daint.cscs.ch/cray.build.sh  | 4 ----
 .ci/daint.cscs.ch/cray.test.sh   | 5 ++---
 .ci/daint.cscs.ch/gnu.build.sh   | 4 ----
 .ci/daint.cscs.ch/gnu.test.sh    | 5 ++---
 .ci/daint.cscs.ch/intel.build.sh | 4 ----
 .ci/daint.cscs.ch/intel.test.sh  | 5 ++---
 .ci/daint.cscs.ch/ocl.build.sh   | 4 ----
 .ci/daint.cscs.ch/ocl.test.sh    | 9 +++------
 8 files changed, 9 insertions(+), 31 deletions(-)

diff --git a/.ci/daint.cscs.ch/cray.build.sh b/.ci/daint.cscs.ch/cray.build.sh
index 82016b666a0..c1647f321a9 100755
--- a/.ci/daint.cscs.ch/cray.build.sh
+++ b/.ci/daint.cscs.ch/cray.build.sh
@@ -1,13 +1,9 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/cray.test.sh b/.ci/daint.cscs.ch/cray.test.sh
index f88544849bb..f98212e1e06 100755
--- a/.ci/daint.cscs.ch/cray.test.sh
+++ b/.ci/daint.cscs.ch/cray.test.sh
@@ -1,13 +1,12 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="gpu"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
@@ -32,4 +31,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 env |& tee -a "${STAGE_NAME}.out"
 
 ulimit -s 256000
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/gnu.build.sh b/.ci/daint.cscs.ch/gnu.build.sh
index 76984955c40..8673d7f35a1 100755
--- a/.ci/daint.cscs.ch/gnu.build.sh
+++ b/.ci/daint.cscs.ch/gnu.build.sh
@@ -1,13 +1,9 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/gnu.test.sh b/.ci/daint.cscs.ch/gnu.test.sh
index 1f1d43316ec..49f7aba7d63 100755
--- a/.ci/daint.cscs.ch/gnu.test.sh
+++ b/.ci/daint.cscs.ch/gnu.test.sh
@@ -1,13 +1,12 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="gpu"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
@@ -32,4 +31,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 # document the current environment
 env |& tee -a "${STAGE_NAME}.out"
 
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/intel.build.sh b/.ci/daint.cscs.ch/intel.build.sh
index 4a92733d87d..e333596c89f 100755
--- a/.ci/daint.cscs.ch/intel.build.sh
+++ b/.ci/daint.cscs.ch/intel.build.sh
@@ -1,13 +1,9 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/intel.test.sh b/.ci/daint.cscs.ch/intel.test.sh
index b1ea3ccdcc3..b4f211382d6 100755
--- a/.ci/daint.cscs.ch/intel.test.sh
+++ b/.ci/daint.cscs.ch/intel.test.sh
@@ -1,13 +1,12 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="gpu"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=4
 #SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
@@ -32,4 +31,4 @@ export OMP_PROC_BIND=TRUE # set thread affinity
 # document the current environment
 env |& tee -a "${STAGE_NAME}.out"
 
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
index a326a93886c..ce7f4ade7a3 100755
--- a/.ci/daint.cscs.ch/ocl.build.sh
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -1,13 +1,9 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/ocl.test.sh b/.ci/daint.cscs.ch/ocl.test.sh
index 045c5055ed1..c371ec71925 100755
--- a/.ci/daint.cscs.ch/ocl.test.sh
+++ b/.ci/daint.cscs.ch/ocl.test.sh
@@ -1,13 +1,11 @@
 #!/bin/bash -l
 
 #SBATCH --export=ALL
-#SBATCH --exclusive
 #SBATCH --constraint="gpu"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=3
-#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
+#SBATCH --ntasks-per-node=1
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
@@ -26,11 +24,10 @@ mkdir -p "${SCRATCH}/${BUILD_TAG}.ocl"
 chmod 0775 "${SCRATCH}/${BUILD_TAG}.ocl"
 cd "${SCRATCH}/${BUILD_TAG}.ocl"
 
-export CRAY_CUDA_MPS=1 # enable the CUDA proxy for MPI+CUDA
 export OMP_PROC_BIND=TRUE # set thread affinity
 # OMP_NUM_THREADS is set by cmake
 
 # document the current environment
 env |& tee -a "${STAGE_NAME}.out"
 
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 1200" |& tee -a "${STAGE_NAME}.out"
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"

From 367a117e156100f5a62fa8dd57564b4f4601c354 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 16:35:03 +0100
Subject: [PATCH 18/22] Enabled OpenCL based runtime tests.

---
 .ci/daint.cscs.ch/Jenkinsfile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.ci/daint.cscs.ch/Jenkinsfile b/.ci/daint.cscs.ch/Jenkinsfile
index 73a0e486146..22ef331b6a8 100644
--- a/.ci/daint.cscs.ch/Jenkinsfile
+++ b/.ci/daint.cscs.ch/Jenkinsfile
@@ -66,11 +66,11 @@ pipeline {
                                 run_batch("0:15:00", "ocl", "build")
                             }
                         }
-//                        stage('test') {
-//                            steps {
-//                                run_batch("1:00:00", "ocl", "test")
-//                            }
-//                        }
+                        stage('test') {
+                            steps {
+                                run_batch("1:00:00", "ocl", "test")
+                            }
+                        }
                     }
                 }
                 stage("Intel") {

From 6c9f84cee53815b5c2cb60a17250e44d5e3a9a88 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 19:41:40 +0100
Subject: [PATCH 19/22] Fixed CI-scripts.

---
 .ci/daint.cscs.ch/cray.build.sh  | 3 +++
 .ci/daint.cscs.ch/gnu.build.sh   | 3 +++
 .ci/daint.cscs.ch/intel.build.sh | 3 +++
 .ci/daint.cscs.ch/ocl.build.sh   | 2 ++
 4 files changed, 11 insertions(+)

diff --git a/.ci/daint.cscs.ch/cray.build.sh b/.ci/daint.cscs.ch/cray.build.sh
index c1647f321a9..bbc3d113988 100755
--- a/.ci/daint.cscs.ch/cray.build.sh
+++ b/.ci/daint.cscs.ch/cray.build.sh
@@ -4,6 +4,9 @@
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=3
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/gnu.build.sh b/.ci/daint.cscs.ch/gnu.build.sh
index 8673d7f35a1..4683e776836 100755
--- a/.ci/daint.cscs.ch/gnu.build.sh
+++ b/.ci/daint.cscs.ch/gnu.build.sh
@@ -4,6 +4,9 @@
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=3
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/intel.build.sh b/.ci/daint.cscs.ch/intel.build.sh
index e333596c89f..4b33b585269 100755
--- a/.ci/daint.cscs.ch/intel.build.sh
+++ b/.ci/daint.cscs.ch/intel.build.sh
@@ -4,6 +4,9 @@
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=3
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset
diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
index ce7f4ade7a3..e542cbba52c 100755
--- a/.ci/daint.cscs.ch/ocl.build.sh
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -4,6 +4,8 @@
 #SBATCH --constraint="mc"
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --hint=nomultithread
 
 set -o errexit
 set -o nounset

From 9ff03d94c5662dc68f1b64ab3c29a48fa24acf7a Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Fri, 5 Feb 2021 19:54:32 +0100
Subject: [PATCH 20/22] Fixed another variable which was left unbound
 (CI-script).

---
 .ci/daint.cscs.ch/ocl.build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
index e542cbba52c..6f333ece9e7 100755
--- a/.ci/daint.cscs.ch/ocl.build.sh
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -5,6 +5,7 @@
 #SBATCH --partition="cscsci"
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
 #SBATCH --hint=nomultithread
 
 set -o errexit

From d17f03f6283ab9828ab4fb66cd0844ad821d95b2 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Mon, 8 Feb 2021 09:07:50 +0100
Subject: [PATCH 21/22] Incorporated #428.

---
 cmake/CompilerConfiguration.cmake | 2 +-
 src/CMakeLists.txt                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/CompilerConfiguration.cmake b/cmake/CompilerConfiguration.cmake
index 351ea8d53b7..af29b4dd0a2 100644
--- a/cmake/CompilerConfiguration.cmake
+++ b/cmake/CompilerConfiguration.cmake
@@ -41,7 +41,7 @@ Please open an issue at https://github.com/cp2k/dbcsr/issues with the reported c
 endif ()
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  set(CMAKE_CXX_FLAGS_RELEASE      "-O3 -g -funroll-loops -Wall")
+  set(CMAKE_CXX_FLAGS_RELEASE      "-O3 -g -funroll-loops -Wall -Werror")
   set(CMAKE_CXX_FLAGS_COVERAGE     "-O0 -g --coverage -Wall -Werror")
   set(CMAKE_CXX_FLAGS_DEBUG        "-O2 -ggdb -Wall -Werror -fsanitize=undefined -fsanitize=address -fsanitize-recover=all")
   if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")))
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 351373e29fa..763df46897f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -233,8 +233,8 @@ if (USE_ACCEL)
     dbcsr
     PRIVATE $<$<STREQUAL:${USE_ACCEL},cuda>:CUDA::cudart>
             $<$<STREQUAL:${USE_ACCEL},cuda>:CUDA::cublas>
-            $<$<STREQUAL:${USE_ACCEL},cuda>:CUDA::nvToolsExt>
             $<$<STREQUAL:${USE_ACCEL},cuda>:CUDA::nvrtc>
+            $<$<BOOL:${WITH_CUDA_PROFILING}>:CUDA::nvToolsExt>
             $<$<STREQUAL:${USE_ACCEL},hip>:roc::hipblas>
             $<$<STREQUAL:${USE_ACCEL},hip>:hip::host>
             $<$<STREQUAL:${USE_ACCEL},opencl>:OpenCL::OpenCL>)

From 2e57682f24a759b02e553b8be3f1e125973fbe77 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hans.pabst@intel.com>
Date: Mon, 8 Feb 2021 10:55:01 +0100
Subject: [PATCH 22/22] Warn about potentially exclusive device-mode.

---
 src/acc/opencl/acc_opencl.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index 026dcb73040..5de7768aafb 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -499,6 +499,13 @@ int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device)
           }
         }
         else {
+          if (CL_INVALID_DEVICE == result) {
+            if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia")) {
+              fprintf(stderr,
+                "WARNING ACC/OpenCL: if MPI-ranks target the same device in exclusive mode,\n"
+                "                    SMI must enable sharing the device.\n");
+            }
+          }
           ACC_OPENCL_ERROR("create context", result);
         }
       }