diff --git a/.ci/daint.cscs.ch/Jenkinsfile b/.ci/daint.cscs.ch/Jenkinsfile index 73a0e486146..22ef331b6a8 100644 --- a/.ci/daint.cscs.ch/Jenkinsfile +++ b/.ci/daint.cscs.ch/Jenkinsfile @@ -66,11 +66,11 @@ pipeline { run_batch("0:15:00", "ocl", "build") } } -// stage('test') { -// steps { -// run_batch("1:00:00", "ocl", "test") -// } -// } + stage('test') { + steps { + run_batch("1:00:00", "ocl", "test") + } + } } } stage("Intel") { diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh index c846815ff1f..6f333ece9e7 100755 --- a/.ci/daint.cscs.ch/ocl.build.sh +++ b/.ci/daint.cscs.ch/ocl.build.sh @@ -4,8 +4,8 @@ #SBATCH --constraint="mc" #SBATCH --partition="cscsci" #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=3 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=12 #SBATCH --hint=nomultithread set -o errexit @@ -23,7 +23,8 @@ if [ ! -d "${HOME}/libxsmm" ]; then git clone https://github.com/hfp/libxsmm.git fi cd "${HOME}/libxsmm" -git checkout 02d6ab213a35d5fc2f6454c3b465598b0c086c17 +git fetch +git checkout 05cab50ec6f11a86c15c0ed511c5a9066c613dfb make -j cd .. diff --git a/.ci/daint.cscs.ch/ocl.test.sh b/.ci/daint.cscs.ch/ocl.test.sh index 797e2d60dca..c371ec71925 100755 --- a/.ci/daint.cscs.ch/ocl.test.sh +++ b/.ci/daint.cscs.ch/ocl.test.sh @@ -4,8 +4,7 @@ #SBATCH --constraint="gpu" #SBATCH --partition="cscsci" #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=3 +#SBATCH --ntasks-per-node=1 #SBATCH --hint=nomultithread set -o errexit @@ -21,10 +20,10 @@ set -o xtrace # do not set earlier to avoid noise from module umask 0002 # make sure group members can access the data -mkdir --mode=0775 -p "${SCRATCH}/${BUILD_TAG}.ocl" +mkdir -p "${SCRATCH}/${BUILD_TAG}.ocl" +chmod 0775 "${SCRATCH}/${BUILD_TAG}.ocl" cd "${SCRATCH}/${BUILD_TAG}.ocl" -export CRAY_CUDA_MPS=1 # enable the CUDA proxy for MPI+CUDA export OMP_PROC_BIND=TRUE # set thread affinity # OMP_NUM_THREADS is set by cmake diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c index 560f6b59e5e..793bac0c2f9 100644 --- a/src/acc/acc_bench_smm.c +++ b/src/acc/acc_bench_smm.c @@ -106,6 +106,8 @@ int main(int argc, char* argv[]) printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k, nc, na, nb); CHECK(c_dbcsr_acc_init(), &result); + /* note: libsmm_acc_init() may imply acc_init() */ + CHECK(libsmm_acc_init(), &result); CHECK(c_dbcsr_acc_get_ndevices(&ndevices), &result); if (0 < ndevices) { #if defined(_DEBUG) @@ -113,8 +115,9 @@ int main(int argc, char* argv[]) #endif } else { -#if defined(_DEBUG) - fprintf(stderr, "Error: no device found!\n"); + fprintf(stderr, "No ACC-device found!\n"); +#if !defined(__CUDA) + CHECK(libsmm_acc_finalize(), NULL); #endif CHECK(c_dbcsr_acc_finalize(), NULL); return result; @@ -165,14 +168,14 @@ int main(int argc, char* argv[]) CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result); } -#if defined(USE_LIBXSMM) +# if defined(USE_LIBXSMM) CHECK(c_dbcsr_acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); -#endif +# endif /* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */ CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result); -#if defined(USE_LIBXSMM) +# if defined(USE_LIBXSMM) CHECK(c_dbcsr_acc_stream_sync(stream), &result); transpose = libxsmm_timer_duration(start, libxsmm_timer_tick()); # endif @@ -282,6 +285,9 @@ int main(int argc, char* argv[]) CHECK(c_dbcsr_acc_dev_mem_deallocate(bmat_dev), NULL); CHECK(c_dbcsr_acc_dev_mem_deallocate(cmat_dev), NULL); CHECK(c_dbcsr_acc_stream_destroy(stream), NULL); +#if !defined(__CUDA) + CHECK(libsmm_acc_finalize(), NULL); +#endif CHECK(c_dbcsr_acc_finalize(), NULL); if (EXIT_SUCCESS != result) { fprintf(stderr, "FAILED\n"); diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c index b10bb0b3d82..ccac1c8d292 100644 --- a/src/acc/acc_bench_trans.c +++ b/src/acc/acc_bench_trans.c @@ -91,6 +91,8 @@ int main(int argc, char* argv[]) assert(m <= (mn / n) && 0 == (mn % n)); printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n); CHECK(c_dbcsr_acc_init(), &result); + /* note: libsmm_acc_init() may imply acc_init() */ + CHECK(libsmm_acc_init(), &result); CHECK(c_dbcsr_acc_get_ndevices(&ndevices), &result); if (0 < ndevices) { #if defined(_DEBUG) @@ -98,8 +100,9 @@ int main(int argc, char* argv[]) #endif } else { -#if defined(_DEBUG) - fprintf(stderr, "Error: no device found!\n"); + fprintf(stderr, "No ACC-device found!\n"); +#if !defined(__CUDA) + CHECK(libsmm_acc_finalize(), NULL); #endif CHECK(c_dbcsr_acc_finalize(), NULL); return result; @@ -210,6 +213,9 @@ int main(int argc, char* argv[]) CHECK(c_dbcsr_acc_dev_mem_deallocate(stack_dev), NULL); CHECK(c_dbcsr_acc_dev_mem_deallocate(mat_dev), NULL); CHECK(c_dbcsr_acc_stream_destroy(stream), NULL); +#if !defined(__CUDA) + CHECK(libsmm_acc_finalize(), NULL); +#endif CHECK(c_dbcsr_acc_finalize(), NULL); if (EXIT_SUCCESS != result) { fprintf(stderr, "FAILED\n"); diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile index 3f938a1066a..292a48b4a58 100644 --- a/src/acc/cuda/Makefile +++ b/src/acc/cuda/Makefile @@ -1,6 +1,6 @@ INCACC := $(wildcard *.h*) ../acc.h SRCACC := $(wildcard *.cpp) -OBJACC := $(SRCACC:.cpp=.o) acc_cublas.o +OBJACC := $(SRCACC:.cpp=.o) GPUSMM := $(wildcard ../libsmm_acc/kernels/*.h*) INCSMM := $(wildcard ../libsmm_acc/*.h*) ../acc_libsmm.h \ @@ -130,10 +130,7 @@ test: ../dbcsr_acc_test ../libsmm_acc/smm_acc_kernels.h: $(GPUSMM) Makefile ../libsmm_acc/generate_kernels.py ../libsmm_acc/parameters/parameters_$(WITH_GPU).json @cd ../libsmm_acc && $(PYTHON) ../libsmm_acc/generate_kernels.py ../libsmm_acc/kernels -acc_cublas.o: acc_cublas.cu Makefile - $(NVCC) $(addprefix -Xcompiler $(NULL),$(CXXFLAGS)) -c $< -o $@ - -../dbcsr_acc.a: $(OBJACC) acc_cublas.o ../libsmm_acc/libsmm_acc_init.o +../dbcsr_acc.a: $(OBJACC) ../libsmm_acc/libsmm_acc_init.o $(AR) -rs $@ $^ ../dbcsr_acc_smm.a: $(OBJSMM) @@ -153,7 +150,7 @@ acc_bench_trans.o: ../acc_bench_trans.c Makefile $(CXX) $^ $(LDFLAGS) -o $@ dbcsr_acc_test.o: ../../../tests/dbcsr_acc_test.c Makefile - $(CC) $(CFLAGS) -c $< -o $@ + $(CC) $(CFLAGS) -I../.. -c $< -o $@ ../dbcsr_acc_test: dbcsr_acc_test.o ../dbcsr_acc_smm.a ../dbcsr_acc.a $(CXX) $^ $(LDFLAGS) -o $@ diff --git a/src/acc/cuda/acc_cublas.cpp b/src/acc/cuda/acc_cublas.cpp index 6a923e0d0f5..4d51611fab3 100644 --- a/src/acc/cuda/acc_cublas.cpp +++ b/src/acc/cuda/acc_cublas.cpp @@ -49,10 +49,10 @@ int acc_blas_dgemm(ACC_BLAS(Handle_t) *handle, char transa, char transb, ACC_BLAS_CALL(SetStream, (*handle, *stream)); ACC_BLAS_CALL(Dgemm, (*handle, cTransa, cTransb, - m, n, k, - &alpha, &a_data[a_offset], lda, - &b_data[ b_offset], ldb, - &beta, &c_data[ c_offset], lda)); + m, n, k, + &alpha, &a_data[a_offset], lda, + &b_data[ b_offset], ldb, + &beta, &c_data[ c_offset], lda)); return(0); } diff --git a/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp b/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp index 9f8dade4cd9..e76d4e94a65 100644 --- a/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp +++ b/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp @@ -350,9 +350,12 @@ int libsmm_acc_benchmark(libsmm_acc_benchmark_t* h, best_gflops = gflops; best_kernel = ikern; } - } else { + } +#if !defined(NDEBUG) + else { printf("%sOK %s\n", msg_prefix, descr); } +#endif } if(h->mode == tune){ @@ -427,10 +430,12 @@ int libsmm_acc_benchmark_transpose_(int n_stack, int* stack, int* d_stack, if(sumGPU != sumCPU){ printf("%sERROR %s checksum_diff: %g\n", msg_prefix, descr, sumGPU-sumCPU); error_counter++; - } else { + } +#if !defined(NDEBUG) + else { printf("%sOK %s\n", msg_prefix, descr); } - +#endif return error_counter; } diff --git a/src/acc/opencl/README.md b/src/acc/opencl/README.md index 47791410edc..9148c0b9a25 100644 --- a/src/acc/opencl/README.md +++ b/src/acc/opencl/README.md @@ -8,7 +8,7 @@ The OpenCL backend implements the [ACC interface](https://github.com/cp2k/dbcsr/ ### Compile-time Settings -Compile-time settings are (implicitly) documented and can be adjusted by editing [acc_opencl.h](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/acc_opencl.h) (adjusting the build-line as per `-D` is possible as well but less convenient). For example, `ACC_OPENCL_STREAM_PRIORITIES` is enabled by default (and further confirmed at runtime/build-time) but can be disabled, or `ACC_OPENCL_VERBOSE` (which is disabled by default) can be enabled for debug purpose. More sensitive/private compile-time settings may be available within particular translation units like in `acc_opencl_mem.c`. +Compile-time settings are (implicitly) documented and can be adjusted by editing [acc_opencl.h](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/acc_opencl.h) (adjusting the build-line as per `-D` is possible as well but less convenient). For example, `ACC_OPENCL_STREAM_PRIORITIES` is enabled by default (and further confirmed at runtime/build-time) but can be disabled, or `ACC_OPENCL_DEBUG` (which is disabled by default) can be enabled for debug purpose. More sensitive/private compile-time settings may be available within particular translation units like in `acc_opencl_mem.c`. An application of compile-time settings (and perhaps a valuable contribution) might be to call a GPU library in OpenCL-based LIBSMM. In such case, Shared Virtual Memory support (SVM) in OpenCL comes handy and can be enabled per `ACC_OPENCL_SVM`. The latter allows then to simply take the raw pointer out of an `cl_mem` object, and pass it into such library/function (which in turn can work across language borders, etc.). @@ -19,6 +19,9 @@ Runtime settings are made by the means of environment variables (implemented in * `ACC_OPENCL_VENDOR`: character string matching the vendor of the OpenCL device in an case-insensitive fashion, e.g., "intel". * `ACC_OPENCL_DEVTYPE`: character string matching the device-kind like "cpu", "gpu", or another kind if neither CPU or GPU. * `ACC_OPENCL_DEVICE`: non-negative integer number to select a device from the (internally enumerated) list of devices. +* `ACC_OPENCL_VERBOSE`: verbosity level (integer). + * `ACC_OPENCL_VERBOSE=1`: outputs (stderr) the number of devices found and the name of the selected device. + * `ACC_OPENCL_VERBOSE=2`: outputs (stderr) the duration needed to generate a requested kernel. The OpenCL backend enumerates and orders devices primarily by device-kind (GPU, CPU, and others in that order) and by memory capacity (secondary criterion). Device IDs are zero-based as per ACC interface (and less than what is permitted/returned by `acc_get_ndevices`). diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 88553be672f..5de7768aafb 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -37,14 +37,14 @@ extern "C" { #endif -acc_opencl_options_t acc_opencl_options; -int acc_opencl_ndevices; -cl_device_id acc_opencl_devices[ACC_OPENCL_DEVICES_MAXCOUNT]; -cl_context acc_opencl_context; +c_dbcsr_acc_opencl_options_t c_dbcsr_acc_opencl_options; +int c_dbcsr_acc_opencl_ndevices; +cl_device_id c_dbcsr_acc_opencl_devices[ACC_OPENCL_DEVICES_MAXCOUNT]; +cl_context c_dbcsr_acc_opencl_context; #if !defined(NDEBUG) -void acc_opencl_notify(const char* /*errinfo*/, const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/); -void acc_opencl_notify(const char* errinfo, const void* private_info, size_t cb, void* user_data) +void c_dbcsr_acc_opencl_notify(const char* /*errinfo*/, const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/); +void c_dbcsr_acc_opencl_notify(const char* errinfo, const void* private_info, size_t cb, void* user_data) { ACC_OPENCL_UNUSED(private_info); ACC_OPENCL_UNUSED(cb); ACC_OPENCL_UNUSED(user_data); fprintf(stderr, "ERROR ACC/OpenCL: %s\n", errinfo); @@ -131,11 +131,15 @@ int c_dbcsr_acc_init(void) { #if defined(_OPENMP) /* initialization/finalization is not meant to be thread-safe */ - int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE); + int result = ((0 == omp_in_parallel() +# if /*WORKAROUND*/defined(__DBCSR_ACC) + || 0/*master*/ == omp_get_thread_num() +# endif + ) ? EXIT_SUCCESS : EXIT_FAILURE); #else int result = EXIT_SUCCESS; #endif - if (0 == acc_opencl_ndevices) { /* avoid to initialize multiple times */ + if (0 == c_dbcsr_acc_opencl_ndevices) { /* avoid to initialize multiple times */ const char *const disable = getenv("ACC_OPENCL_DISABLE"); if (NULL == disable || '0' == *disable) { cl_platform_id platforms[ACC_OPENCL_DEVICES_MAXCOUNT]; @@ -156,36 +160,35 @@ int c_dbcsr_acc_init(void) else if (NULL != c_dbcsr_acc_opencl_stristr(env_device_type, "cpu")) type = CL_DEVICE_TYPE_CPU; else type = CL_DEVICE_TYPE_ACCELERATOR; } - acc_opencl_ndevices = 0; + c_dbcsr_acc_opencl_ndevices = 0; for (i = 0; i < nplatforms; ++i) { if (EXIT_SUCCESS == result && CL_SUCCESS == clGetDeviceIDs(platforms[i], type, 0, NULL, &ndevices)) { - const int n = (acc_opencl_ndevices + ndevices) < ACC_OPENCL_DEVICES_MAXCOUNT - ? (int)ndevices : (ACC_OPENCL_DEVICES_MAXCOUNT - acc_opencl_ndevices); + const int n = (c_dbcsr_acc_opencl_ndevices + ndevices) < ACC_OPENCL_DEVICES_MAXCOUNT + ? (int)ndevices : (ACC_OPENCL_DEVICES_MAXCOUNT - c_dbcsr_acc_opencl_ndevices); if (CL_SUCCESS == clGetDeviceIDs(platforms[i], type, - n, acc_opencl_devices + acc_opencl_ndevices, NULL)) + n, c_dbcsr_acc_opencl_devices + c_dbcsr_acc_opencl_ndevices, NULL)) { - acc_opencl_ndevices += n; + c_dbcsr_acc_opencl_ndevices += n; } else { ACC_OPENCL_ERROR("retrieve device ids", result); } } } - assert(NULL == acc_opencl_context); - if (device_id < acc_opencl_ndevices) { + assert(NULL == c_dbcsr_acc_opencl_context); + if (device_id < c_dbcsr_acc_opencl_ndevices) { if (NULL != env_device_vendor && '\0' != *env_device_vendor) { - for (i = 0; i < (cl_uint)acc_opencl_ndevices;) { - buffer[0] = '\0'; - if (CL_SUCCESS == clGetDeviceInfo(acc_opencl_devices[i], + for (i = 0; i < (cl_uint)c_dbcsr_acc_opencl_ndevices;) { + if (CL_SUCCESS == clGetDeviceInfo(c_dbcsr_acc_opencl_devices[i], CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL)) { if (NULL == c_dbcsr_acc_opencl_stristr(buffer, env_device_vendor)) { - --acc_opencl_ndevices; - if (i < (cl_uint)acc_opencl_ndevices) { /* keep relative order of IDs */ - memmove(acc_opencl_devices + i, acc_opencl_devices + i + 1, - sizeof(cl_device_id) * (acc_opencl_ndevices - i)); + --c_dbcsr_acc_opencl_ndevices; + if (i < (cl_uint)c_dbcsr_acc_opencl_ndevices) { /* keep relative order of IDs */ + memmove(c_dbcsr_acc_opencl_devices + i, c_dbcsr_acc_opencl_devices + i + 1, + sizeof(cl_device_id) * (c_dbcsr_acc_opencl_ndevices - i)); } } else ++i; @@ -197,15 +200,15 @@ int c_dbcsr_acc_init(void) } } } - if (device_id < acc_opencl_ndevices) { - if (EXIT_SUCCESS == result && 1 < acc_opencl_ndevices) { - /* reorder devices according to acc_opencl_order_devices */ - qsort(acc_opencl_devices, acc_opencl_ndevices, + if (device_id < c_dbcsr_acc_opencl_ndevices) { + if (EXIT_SUCCESS == result && 1 < c_dbcsr_acc_opencl_ndevices) { + /* reorder devices according to c_dbcsr_acc_opencl_order_devices */ + qsort(c_dbcsr_acc_opencl_devices, c_dbcsr_acc_opencl_ndevices, sizeof(cl_device_id), c_dbcsr_acc_opencl_order_devices); /* preselect default device */ if (NULL == env_device_id || '\0' == *env_device_id) { - for (i = 0; i < (cl_uint)acc_opencl_ndevices; ++i) { - ACC_OPENCL_CHECK(clGetDeviceInfo(acc_opencl_devices[i], + for (i = 0; i < (cl_uint)c_dbcsr_acc_opencl_ndevices; ++i) { + ACC_OPENCL_CHECK(clGetDeviceInfo(c_dbcsr_acc_opencl_devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL), "retrieve device type", result); if (CL_DEVICE_TYPE_DEFAULT & type) { @@ -216,19 +219,21 @@ int c_dbcsr_acc_init(void) } } if (EXIT_SUCCESS == result) { + const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE"); cl_device_id active_device; + c_dbcsr_acc_opencl_options.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose)); result = c_dbcsr_acc_opencl_set_active_device(device_id, &active_device); #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT) if (EXIT_SUCCESS == result) { - const cl_context context = acc_opencl_context; + const cl_context context = c_dbcsr_acc_opencl_context; # pragma omp parallel - if (context != acc_opencl_context) { + if (context != c_dbcsr_acc_opencl_context) { if (CL_SUCCESS == clRetainContext(context)) { - acc_opencl_context = context; + c_dbcsr_acc_opencl_context = context; } else { ACC_OPENCL_ERROR("retain context", result); - acc_opencl_context = NULL; + c_dbcsr_acc_opencl_context = NULL; } } } @@ -238,32 +243,32 @@ int c_dbcsr_acc_init(void) const char *const env = getenv("ACC_OPENCL_ASYNC_MEMOPS"); if (NULL == env) { const int confirmation = c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia"); - acc_opencl_options.async_memops = (EXIT_SUCCESS != confirmation); + c_dbcsr_acc_opencl_options.async_memops = (EXIT_SUCCESS != confirmation); } - else acc_opencl_options.async_memops = (0 != atoi(env)); + else c_dbcsr_acc_opencl_options.async_memops = (0 != atoi(env)); } else #endif - acc_opencl_options.async_memops = CL_FALSE; + c_dbcsr_acc_opencl_options.async_memops = CL_FALSE; #if defined(ACC_OPENCL_SVM) if (EXIT_SUCCESS == result) { const char *const env = getenv("ACC_OPENCL_SVM"); int level_major = 0; - acc_opencl_options.svm_interop = (NULL == env || 0 != atoi(env)) && - (EXIT_SUCCESS == acc_opencl_device_level(active_device, + c_dbcsr_acc_opencl_options.svm_interop = (NULL == env || 0 != atoi(env)) && + (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_level(active_device, &level_major, NULL/*level_minor*/) && 2 <= level_major); } else #endif - acc_opencl_options.svm_interop = CL_FALSE; + c_dbcsr_acc_opencl_options.svm_interop = CL_FALSE; } } else { /* mark as initialized */ - acc_opencl_ndevices = -1; + c_dbcsr_acc_opencl_ndevices = -1; } } else { /* mark as initialized */ - acc_opencl_ndevices = -1; + c_dbcsr_acc_opencl_ndevices = -1; } #if defined(__DBCSR_ACC) /* DBCSR shall call acc_init as well as libsmm_acc_init (since both interfaces are used). @@ -284,24 +289,28 @@ int c_dbcsr_acc_finalize(void) { #if defined(_OPENMP) /* initialization/finalization is not meant to be thread-safe */ - int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE); + int result = ((0 == omp_in_parallel() +# if /*WORKAROUND*/defined(__DBCSR_ACC) + || 0/*master*/ == omp_get_thread_num() +# endif + ) ? EXIT_SUCCESS : EXIT_FAILURE); #else int result = EXIT_SUCCESS; #endif - if (NULL != acc_opencl_context) { - const cl_context context = acc_opencl_context; - assert(0 < acc_opencl_ndevices); + if (NULL != c_dbcsr_acc_opencl_context) { + const cl_context context = c_dbcsr_acc_opencl_context; + assert(0 < c_dbcsr_acc_opencl_ndevices); #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT) # pragma omp parallel - if (context != acc_opencl_context) { - ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context), + if (context != c_dbcsr_acc_opencl_context) { + ACC_OPENCL_CHECK(clReleaseContext(c_dbcsr_acc_opencl_context), "release context", result); - acc_opencl_context = NULL; + c_dbcsr_acc_opencl_context = NULL; } #endif ACC_OPENCL_CHECK(clReleaseContext(context), "release context", result); - acc_opencl_context = NULL; + c_dbcsr_acc_opencl_context = NULL; #if defined(__DBCSR_ACC) /* DBCSR may call acc_init() as well as libsmm_acc_init() since both interface are used. * libsmm_acc_init may privately call acc_init (as it depends on the ACC interface). @@ -325,15 +334,14 @@ void c_dbcsr_acc_clear_errors(void) int c_dbcsr_acc_get_ndevices(int* ndevices) { int result; - #if defined(__DBCSR_ACC) /* DBCSR calls acc_get_ndevices before calling acc_init(). */ result = c_dbcsr_acc_init(); if (EXIT_SUCCESS == result) #endif { - if (NULL != ndevices && 0 != acc_opencl_ndevices) { - *ndevices = (0 < acc_opencl_ndevices ? acc_opencl_ndevices : 0); + if (NULL != ndevices && 0 != c_dbcsr_acc_opencl_ndevices) { + *ndevices = (0 < c_dbcsr_acc_opencl_ndevices ? c_dbcsr_acc_opencl_ndevices : 0); result = EXIT_SUCCESS; } else { @@ -352,13 +360,13 @@ int c_dbcsr_acc_opencl_device(void* stream, cl_device_id* device) ACC_OPENCL_CHECK(clGetCommandQueueInfo(*ACC_OPENCL_STREAM(stream), CL_QUEUE_DEVICE, sizeof(cl_device_id), device, NULL), "retrieve device from queue", result); } - else if (NULL != acc_opencl_context) { + else if (NULL != c_dbcsr_acc_opencl_context) { #if !defined(NDEBUG) size_t n = sizeof(cl_device_id); - ACC_OPENCL_CHECK(clGetContextInfo(acc_opencl_context, CL_CONTEXT_DEVICES, + ACC_OPENCL_CHECK(clGetContextInfo(c_dbcsr_acc_opencl_context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), device, &n), "retrieve id of active device", result); #else - ACC_OPENCL_CHECK(clGetContextInfo(acc_opencl_context, CL_CONTEXT_DEVICES, + ACC_OPENCL_CHECK(clGetContextInfo(c_dbcsr_acc_opencl_context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), device, NULL), "retrieve id of active device", result); #endif assert(EXIT_SUCCESS != result || sizeof(cl_device_id) == n/*single-device context*/); @@ -375,9 +383,8 @@ int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char* vendor) char buffer[ACC_OPENCL_BUFFERSIZE]; int result = EXIT_SUCCESS; assert(NULL != device && NULL != vendor); - buffer[0] = '\0'; - ACC_OPENCL_CHECK(clGetDeviceInfo(device, - CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL), + ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_VENDOR, + ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve device vendor", result); if (EXIT_SUCCESS == result) { return (NULL != c_dbcsr_acc_opencl_stristr(buffer, vendor) @@ -442,39 +449,65 @@ int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char *const extname int c_dbcsr_acc_opencl_set_active_device(int device_id, cl_device_id* device) { - cl_int result = (((0 <= device_id && device_id < acc_opencl_ndevices) || + cl_int result = (((0 <= device_id && device_id < c_dbcsr_acc_opencl_ndevices) || /* allow successful completion if no device was found */ - 0 > acc_opencl_ndevices) ? EXIT_SUCCESS : EXIT_FAILURE); - if (0 < acc_opencl_ndevices) { - const cl_device_id active_id = acc_opencl_devices[device_id]; + 0 > c_dbcsr_acc_opencl_ndevices) ? EXIT_SUCCESS : EXIT_FAILURE); + if (0 < c_dbcsr_acc_opencl_ndevices) { + const cl_device_id active_id = c_dbcsr_acc_opencl_devices[device_id]; cl_device_id current_id = NULL; if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, ¤t_id); - if (EXIT_SUCCESS == result && active_id != current_id) { - if (NULL != acc_opencl_context) { - ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context), + if (active_id != current_id) { + cl_platform_id platform = NULL; + ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), &platform, NULL), + "query device platform", result); + if (NULL != c_dbcsr_acc_opencl_context) { + ACC_OPENCL_CHECK(clReleaseContext(c_dbcsr_acc_opencl_context), "release context", result); } if (EXIT_SUCCESS == result) { +#if defined(NDEBUG) + void (*const notify)(const char*, const void*, size_t, void*) = NULL; +#else + void (*const notify)(const char*, const void*, size_t, void*) = c_dbcsr_acc_opencl_notify; +#endif cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, 0/*placeholder*/, /* insert other properties in front of below property */ CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE, /* TODO */ 0 /* end of properties */ }; -#if defined(NDEBUG) - void (*const notify)(const char*, const void*, size_t, void*) = NULL; -#else - void (*const notify)(const char*, const void*, size_t, void*) = acc_opencl_notify; -#endif - acc_opencl_context = clCreateContext(properties, + properties[1] = (long)platform; + c_dbcsr_acc_opencl_context = clCreateContext(properties, 1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result); if (CL_INVALID_VALUE == result) { /* retry */ const size_t n = sizeof(properties) / sizeof(*properties); assert(3 <= n); properties[n-3] = 0; - acc_opencl_context = clCreateContext(0 != properties[0] ? properties : NULL, + c_dbcsr_acc_opencl_context = clCreateContext(0 != properties[0] ? properties : NULL, 1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result); } - ACC_OPENCL_CHECK(result, "create context", result); + if (EXIT_SUCCESS == result) { + if (0 != c_dbcsr_acc_opencl_options.verbosity) { + char buffer[ACC_OPENCL_BUFFERSIZE]; + if (CL_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_NAME, + ACC_OPENCL_BUFFERSIZE, buffer, NULL)) + { + fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\"\n", + c_dbcsr_acc_opencl_ndevices, device_id, buffer); + } + } + } + else { + if (CL_INVALID_DEVICE == result) { + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia")) { + fprintf(stderr, + "WARNING ACC/OpenCL: if MPI-ranks target the same device in exclusive mode,\n" + " SMI must enable sharing the device.\n"); + } + } + ACC_OPENCL_ERROR("create context", result); + } } } if (NULL != device) { @@ -520,7 +553,8 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, if (NULL != max_value) { size_t value = 0; ACC_OPENCL_CHECK(clGetDeviceInfo(device, - CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value, NULL), + CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &value, NULL), "query maximum WG-size of device", result); assert(value <= INT_MAX); *max_value = (int)value; @@ -546,12 +580,12 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, int c_dbcsr_acc_opencl_kernel(const char* source, const char* build_options, const char* kernel_name, cl_kernel* kernel) { - char buffer[ACC_OPENCL_BUFFERSIZE] = "\0"; + char buffer[ACC_OPENCL_BUFFERSIZE] = ""; cl_int result; assert(NULL != kernel); - if (NULL != acc_opencl_context) { + if (NULL != c_dbcsr_acc_opencl_context) { const cl_program program = clCreateProgramWithSource( - acc_opencl_context, 1/*nlines*/, &source, NULL, &result); + c_dbcsr_acc_opencl_context, 1/*nlines*/, &source, NULL, &result); if (NULL != program) { cl_device_id active_id = NULL; assert(CL_SUCCESS == result); diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 6c8f5214a14..c38445f63a0 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -71,7 +71,7 @@ # define ACC_OPENCL_EVENT(A) ((cl_event*)(A)) #endif -#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && 1 +#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && /*WORKAROUND*/!defined(__DBCSR_ACC) # define ACC_OPENCL_THREADLOCAL_CONTEXT #endif #if !defined(ACC_OPENCL_STREAM_PRIORITIES) && 1 @@ -86,8 +86,8 @@ #if !defined(ACC_OPENCL_MEM_ASYNC) && 1 # define ACC_OPENCL_MEM_ASYNC #endif -#if !defined(ACC_OPENCL_VERBOSE) && 0 -# define ACC_OPENCL_VERBOSE +#if !defined(ACC_OPENCL_DEBUG) && 0 +# define ACC_OPENCL_DEBUG #endif #if !defined(ACC_OPENCL_SVM) && 0 # if defined(CL_VERSION_2_0) @@ -188,20 +188,23 @@ extern "C" { #endif /** Settings depending on OpenCL vendor or standard level (discovered/setup in acc_init). */ -typedef struct acc_opencl_options_t { - /** Asynchronous memory operations may crash for some OpenCL implementations. */ +typedef struct c_dbcsr_acc_opencl_options_t { + /** Asynchronous memory operations (may crash for some OpenCL implementations). */ cl_bool async_memops; + /** Runtime SVM support (needs ACC_OPENCL_SVM at compile-time). */ cl_bool svm_interop; -} acc_opencl_options_t; + /** Runtime verbosity (output on stderr). */ + cl_int verbosity; +} c_dbcsr_acc_opencl_options_t; -extern acc_opencl_options_t acc_opencl_options; +extern c_dbcsr_acc_opencl_options_t c_dbcsr_acc_opencl_options; /* non-zero if library is initialized, zero devices is signaled by nagative value */ -extern int acc_opencl_ndevices; +extern int c_dbcsr_acc_opencl_ndevices; /* allow a context per each OpenMP thread */ -extern cl_context acc_opencl_context; +extern cl_context c_dbcsr_acc_opencl_context; #if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT) -# pragma omp threadprivate(acc_opencl_context) +# pragma omp threadprivate(c_dbcsr_acc_opencl_context) #endif typedef struct c_dbcsr_acc_opencl_info_hostptr_t { diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c index fd59a69611d..5792b40acde 100644 --- a/src/acc/opencl/acc_opencl_event.c +++ b/src/acc/opencl/acc_opencl_event.c @@ -29,7 +29,7 @@ extern "C" { int c_dbcsr_acc_event_create(void** event_p) { cl_int result = EXIT_SUCCESS; - const cl_event event = clCreateUserEvent(acc_opencl_context, &result); + const cl_event event = clCreateUserEvent(c_dbcsr_acc_opencl_context, &result); assert(NULL != event_p); if (NULL != event) { cl_int status = CL_COMPLETE; @@ -107,7 +107,7 @@ int c_dbcsr_acc_event_query(void* event, acc_bool_t* has_occurred) } assert(NULL != has_occurred); *has_occurred = (CL_COMPLETE == status || 0 > status); -#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) +#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG) fprintf(stderr, "c_dbcsr_acc_event_query(%p, %i)\n", event, *has_occurred); #endif ACC_OPENCL_RETURN(result); @@ -118,7 +118,7 @@ int c_dbcsr_acc_event_synchronize(void* event) { /* Waits on the host-side. */ int result = EXIT_SUCCESS; assert(NULL != event); -#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) +#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG) fprintf(stderr, "c_dbcsr_acc_event_synchronize(%p)\n", event); #endif ACC_OPENCL_CHECK(clWaitForEvents(1, ACC_OPENCL_EVENT(event)), diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 17c904121bc..5e77c41bdb4 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -63,7 +63,7 @@ c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory) { void* result = NULL; - assert(acc_opencl_options.svm_interop); + assert(c_dbcsr_acc_opencl_options.svm_interop); if (NULL != memory && CL_SUCCESS != clGetMemObjectInfo(memory, CL_MEM_HOST_PTR, sizeof(void*), &result, NULL)) { assert(NULL == result); } @@ -79,15 +79,15 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) const size_t size = nbytes + alignment + size_meminfo - 1; const cl_mem buffer = ( #if defined(ACC_OPENCL_SVM) - acc_opencl_options.svm_interop ? clCreateBuffer(acc_opencl_context, CL_MEM_USE_HOST_PTR, size, - clSVMAlloc(acc_opencl_context, CL_MEM_READ_WRITE, size, sizeof(void*)/*minimal alignment*/), &result) : + c_dbcsr_acc_opencl_options.svm_interop ? clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_USE_HOST_PTR, size, + clSVMAlloc(c_dbcsr_acc_opencl_context, CL_MEM_READ_WRITE, size, sizeof(void*)/*minimal alignment*/), &result) : #endif - clCreateBuffer(acc_opencl_context, CL_MEM_ALLOC_HOST_PTR, size, NULL/*host_ptr*/, &result)); + clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_ALLOC_HOST_PTR, size, NULL/*host_ptr*/, &result)); assert(NULL != host_mem && NULL != stream); if (NULL != buffer) { const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); const uintptr_t address = (uintptr_t)clEnqueueMapBuffer(queue, buffer, - !acc_opencl_options.async_memops, CL_MAP_READ | CL_MAP_WRITE, + !c_dbcsr_acc_opencl_options.async_memops, CL_MAP_READ | CL_MAP_WRITE, 0/*offset*/, size, 0, NULL, NULL, &result); if (0 != address) { const uintptr_t aligned = ACC_OPENCL_UP2(address + size_meminfo, alignment); @@ -145,7 +145,7 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) ACC_OPENCL_CHECK(clReleaseMemObject(info.buffer), "release host memory buffer", result); #if defined(ACC_OPENCL_SVM) - if (acc_opencl_options.svm_interop) clSVMFree(acc_opencl_context, info.mapped); + if (c_dbcsr_acc_opencl_options.svm_interop) clSVMFree(c_dbcsr_acc_opencl_context, info.mapped); #endif } } @@ -158,10 +158,10 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) cl_int result; const cl_mem buffer = ( #if defined(ACC_OPENCL_SVM) - acc_opencl_options.svm_interop ? clCreateBuffer(acc_opencl_context, CL_MEM_USE_HOST_PTR, nbytes, - clSVMAlloc(acc_opencl_context, CL_MEM_READ_WRITE, nbytes, 0/*default alignment*/), &result) : + c_dbcsr_acc_opencl_options.svm_interop ? clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_USE_HOST_PTR, nbytes, + clSVMAlloc(c_dbcsr_acc_opencl_context, CL_MEM_READ_WRITE, nbytes, 0/*default alignment*/), &result) : #endif - clCreateBuffer(acc_opencl_context, CL_MEM_READ_WRITE, nbytes, NULL/*host_ptr*/, &result)); + clCreateBuffer(c_dbcsr_acc_opencl_context, CL_MEM_READ_WRITE, nbytes, NULL/*host_ptr*/, &result)); assert(NULL != dev_mem); if (NULL != buffer) { #if defined(ACC_OPENCL_MEM_NOALLOC) @@ -175,12 +175,12 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) } else { #if defined(ACC_OPENCL_SVM) - void *const ptr = (acc_opencl_options.svm_interop + void *const ptr = (c_dbcsr_acc_opencl_options.svm_interop ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL); #endif clReleaseMemObject(buffer); #if defined(ACC_OPENCL_SVM) - /*if (NULL != ptr)*/ clSVMFree(acc_opencl_context, ptr); + /*if (NULL != ptr)*/ clSVMFree(c_dbcsr_acc_opencl_context, ptr); #endif result = EXIT_FAILURE; } @@ -201,7 +201,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) if (NULL != dev_mem) { const cl_mem buffer = *ACC_OPENCL_MEM(dev_mem); #if defined(ACC_OPENCL_SVM) - void *const ptr = (acc_opencl_options.svm_interop + void *const ptr = (c_dbcsr_acc_opencl_options.svm_interop ? c_dbcsr_acc_opencl_get_hostptr(buffer) : NULL); #endif ACC_OPENCL_CHECK(clReleaseMemObject(buffer), @@ -212,7 +212,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) free(dev_mem); #endif #if defined(ACC_OPENCL_SVM) - /*if (NULL != ptr)*/ clSVMFree(acc_opencl_context, ptr); + /*if (NULL != ptr)*/ clSVMFree(c_dbcsr_acc_opencl_context, ptr); #endif } ACC_OPENCL_RETURN(result); @@ -238,7 +238,7 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v assert((NULL != host_mem || 0 == nbytes) && (NULL != dev_mem || 0 == nbytes) && NULL != stream); if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) { ACC_OPENCL_CHECK(clEnqueueWriteBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem), - !acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL), + !c_dbcsr_acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL), "enqueue h2d copy", result); } ACC_OPENCL_RETURN(result); @@ -251,7 +251,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v assert((NULL != dev_mem || 0 == nbytes) && (NULL != host_mem || 0 == nbytes) && NULL != stream); if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) { ACC_OPENCL_CHECK(clEnqueueReadBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem), - !acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL), + !c_dbcsr_acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL), "enqueue d2h copy", result); } ACC_OPENCL_RETURN(result); @@ -333,9 +333,9 @@ int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t cl_ulong cl_size_total = 0; ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &cl_size_total, NULL), "retrieve amount of device memory", result); - assert(0 < acc_opencl_ndevices); - size_total /= acc_opencl_ndevices; - size_free /= acc_opencl_ndevices; + assert(0 < c_dbcsr_acc_opencl_ndevices); + size_total /= c_dbcsr_acc_opencl_ndevices; + size_free /= c_dbcsr_acc_opencl_ndevices; if (EXIT_SUCCESS == result) { if (cl_size_total < size_total) size_total = cl_size_total; if (size_total < size_free) size_free = size_total; @@ -353,7 +353,7 @@ int c_dbcsr_acc_dev_mem_info(size_t* mem_free, size_t* mem_total) { int result = EXIT_SUCCESS; cl_device_id active_id = NULL; - if (NULL != acc_opencl_context) { + if (NULL != c_dbcsr_acc_opencl_context) { result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id); } if (EXIT_SUCCESS == result) { diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index be8825523b0..98f9c1bb84e 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -41,11 +41,11 @@ int c_dbcsr_acc_opencl_stream_create(cl_command_queue* stream_p, const char* nam { cl_int result = EXIT_SUCCESS; assert(NULL != stream_p); - if (NULL != acc_opencl_context) { + if (NULL != c_dbcsr_acc_opencl_context) { cl_device_id device_id = NULL; result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &device_id); if (EXIT_SUCCESS == result) { - *stream_p = ACC_OPENCL_CREATE_COMMAND_QUEUE(acc_opencl_context, device_id, properties, &result); + *stream_p = ACC_OPENCL_CREATE_COMMAND_QUEUE(c_dbcsr_acc_opencl_context, device_id, properties, &result); } else { ACC_OPENCL_ERROR("create command queue", result); @@ -58,7 +58,7 @@ int c_dbcsr_acc_opencl_stream_create(cl_command_queue* stream_p, const char* nam int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { cl_int result = EXIT_SUCCESS; - if (NULL != acc_opencl_context) { + if (NULL != c_dbcsr_acc_opencl_context) { cl_command_queue queue = NULL; #if !defined(ACC_OPENCL_STREAM_PRIORITIES) || !defined(CL_QUEUE_PRIORITY_KHR) ACC_OPENCL_UNUSED(priority); @@ -124,12 +124,12 @@ int c_dbcsr_acc_stream_destroy(void* stream) int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { int result = ((NULL != least || NULL != greatest) ? EXIT_SUCCESS : EXIT_FAILURE); - if (NULL != acc_opencl_context) { + if (NULL != c_dbcsr_acc_opencl_context) { #if defined(ACC_OPENCL_STREAM_PRIORITIES) && defined(CL_QUEUE_PRIORITY_KHR) char buffer[ACC_OPENCL_BUFFERSIZE]; cl_platform_id platform = NULL; cl_device_id active_id = NULL; - assert(0 < acc_opencl_ndevices); + assert(0 < c_dbcsr_acc_opencl_ndevices); if (EXIT_SUCCESS == result) result = c_dbcsr_acc_opencl_device(NULL/*stream*/, &active_id); ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL), @@ -165,7 +165,7 @@ int c_dbcsr_acc_stream_sync(void* stream) { /* Blocks the host-thread. */ int result = EXIT_SUCCESS; assert(NULL != stream); -#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) +#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG) fprintf(stderr, "c_dbcsr_acc_stream_sync(%p)\n", stream); #endif ACC_OPENCL_CHECK(clFinish(*ACC_OPENCL_STREAM(stream)), @@ -178,7 +178,7 @@ int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* Wait for an event (device-side). */ int result = EXIT_SUCCESS; assert(NULL != stream && NULL != event); -#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG) +#if defined(ACC_OPENCL_DEBUG) && defined(_DEBUG) fprintf(stderr, "c_dbcsr_acc_stream_wait_event(%p, %p)\n", stream, event); #endif #if defined(ACC_OPENCL_STREAM_SYNCFLUSH) diff --git a/src/acc/opencl/smm/README.md b/src/acc/opencl/smm/README.md index 1536bd14c04..90af6c979e4 100644 --- a/src/acc/opencl/smm/README.md +++ b/src/acc/opencl/smm/README.md @@ -14,7 +14,16 @@ The `OPENCL_LIBSMM_DEBUG` compile-time setting enables side-by-side validation o ### Runtime Settings -Runtime settings are made by the means of environment variables (implemented in `opencl_libsmm.c`). There are two categories (for the two major functions) like matrix transpose (`OPENCL_LIBSMM_TRANS_*`) and matrix multiplication (`OPENCL_LIBSMM_SMM_*`). For tranposing matrices: +Runtime settings are made by the means of environment variables (implemented in `opencl_libsmm.c`). There are two categories (for the two major functions) like matrix transpose (`OPENCL_LIBSMM_TRANS_*`) and matrix multiplication (`OPENCL_LIBSMM_SMM_*`). Common settings are (see OpenCL backend documentation for more details): + +* `ACC_OPENCL_VENDOR`: character string matching the vendor of the OpenCL device in an case-insensitive fashion, e.g., "intel". +* `ACC_OPENCL_DEVTYPE`: character string matching the device-kind like "cpu", "gpu", or another kind if neither CPU or GPU. +* `ACC_OPENCL_DEVICE`: non-negative integer number to select a device from the (internally enumerated) list of devices. +* `ACC_OPENCL_VERBOSE`: verbosity level (integer). + * `ACC_OPENCL_VERBOSE=1`: outputs (stderr) the number of devices found and the name of the selected device. + * `ACC_OPENCL_VERBOSE=2`: outputs (stderr) the duration needed to generate a requested kernel. + +For tranposing matrices: * `OPENCL_LIBSMM_TRANS_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler. * `OPENCL_LIBSMM_TRANS_INPLACE`: Boolean value (zero or non-zero integer) for inplace matrix transpose not relying on local memory. @@ -28,13 +37,17 @@ For multiplying matrices: * `OPENCL_LIBSMM_SMM_BLOCK_M`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction. * `OPENCL_LIBSMM_SMM_BLOCK_N`: non-negative integer number (less/equal than the N-extent) denoting the blocksize in N-direction. -**NOTE**: above runtime settings may be non-smooth in the sense of enabling a distinct code-path depending on a specific value, e.g., `OPENCL_LIBSMM_SMM_BATCHSIZE=1`. +**NOTE**: LIBSMM's tunable runtime settings may be non-smooth in the sense of enabling a distinct code-path depending on a specific value, e.g., `OPENCL_LIBSMM_SMM_BATCHSIZE=1` vs. `OPENCL_LIBSMM_SMM_BATCHSIZE=2`. ## Auto Tuning Auto tuning code for performance is a practical way to find the "best" setting for parameterized code (e.g., GPU kernels). Introducing effective parameters is a prerequisite, and exploring the (potentially) high-dimensional parameter space in an efficient way is an art. It is desirable to have reasonable defaults even without auto-tuning the parameters. It would be even better to avoid auto-tuning if best performance was possible right away, i.e., if auto-tuning is not able to find better settings. -For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages for instance the [acc_bench_smm](index.html) benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with subject being tuned. To build the benchmarks: +For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages the `acc_bench_smm` benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with the subject being tuned. + +**NOTE**: To toggle between tuning single-precision (SP) and double-precision (DP), the `ELEM_TYPE` in [acc_bench_smm.c](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_bench_smm.c#L22) can be edited. Auto-tuned parameters for SP and DP can both be embedded into the final application and are picked up correctly at runtime. + +To build the benchmarks: ```bash cd src/acc/opencl @@ -66,7 +79,7 @@ The OpenTuner script implements multiple objectives ("cost"), primarily "accurac [ 67s] INFO opentuner.search.plugin.DisplayPlugin: tests=53, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation ``` -The script finally writes a JSON-file with a filename like `tune_multiply-float-12x12x12-60gflops.json` which is encoding the benchmark (multiply), the precision (float), the kernel (12x12x12), and the achieved performance (60gflops). Tuninig starts from an internal default that is supposed to match LIBSMM's internal default parameters. However, tuning can be (re-)started with specific parameters (e.g., `-bs 64`, `-bm 13`, `-bn 1` for `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` respectively). +The script finally writes a JSON-file with a filename like `tune_multiply-float-12x12x12-60gflops.json` which is encoding the benchmark (multiply), the precision (float), the kernel (12x12x12), and the achieved performance (60gflops). The script handles SIGINT (like Ctrl-C), and output is still written despite of not terminating normally (can abused to tune interactively). Tuninig starts from an internal default that is supposed to match LIBSMM's internal default parameters. However, tuning can be (re-)started with specific parameters (e.g., `-bs 64`, `-bm 13`, `-bn 1` for `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` respectively). ## Optimized Kernels @@ -114,4 +127,4 @@ cd src/acc/opencl/smm ./tune_multiply.sh 300 8 1 4 10 15, 6 7 8, 23 ``` -The script `tune_multiply.sh` is tuning 1444 kernels by default (`./acc_bench_smm 300 8 1` taking approximately 15 hours per part). +The script `tune_multiply.sh` is tuning 1444 kernels by default (`./acc_bench_smm 300 8 1` taking approximately 15 hours per part). If the process is interrupted earlier (per SIGINT or Ctrl-C), the execution terminates for all requested kernels (triplet specification) unless an environment variable `CONTINUE=1` is set (proceeds to the next kernel). diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index f7878d1b1fe..2fe1ab87487 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -128,8 +128,12 @@ int opencl_libsmm_read_params(char* parambuf, int libsmm_acc_init(void) { #if defined(_OPENMP) - /* initialization/finalization is not meant to be thread-safe */ - int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE); + /* initialization/finalization is not meant to be thread-safe */ + int result = ((0 == omp_in_parallel() +# if /*WORKAROUND*/defined(__DBCSR_ACC) + || 0/*master*/ == omp_get_thread_num() +# endif + ) ? EXIT_SUCCESS : EXIT_FAILURE); #else int result = EXIT_SUCCESS; #endif @@ -204,9 +208,13 @@ int libsmm_acc_finalize(void) * However, libsmm_acc_finalize is indirectly called (acc_finalize) inside of a * parallel region (not just the master thread). */ -#if defined(_OPENMP) && /*WORKAROUND*/!defined(__DBCSR_ACC) +#if defined(_OPENMP) /* initialization/finalization is not meant to be thread-safe */ - int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE); + int result = ((0 == omp_in_parallel() +# if /*WORKAROUND*/defined(__DBCSR_ACC) + || 0/*master*/ == omp_get_thread_num() +# endif + ) ? EXIT_SUCCESS : EXIT_FAILURE); #else int result = EXIT_SUCCESS; #endif @@ -252,6 +260,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, ) && 0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim) { + const libxsmm_timer_tickint start = libxsmm_timer_tick(); opencl_libsmm_trans_t* config; opencl_libsmm_transkey_t key; LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */ @@ -314,6 +323,11 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, new_config.wgsize = (size_t)wgsize; config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(new_config), &new_config); + if (1 < c_dbcsr_acc_opencl_options.verbosity || 0 > c_dbcsr_acc_opencl_options.verbosity) { + const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); + fprintf(stderr, "INFO ACC/OpenCL: %ix%i transpose-kernel generated in %.1f ms\n", + m, n, 1000.0 * duration); + } } else result = EXIT_FAILURE; } @@ -452,6 +466,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, 0 < n_max && n_max <= max_kernel_dim && 0 < k_max && k_max <= max_kernel_dim) { + const libxsmm_timer_tickint start = libxsmm_timer_tick(); opencl_libsmm_smm_t* config; opencl_libsmm_smmkey_t key; LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */ @@ -572,7 +587,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, assert(0 < wgsize && 0 < max_wgsize); /* check planned WG-size against kernel-specific WG-size */ if (wgsize <= max_wgsize) { - if (NULL == config) { + const int default_params = (NULL == config ? 1 : 0); + if (default_params) { config = (opencl_libsmm_smm_t*)OPENCL_LIBSMM_REGISTER( &key, sizeof(key), sizeof(new_config), &new_config); } @@ -580,6 +596,11 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, config->wgsize = (size_t)wgsize; config->bs = bs; config->bm = bm; config->bn = bn; config->kernel = new_config.kernel; + if (1 < c_dbcsr_acc_opencl_options.verbosity || 0 > c_dbcsr_acc_opencl_options.verbosity) { + const double duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); + fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i %sSMM-kernel generated in %.1f ms\n", + m_max, n_max, k_max, default_params ? "" : "tuned ", 1000.0 * duration); + } } else { /* failed to register config */ result = EXIT_FAILURE; diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py index 5d871655797..06722d3f582 100755 --- a/src/acc/opencl/smm/tune_multiply.py +++ b/src/acc/opencl/smm/tune_multiply.py @@ -17,6 +17,7 @@ from opentuner import MeasurementInterface from opentuner import IntegerParameter from opentuner import Result +from signal import signal, SIGINT import json import glob import sys @@ -60,29 +61,34 @@ def manipulator(self): manipulator.add_parameter(IntegerParameter("BS", 1, self.args.mb)) manipulator.add_parameter(IntegerParameter("BM", 1, self.args.m)) manipulator.add_parameter(IntegerParameter("BN", 1, self.args.n)) + # register signal handler (CTRL-C) + signal(SIGINT, self.handle_sigint) return manipulator def seed_configurations(self): return [{"BS": self.args.bs, "BM": self.args.bm, "BN": self.args.bn}] def objective(self): - return opentuner.search.objective.MaximizeAccuracyMinimizeSize() + if not self.args.primary: + return opentuner.search.objective.MaximizeAccuracyMinimizeSize() + else: + return opentuner.search.objective.MaximizeAccuracy() def run(self, desired_result, input, limit): """ Compile and run a given configuration then return performance """ - cfg = desired_result.configuration.data + config = desired_result.configuration.data run_cmd = ( "OMP_PROC_BIND=TRUE CHECK=" + str(self.args.check) + " OPENCL_LIBSMM_SMM_BATCHSIZE=" - + str(cfg["BS"]) + + str(config["BS"]) + " OPENCL_LIBSMM_SMM_BLOCK_M=" - + str(cfg["BM"]) + + str(config["BM"]) + " OPENCL_LIBSMM_SMM_BLOCK_N=" - + str(cfg["BN"]) + + str(config["BN"]) + " " + self.exepath + "/" @@ -106,9 +112,12 @@ def run(self, desired_result, input, limit): if (match is not None) and match.group(1) and match.group(3): mseconds = float(match.group(1)) gflops = float(match.group(3)) - self.gflops = max(self.gflops, gflops) + if self.gflops < gflops: + # keep best configuration in case of an early exit + self.config = desired_result.configuration + self.gflops = gflops kernelreq = round( - (100.0 * cfg["BM"] * cfg["BN"]) / (self.args.m * self.args.n) + (100.0 * config["BM"] * config["BN"]) / (self.args.m * self.args.n) ) # gflops are reported as "accuracy" (console output) return Result(time=mseconds, accuracy=gflops, size=kernelreq) @@ -140,14 +149,15 @@ def save_final_config(self, configuration): + ofilename ) # extend result for easier reuse later - configuration.data["GFLOPS"] = self.gflops - configuration.data["TYPEID"] = self.typeid - configuration.data["M"] = self.args.m - configuration.data["N"] = self.args.n - configuration.data["K"] = self.args.k - # self.manipulator().save_to_file(configuration.data, ofilename) + config = configuration.data + config["GFLOPS"] = self.gflops + config["TYPEID"] = self.typeid + config["M"] = self.args.m + config["N"] = self.args.n + config["K"] = self.args.k + # self.manipulator().save_to_file(config, ofilename) with open(ofilename, "w") as ofile: - json.dump(configuration.data, ofile) + json.dump(config, ofile) ofile.write("\n") # append newline at EOF # merge all JSONs into a single CSV file if self.args.csvfile: @@ -172,7 +182,7 @@ def save_final_config(self, configuration): ifilename = merged[key][-1] merged[key] = value print( - "Superfluous " + "Worse result " + ifilename + " ignored when merging CSV file" ) @@ -204,6 +214,20 @@ def save_final_config(self, configuration): + self.args.csvfile ) + def handle_sigint(self, signum, frame): + """handles SIGINT or CTRL-C""" + print( + "\nWARNING: tuning " + + str(self.args.m) + + "x" + + str(self.args.n) + + "x" + + str(self.args.k) + + "-kernel was interrupted." + ) + self.save_final_config(self.config) + exit(1) + if __name__ == "__main__": argparser = opentuner.default_argparser() @@ -279,4 +303,12 @@ def save_final_config(self, configuration): dest="check", help="Validate kernel (epsilon)", ) + argparser.add_argument( + "-p", + "--primary-objective", + action="store_true", + default=False, + dest="primary", + help="Primary objective only", + ) SmmTuner.main(argparser.parse_args()) diff --git a/src/acc/opencl/smm/tune_multiply.sh b/src/acc/opencl/smm/tune_multiply.sh index 939b452af5d..0bc34130a40 100755 --- a/src/acc/opencl/smm/tune_multiply.sh +++ b/src/acc/opencl/smm/tune_multiply.sh @@ -92,7 +92,7 @@ if [ "${SED}" ] && [ "${LS}" ] && [ "${RM}" ] && [ "${WC}" ]; then fi NJSONS=$(${LS} -1 ./*.json 2>/dev/null | ${WC} -l) if [ "0" != "${NJSONS}" ]; then - echo "There are already ${NJSONS} (unrelated?) JSON-files found." + echo "Already found ${NJSONS} (unrelated?) JSON-files." fi SLEEP=$(command -v sleep) if [ "${DELAY}" ] && [ "${SLEEP}" ]; then @@ -109,6 +109,17 @@ if [ "${SED}" ] && [ "${LS}" ] && [ "${RM}" ] && [ "${WC}" ]; then # avoid mixing database of previous results into new session ${RM} -rf "${HERE}/opentuner.db" eval "${HERE}/tune_multiply.py ${TRIPLET} --no-dups ${LIMIT}" + RESULT=$? + # environment var. CONTINUE allows to proceed with next kernel + # even if tune_multiply.py returned non-zero exit code + if [[ ("0" != "${RESULT}") && \ + ("${CONTINUE}" = "" \ + || "${CONTINUE}" = "0" \ + || "${CONTINUE}" = "no" \ + || "${CONTINUE}" = "false") ]]; + then + exit ${RESULT} + fi fi N=$((N+1)) done