Skip to content

Commit 4d750ca

Browse files
authored
Introducing sb handle (codeplaysoftware#363)
* Introducing the concept of sycl_blas_handle * Removing dependency on the virtual pointer in ComputeCpp SDK * Removing policy handler concept * Removing The executor concept
1 parent 6b649bf commit 4d750ca

File tree

168 files changed

+3565
-4748
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

168 files changed

+3565
-4748
lines changed

Diff for: CMakeLists.txt

+4-6
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@ endif()
3939
option(INSTALL_HEADER_ONLY "Install SYCL-BLAS as a header only library" OFF)
4040

4141
set(BUILD_SHARED_LIBS ON CACHE BOOL "")
42-
set(CMAKE_CXX_STANDARD 11)
42+
set(CMAKE_CXX_STANDARD 14)
4343
set(CMAKE_CXX_STANDARD_REQUIRED ON)
4444
set(CMAKE_CXX_EXTENSIONS OFF)
4545
set(CMAKE_POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS})
4646

47+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-deprecated-copy-with-user-provided-copy")
48+
4749
if(DEFINED SYSTEM_BLAS_ROOT)
4850
message(DEPRECATION
4951
"SYSTEM_BLAS_ROOT is deprecated. Add the path to the reference BLAS to CMAKE_PREFIX_PATH instead")
@@ -74,7 +76,6 @@ find_package(PythonInterp 3 REQUIRED)
7476

7577
if (MSVC)
7678
# The device compiler needs C++14 to parse the Windows headers
77-
set(CMAKE_CXX_STANDARD 14)
7879
set(BUILD_SHARED_LIBS FALSE CACHE BOOL
7980
"Force SYCL-BLAS to be built as a static library on Windows"
8081
FORCE
@@ -189,10 +190,7 @@ if (INSTALL_HEADER_ONLY)
189190
FILES_MATCHING PATTERN "*.hpp"
190191
)
191192
endif()
192-
install(DIRECTORY ${CMAKE_SOURCE_DIR}/external/computecpp-sdk/include/vptr
193-
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
194-
COMPONENT sycl_blas
195-
)
193+
196194
install(FILES ${version_file} DESTINATION ${CMAKE_INSTALL_PREFIX})
197195
install(EXPORT sycl_blas
198196
DESTINATION ${CMAKE_INSTALL_PREFIX}

Diff for: README.md

+31-31
Original file line numberDiff line numberDiff line change
@@ -18,28 +18,31 @@ the project.
1818

1919
## Table of Contents
2020

21-
* [Motivation](#motivation)
22-
* [Basic Concepts](#basic-concepts)
23-
* [Views](#views)
24-
* [Operations](#operations)
25-
* [Executors](#executors)
26-
* [Interface](#interface)
27-
* [API description](#api-description)
28-
* [BLAS 1](#blas-1)
29-
* [BLAS 2](#blas-2)
30-
* [BLAS 3](#blas-3)
31-
* [Requirements](#requirements)
32-
* [Setup](#setup)
33-
* [Compile with ComputeCpp](#Compile-with-ComputeCpp)
34-
* [Compile with DPC++](#Compile-with-DPC++)
35-
* [Instaling SYCL-BLAS](#Instaling-SYCL-BLAS)
36-
* [POWER_VR support (ComputeCpp Only)](#POWER_VR-support-(ComputeCpp-Only))
37-
* [Doxygen](#Doxygen)
38-
* [How to compile](#how-to-compile)
39-
* [CMake options](#cmake-options)
40-
* [Cross-Compile](#cross-compile)
41-
* [Tests and benchmarks](#tests-and-benchmarks)
42-
* [Contributing to the project](#contributing-to-the-project)
21+
- [SYCL-BLAS Implementation](#sycl-blas-implementation)
22+
- [Table of Contents](#table-of-contents)
23+
- [Motivation](#motivation)
24+
- [Basic Concepts](#basic-concepts)
25+
- [Views](#views)
26+
- [Operations](#operations)
27+
- [SB\_Handle](#sb_handle)
28+
- [Interface](#interface)
29+
- [API description](#api-description)
30+
- [BLAS 1](#blas-1)
31+
- [BLAS 2](#blas-2)
32+
- [BLAS 3](#blas-3)
33+
- [Requirements](#requirements)
34+
- [Setup](#setup)
35+
- [Compile with ComputeCpp](#compile-with-computecpp)
36+
- [Compile with DPC++](#compile-with-dpc)
37+
- [Compile with hipSYCL](#compile-with-hipsycl)
38+
- [Instaling SYCL-BLAS](#instaling-sycl-blas)
39+
- [POWER\_VR support (ComputeCpp Only)](#power_vr-support-computecpp-only)
40+
- [Doxygen](#doxygen)
41+
- [CMake options](#cmake-options)
42+
- [Cross-Compile (ComputeCpp Only)](#cross-compile-computecpp-only)
43+
- [Tests and benchmarks](#tests-and-benchmarks)
44+
- [Contributing to the project](#contributing-to-the-project)
45+
- [Guides and Other Documents](#guides-and-other-documents)
4346

4447
## Motivation
4548

@@ -110,7 +113,7 @@ All the relevant files can be found in
110113
the `include` directory.
111114

112115
There are four components in SYCL-BLAS, the *View*, the *Operations*,
113-
the *Executors* and the *Interface* itself.
116+
the *SB_Handle* and the *Interface* itself.
114117

115118
### Views
116119

@@ -144,14 +147,11 @@ The leaf nodes of an Expression Tree are Views or Scalar types (data).
144147
The intermediate nodes of the Expression Tree are operations (e.g,
145148
binary operations, unary operations, etc).
146149

147-
### Executors
150+
### SB_Handle
148151

149-
An executor traverses the Expression Tree to evaluate the operations that it
152+
An SB_Handle traverses the Expression Tree to evaluate the operations that it
150153
defines.
151-
Executors use different techniques to evaluate the expression tree.
152-
The basic C++ executor performs a for loop on the size of the data and calls
153-
the evaluation function on each item.
154-
154+
SB_Handle use different techniques to evaluate the expression tree.
155155
The SYCL evaluator transform the tree into a device tree (i.e, converting
156156
buffer to accessors) and then evaluates the Expression Tree on the device.
157157

@@ -176,8 +176,8 @@ of multiple BLAS operations.
176176

177177
This section references all the supported operations and their interface.
178178

179-
All operations take as their first argument a reference to the executor, a
180-
`blas::Executor` created with a `sycl::queue`. The return value is usually an
179+
All operations take as their first argument a reference to the SB_Handle, a
180+
`blas::SB_Handle` created with a `sycl::queue`. The return value is usually an
181181
array of SYCL events (except for some operations that can return a scalar or
182182
a tuple). The containers for the vectors and matrices (and scalars written by
183183
the BLAS operations) are iterator buffers that can be created with

Diff for: Roadmap.md

-2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,4 @@ Medium Term:
2727
Long Term:
2828

2929
* Complete the Blas 3 interface
30-
* Test the Parallel and Sequential Executors
3130
* Add continuous integration testing
32-

Diff for: benchmark/clBench/acl/blas3/gemm.cpp

+15-14
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ std::string get_name(std::string t1, std::string t2, int m, int k, int n) {
3838
return str.str();
3939
}
4040

41-
void run(benchmark::State& state, int t1, int t2,
42-
index_t m, index_t k, index_t n, float alpha, float beta, bool* success) {
41+
void run(benchmark::State& state, int t1, int t2, index_t m, index_t k,
42+
index_t n, float alpha, float beta, bool* success) {
4343
// Standard test setup.
4444
std::string t1s = blas_benchmark::utils::from_transpose_enum(
4545
static_cast<blas_benchmark::utils::Transposition>(t1));
4646
std::string t2s = blas_benchmark::utils::from_transpose_enum(
4747
static_cast<blas_benchmark::utils::Transposition>(t2));
48-
if(t1s != "n" || t2s != "n") {
48+
if (t1s != "n" || t2s != "n") {
4949
state.SkipWithError("Transposed matrices not supported in ACL benchmarks");
5050
return;
5151
}
@@ -85,8 +85,7 @@ void run(benchmark::State& state, int t1, int t2,
8585
// Matrices
8686
std::vector<float> a = blas_benchmark::utils::random_data<float>(m * k);
8787
std::vector<float> b = blas_benchmark::utils::random_data<float>(k * n);
88-
std::vector<float> c =
89-
blas_benchmark::utils::const_data<float>(m * n, 0);
88+
std::vector<float> c = blas_benchmark::utils::const_data<float>(m * n, 0);
9089

9190
// Device matrices
9291
const arm_compute::TensorShape shape_a(k, m), shape_b(n, k), shape_c(n, m);
@@ -96,9 +95,12 @@ void run(benchmark::State& state, int t1, int t2,
9695
arm_compute::CLScheduler::get().default_init();
9796
arm_compute::CLTensor arm_a, arm_b, arm_c;
9897
#endif
99-
arm_a.allocator()->init(arm_compute::TensorInfo(shape_a, 1, arm_compute::DataType::F32));
100-
arm_b.allocator()->init(arm_compute::TensorInfo(shape_b, 1, arm_compute::DataType::F32));
101-
arm_c.allocator()->init(arm_compute::TensorInfo(shape_c, 1, arm_compute::DataType::F32));
98+
arm_a.allocator()->init(
99+
arm_compute::TensorInfo(shape_a, 1, arm_compute::DataType::F32));
100+
arm_b.allocator()->init(
101+
arm_compute::TensorInfo(shape_b, 1, arm_compute::DataType::F32));
102+
arm_c.allocator()->init(
103+
arm_compute::TensorInfo(shape_c, 1, arm_compute::DataType::F32));
102104
arm_a.allocator()->allocate();
103105
arm_b.allocator()->allocate();
104106
arm_c.allocator()->allocate();
@@ -172,14 +174,13 @@ void register_benchmark(blas_benchmark::Args& args, bool* success) {
172174
int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
173175
int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
174176

175-
auto BM_lambda = [&](benchmark::State& st, int t1,
176-
int t2, index_t m, index_t k, index_t n,
177-
float alpha, float beta, bool* success) {
177+
auto BM_lambda = [&](benchmark::State& st, int t1, int t2, index_t m,
178+
index_t k, index_t n, float alpha, float beta,
179+
bool* success) {
178180
run(st, t1, t2, m, k, n, alpha, beta, success);
179181
};
180-
benchmark::RegisterBenchmark(get_name(t1s, t2s, m, k, n).c_str(),
181-
BM_lambda, t1, t2, m, k, n, alpha,
182-
beta, success);
182+
benchmark::RegisterBenchmark(get_name(t1s, t2s, m, k, n).c_str(), BM_lambda,
183+
t1, t2, m, k, n, alpha, beta, success);
183184
}
184185
}
185186

Diff for: benchmark/clBench/acl/utils.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
#define ACL_UTILS_HPP
33

44
#include <CL/cl.h>
5-
#include <clBench/clwrap.hpp>
65
#include <arm_compute/core/Helpers.h>
76
#include <arm_compute/core/ITensor.h>
87
#include <arm_compute/core/Types.h>
@@ -11,6 +10,7 @@
1110
#include <arm_compute/runtime/CL/CLFunctions.h>
1211
#include <arm_compute/runtime/CL/CLScheduler.h>
1312
#include <arm_compute/runtime/Tensor.h>
13+
#include <clBench/clwrap.hpp>
1414
#include <thread>
1515

1616
#ifdef ACL_BACKEND_NEON

Diff for: benchmark/clBench/clblas/blas3/trsm.cpp

+3-6
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,8 @@ void run(benchmark::State& state, ExecutorType* executorPtr, char side,
6464
clblasTranspose transA =
6565
blas_benchmark::utils::translate_transposition(&transpose);
6666
clblasSide sideA = blas_benchmark::utils::translate_side(&side);
67-
clblasUplo triangleA =
68-
blas_benchmark::utils::translate_triangle(&triangle);
69-
clblasDiag diagA =
70-
blas_benchmark::utils::translate_diagonal(&diagonal);
67+
clblasUplo triangleA = blas_benchmark::utils::translate_triangle(&triangle);
68+
clblasDiag diagA = blas_benchmark::utils::translate_diagonal(&diagonal);
7169

7270
if (clblasSetup() != CL_SUCCESS) {
7371
state.SkipWithError("error initiazing clblas");
@@ -90,8 +88,7 @@ void run(benchmark::State& state, ExecutorType* executorPtr, char side,
9088
err = clEnqueueWriteBuffer(executorPtr->queue(), a_gpu, CL_TRUE, 0,
9189
sizeA * sizeof(scalar_t), a.data(), 0, nullptr,
9290
nullptr);
93-
err =
94-
clEnqueueWriteBuffer(executorPtr->queue(), b_gpu, CL_TRUE, 0,
91+
err = clEnqueueWriteBuffer(executorPtr->queue(), b_gpu, CL_TRUE, 0,
9592
sizeB * sizeof(scalar_t), b.data(), 0, nullptr,
9693
nullptr);
9794

Diff for: benchmark/clBench/clblas/utils.hpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#ifndef CLBLAST_UTILS_HPP
22
#define CLBLAST_UTILS_HPP
33

4-
#include <common/common_utils.hpp>
5-
#include <clBench/clwrap.hpp>
64
#include <clBLAS.h>
5+
#include <clBench/clwrap.hpp>
6+
#include <common/common_utils.hpp>
77

88
typedef Context ExecutorType;
99

@@ -86,7 +86,7 @@ static inline clblasSide translate_side(const char *side) {
8686
}
8787
}
8888

89-
static inline clblasUplo translate_triangle(const char* triangle) {
89+
static inline clblasUplo translate_triangle(const char *triangle) {
9090
if (triangle[0] == 'u') {
9191
return clblasUpper;
9292
} else if (triangle[0] == 'l') {
@@ -99,7 +99,8 @@ static inline clblasUplo translate_triangle(const char* triangle) {
9999
static inline clblasDiag translate_diagonal(const char *diag) {
100100
if (diag[0] == 'u') {
101101
return clblasUnit;
102-
} if (diag[0] == 'n') {
102+
}
103+
if (diag[0] == 'n') {
103104
return clblasNonUnit;
104105
} else {
105106
throw std::runtime_error("Got invalid diagonal parameter!");

Diff for: benchmark/clBench/clblast/utils.hpp

+4-3
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626
#ifndef CLBLAST_UTILS_HPP
2727
#define CLBLAST_UTILS_HPP
2828

29-
#include <common/common_utils.hpp>
3029
#include <clBench/clwrap.hpp>
3130
#include <clblast.h>
31+
#include <common/common_utils.hpp>
3232

3333
typedef Context ExecutorType;
3434

@@ -111,7 +111,7 @@ static inline clblast::Side translate_side(const char *side) {
111111
}
112112
}
113113

114-
static inline clblast::Triangle translate_triangle(const char* triangle) {
114+
static inline clblast::Triangle translate_triangle(const char *triangle) {
115115
if (triangle[0] == 'u') {
116116
return clblast::Triangle::kUpper;
117117
} else if (triangle[0] == 'l') {
@@ -124,7 +124,8 @@ static inline clblast::Triangle translate_triangle(const char* triangle) {
124124
static inline clblast::Diagonal translate_diagonal(const char *diag) {
125125
if (diag[0] == 'u') {
126126
return clblast::Diagonal::kUnit;
127-
} if (diag[0] == 'n') {
127+
}
128+
if (diag[0] == 'n') {
128129
return clblast::Diagonal::kNonUnit;
129130
} else {
130131
throw std::runtime_error("Got invalid diagonal parameter!");

Diff for: benchmark/clBench/clwrap.hpp

+9-12
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
#include <stdexcept>
3131
#include <vector>
3232

33+
#include <CL/cl.h>
3334
#include <algorithm>
3435
#include <memory>
35-
#include <CL/cl.h>
3636

3737
/* We don't want to return exceptions in destructors. #define them out for now.
3838
*/
@@ -150,10 +150,10 @@ class OpenCLDeviceSelector {
150150
cl_platform_id best_platform = NULL;
151151
int best_score = 0;
152152

153-
static cl_device_type match_device_type(std::string requested){
153+
static cl_device_type match_device_type(std::string requested) {
154154
if (requested.empty()) return CL_DEVICE_TYPE_ALL;
155155
std::transform(requested.begin(), requested.end(), requested.begin(),
156-
::tolower);
156+
::tolower);
157157
if (requested == "gpu") return CL_DEVICE_TYPE_GPU;
158158
if (requested == "cpu") return CL_DEVICE_TYPE_CPU;
159159
if (requested == "accel") return CL_DEVICE_TYPE_ACCELERATOR;
@@ -238,7 +238,8 @@ class OpenCLDeviceSelector {
238238
OpenCLDeviceSelector(std::string vendor, std::string type) {
239239
// Get the number of platforms, and a list of IDs
240240
cl_uint num_platforms = get_platform_count();
241-
std::unique_ptr<cl_platform_id[]> platforms(new cl_platform_id[num_platforms]);
241+
std::unique_ptr<cl_platform_id[]> platforms(
242+
new cl_platform_id[num_platforms]);
242243
cl_int status = clGetPlatformIDs(num_platforms, platforms.get(), NULL);
243244
if (status != CL_SUCCESS) {
244245
do_error("failure in clGetPlatformIDs");
@@ -254,7 +255,7 @@ class OpenCLDeviceSelector {
254255
cl_uint num_devices = get_device_count(platform);
255256
std::unique_ptr<cl_device_id[]> devices(new cl_device_id[num_devices]);
256257
cl_int status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices,
257-
devices.get(), NULL);
258+
devices.get(), NULL);
258259
if (status != CL_SUCCESS) {
259260
do_error("failure in clGetDeviceIDs");
260261
}
@@ -289,7 +290,6 @@ class Context {
289290
bool is_active = false;
290291

291292
public:
292-
293293
// Delete the copy constructor so that we don't accidentally leak references
294294
// to the underlying opencl context
295295
Context(const Context &) = delete;
@@ -301,7 +301,7 @@ class Context {
301301
is_active(c.active()),
302302
command_queue(c.queue()) {}
303303

304-
Context(OpenCLDeviceSelector oclds= OpenCLDeviceSelector("*", "*")) {
304+
Context(OpenCLDeviceSelector oclds = OpenCLDeviceSelector("*", "*")) {
305305
platform = oclds.platform();
306306
device = oclds.device();
307307
create();
@@ -314,8 +314,8 @@ class Context {
314314
if (status != CL_SUCCESS) {
315315
do_error("failure to create context");
316316
}
317-
command_queue =
318-
clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
317+
command_queue = clCreateCommandQueue(context, device,
318+
CL_QUEUE_PROFILING_ENABLE, &status);
319319
if (status != CL_SUCCESS) {
320320
do_error("failure to create command queue");
321321
}
@@ -349,7 +349,6 @@ class Context {
349349
}
350350

351351
operator cl_context() const { return context; }
352-
353352
};
354353

355354
class CLEventHandler {
@@ -379,7 +378,6 @@ class CLEventHandler {
379378
release(event);
380379
}
381380
}
382-
383381
};
384382

385383
template <typename scalar_t, int Options = CL_MEM_READ_WRITE>
@@ -458,5 +456,4 @@ class MemBuffer {
458456
}
459457
};
460458

461-
462459
#endif /* end of include guard: CLWRAP_HPP */

0 commit comments

Comments
 (0)