-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector-sum.cpp
146 lines (123 loc) · 4.4 KB
/
vector-sum.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <algorithm>
#include <iostream>
#include <random>
#include <numeric>
#include <sycl/sycl.hpp>
#include "util.hpp"
// Bring the SYCL API names used below (queue, buffer, accessor, ...) into scope unqualified.
using namespace sycl;
// Number of float elements: used as the kernel range and as the host input vector length.
constexpr size_t N = 512 * 512;
// Local (work-group) size passed to nd_range in vector_sum_nd_range; N is an exact multiple of B.
constexpr size_t B = 16 * 16;
/// Host-side reference implementation.
///
/// Adds the elements of `a` one after another, front to back, starting from
/// 0.f, and returns the single-precision total. An empty vector yields 0.f.
float vector_sum_native(const std::vector<float> &a) {
    float total = 0.f;
    for (const float value : a) {
        total = total + value;
    }
    return total;
}
/// Sums the N elements of `a_buf` into element 0 of `o_buf`, issuing one
/// relaxed atomic add per work-item.
///
/// The kernel accumulates onto the value already stored in `o_buf[0]`, so the
/// caller must initialise it before submitting (the tests in this file zero it).
///
/// @param q      queue the command group is submitted to
/// @param a_buf  input buffer; elements [0, N) are read
/// @param o_buf  output buffer; element 0 receives the accumulated sum
void vector_sum(queue &q, buffer<float, 1> &a_buf, buffer<float, 1> &o_buf) {
    // Submit the kernel to the queue
    q.submit([&](handler &h) {
        accessor a{a_buf, h, read_only};
        // The atomic += below is a read-modify-write of o[0]. The accessor must
        // therefore be read_write and must NOT carry no_init: with
        // write_only + no_init the runtime is allowed to discard the
        // caller's initial value, leaving the sum starting from garbage.
        accessor o{o_buf, h, read_write};
        // BEGIN CODE SNIP
        h.parallel_for(range{N}, [=](id<1> i) {
            // NOTE(review): memory_scope::system needs device support; only this
            // kernel touches o[0], so memory_scope::device may suffice - confirm.
            atomic_ref<float, memory_order::relaxed,
                       memory_scope::system,
                       access::address_space::global_space>(
                o[0]) += a[i];
        });
        // END CODE SNIP
    });
}
/// Sums the N elements of `a_buf` into element 0 of `o_buf` using an ND-range
/// launch with work-groups of size B.
///
/// Each work-group first combines its items' values with reduce_over_group;
/// only the group leader then adds that partial sum to `o_buf[0]` atomically,
/// so there is one atomic add per work-group instead of one per element.
///
/// The kernel accumulates onto the value already stored in `o_buf[0]`, so the
/// caller must initialise it before submitting (the tests in this file zero it).
///
/// @param q      queue the command group is submitted to
/// @param a_buf  input buffer; elements [0, N) are read
/// @param o_buf  output buffer; element 0 receives the accumulated sum
void vector_sum_nd_range(queue &q, buffer<float, 1> &a_buf, buffer<float, 1> &o_buf) {
    // Submit the kernel to the queue
    q.submit([&](handler &h) {
        accessor a{a_buf, h, read_only};
        // The leader's atomic += is a read-modify-write of o[0]. The accessor
        // must therefore be read_write and must NOT carry no_init: with
        // write_only + no_init the runtime is allowed to discard the
        // caller's initial value, leaving the sum starting from garbage.
        accessor o{o_buf, h, read_write};
        // BEGIN CODE SNIP
        h.parallel_for(nd_range<1>{N, B}, [=](nd_item<1> it) {
            auto grp = it.get_group();
            // Every work-item in the group contributes its own element.
            float group_sum = reduce_over_group(grp, a[it.get_global_id(0)], plus<>());
            if (grp.leader()) {
                // NOTE(review): memory_scope::system needs device support;
                // memory_scope::device may suffice here - confirm.
                atomic_ref<float, memory_order::relaxed,
                           memory_scope::system,
                           access::address_space::global_space>(
                    o[0]) += group_sum;
            }
        });
        // END CODE SNIP
    });
}
/// Sums the N elements of `a_buf` into `o_buf` through a SYCL reduction
/// object with the plus<> operator.
///
/// No extra properties are passed to reduction().
/// NOTE(review): presumably the result is therefore combined with the value
/// already held in `o_buf` (the tests zero it first) - confirm against the
/// SYCL reduction documentation.
///
/// @param q      queue the command group is submitted to
/// @param a_buf  input buffer; elements [0, N) are read
/// @param o_buf  single-element output buffer used as the reduction variable
void vector_sum_reduction(queue &q, buffer<float, 1> &a_buf, buffer<float, 1> &o_buf) {
    // Submit the kernel to the queue
    q.submit([&](handler &cgh) {
        accessor input{a_buf, cgh, read_only};
        auto sum_reduction = reduction(o_buf, cgh, plus<>());
        // BEGIN CODE SNIP
        cgh.parallel_for(range{N}, sum_reduction,
                         [=](id<1> idx, auto &partial) {
                             // Each work-item feeds one element into the reducer.
                             partial += input[idx];
                         });
        // END CODE SNIP
    });
}
void test_acc() {
// Initialize input and output memory on the host
std::vector<float> a(N), o(1);
std::default_random_engine gen(42);
std::uniform_real_distribution<float> dist(0.0, 1.0);
auto rng = [&]() { return dist(gen); };
std::generate(a.begin(), a.end(), rng);
// calculate groud truth
float gt = vector_sum_native(a);
queue cpu_q{cpu_selector_v};
queue gpu_q{gpu_selector_by_cu};
std::vector<std::tuple<
std::string,
std::function<void(queue &, buffer<float, 1> &, buffer<float, 1> &)>,
queue> > tests = {
{"CPU SYCL", vector_sum, cpu_q},
{"CPU SYCL ND-range", vector_sum_nd_range, cpu_q},
{"CPU SYCL Reduction", vector_sum_reduction, cpu_q},
{"GPU SYCL", vector_sum, gpu_q},
{"GPU SYCL ND-range", vector_sum_nd_range, gpu_q},
{"GPU SYCL Reduction", vector_sum_reduction, gpu_q}
};
for (auto &[name,kernel, q]: tests) {
{
std::fill(o.begin(), o.end(), 0);
buffer<float, 1> a_buf(a.data(), range<1>(N)),
o_buf(o.data(), range<1>(1));
kernel(q, a_buf, o_buf);
}
auto success = floatVectorEquals(o, {gt});
std::cout << name << ": " << (success ? "SUCCESS" : "FAILURE") << std::endl;
}
}
/// Benchmarks the host reference sum and every SYCL kernel variant.
///
/// Prints "CPU single core: " followed by whatever benchmark_func reports for
/// vector_sum_native, then, for each kernel on the CPU and GPU queues, prints
/// "<name>: " followed by whatever benchmark_sycl_kernel reports.
/// The input vector is left value-initialised (all zeros).
void test_perfomance() {
    std::vector<float> a(N), o(1);

    std::cout << "CPU single core: ";
    // NOTE(review): the result of vector_sum_native is discarded; confirm that
    // benchmark_func prevents the call from being optimised away.
    benchmark_func([&] { vector_sum_native(a); });

    queue cpu_q{cpu_selector_v};
    queue gpu_q{gpu_selector_by_cu};

    std::vector<std::tuple<
        std::string,
        std::function<void(queue &, buffer<float, 1> &, buffer<float, 1> &)>,
        queue> > tests = {
        {"CPU SYCL", vector_sum, cpu_q},
        {"CPU SYCL ND-range", vector_sum_nd_range, cpu_q},
        {"CPU SYCL Reduction", vector_sum_reduction, cpu_q},
        {"GPU SYCL", vector_sum, gpu_q},
        {"GPU SYCL ND-range", vector_sum_nd_range, gpu_q},
        {"GPU SYCL Reduction", vector_sum_reduction, gpu_q}
    };

    for (auto &test : tests) {
        // Plain references rather than structured bindings: a lambda may only
        // capture a structured binding from C++20 onwards, and the benchmark
        // lambda below needs to capture `kernel`.
        auto &name = std::get<0>(test);
        auto &kernel = std::get<1>(test);
        auto &q = std::get<2>(test);

        std::fill(o.begin(), o.end(), 0);
        buffer<float, 1> a_buf(a.data(), range<1>(N)),
            o_buf(o.data(), range<1>(1));
        std::cout << name << ": ";
        // The lambda parameter has its own name so it no longer shadows `q`.
        benchmark_sycl_kernel(
            [&](queue &bench_q) { kernel(bench_q, a_buf, o_buf); }, q);
    }
}
/// Entry point: runs the correctness checks first, then the benchmarks.
int main() {
    test_acc();
    test_perfomance();
    return 0;
}