
Commit 404e325

Add support for 2D tiled splits, split hints
This adds two new task hints, `split_1d` and `split_2d`, which influence how tasks are split into chunks. The latter uses a new splitting function of the same name. All splitting-related functionality is moved into a separate file, and tests for both 1D and 2D splitting are included.
1 parent 2908bc1 commit 404e325
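
For context, a minimal sketch of how a user could opt into the new 2D split from a command group, assuming the usual `distr_queue`/`handler` submission flow; the kernel name, ranges, and kernel body are placeholders:

#include <celerity.h>

int main() {
    celerity::distr_queue q;
    q.submit([](celerity::handler& cgh) {
        // Request a 2D tiled split for this task; without the hint the default 1D split is used.
        celerity::experimental::hint(cgh, celerity::experimental::hints::split_2d{});
        cgh.parallel_for<class my_stencil>(celerity::range<2>{1024, 1024}, [](celerity::item<2> itm) {
            // kernel body omitted
        });
    });
    return 0;
}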

10 files changed, +550 -49 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ Versioning](http://semver.org/spec/v2.0.0.html).
 - `distr_queue::fence` and `buffer_snapshot` are now stable, subsuming the `experimental::` APIs of the same name (#225)
 - Celerity now warns at runtime when a task declares reads from uninitialized buffers or writes with overlapping ranges between nodes (#224)
 - Introduce new `experimental::hint` API for providing the runtime with additional information on how to execute a task (#227)
+- Introduce new `experimental::hints::split_1d` and `experimental::hints::split_2d` task hints for controlling how a task is split into chunks (#227)
 
 ### Changed
 
CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -194,6 +194,7 @@ set(SOURCES
   src/recorders.cc
   src/runtime.cc
   src/scheduler.cc
+  src/split.cc
   src/task.cc
   src/task_manager.cc
   src/user_bench.cc

include/hint.h

Lines changed: 28 additions & 1 deletion
@@ -32,4 +32,31 @@ class hint_base {
 
 } // namespace celerity::detail
 
-namespace celerity::experimental::hints {}; // namespace celerity::experimental::hints
+namespace celerity::experimental::hints {
+
+/**
+ * Suggests that the task should be split into 1D chunks.
+ * This is currently the default behavior.
+ */
+class split_1d : public detail::hint_base {
+  private:
+    void validate(const hint_base& other) const override;
+};
+
+/**
+ * Suggests that the task should be split into 2D chunks.
+ */
+class split_2d : public detail::hint_base {
+  private:
+    void validate(const hint_base& other) const override;
+};
+
+inline void split_1d::validate(const hint_base& other) const {
+    if(dynamic_cast<const split_2d*>(&other) != nullptr) { throw std::runtime_error("Cannot combine split_1d and split_2d hints"); }
+}
+
+inline void split_2d::validate(const hint_base& other) const {
+    if(dynamic_cast<const split_1d*>(&other) != nullptr) { throw std::runtime_error("Cannot combine split_1d and split_2d hints"); }
+}
+
+}; // namespace celerity::experimental::hints
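
The same `validate` pattern can presumably be reused for other hints (compare the custom `my_hint` used in test/hint_tests.cc below): derive from `detail::hint_base` and veto incompatible combinations. A hedged sketch under that assumption; the hint name and the conflict rule are made up for illustration:

#include <stdexcept>
#include "hint.h"

// Hypothetical hint that, for illustration, declares itself incompatible with split_2d.
class my_custom_hint : public celerity::detail::hint_base {
  private:
    void validate(const celerity::detail::hint_base& other) const override {
        if(dynamic_cast<const celerity::experimental::hints::split_2d*>(&other) != nullptr) {
            throw std::runtime_error("Cannot combine my_custom_hint and split_2d hints");
        }
    }
};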

include/split.h

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <vector>
+
+#include "ranges.h"
+
+namespace celerity::detail {
+
+std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
+std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
+
+} // namespace celerity::detail
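
These are internal (`detail`) helpers rather than user-facing API, but a small sketch helps illustrate the remainder handling: splitting 100 work items into 3 chunks with unit granularity yields one larger chunk of 34 items followed by two chunks of 33. The `chunk<3>` aggregate initialization mirrors the usage in src/split.cc below; the includes and printing are assumptions:

#include <cstdio>
#include "split.h"

using namespace celerity;
using namespace celerity::detail;

int main() {
    // 100 work items, unit granularity, 3 requested chunks.
    const chunk<3> full{id<3>{0, 0, 0}, range<3>{100, 1, 1}, range<3>{100, 1, 1}};
    const auto chunks = split_1d(full, range<3>{1, 1, 1}, 3);
    for(const auto& c : chunks) {
        // Expected offset/range pairs: (0, 34), (34, 33), (67, 33).
        std::printf("offset=%zu range=%zu\n", c.offset[0], c.range[0]);
    }
    return 0;
}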

src/distributed_graph_generator.cc

Lines changed: 8 additions & 48 deletions
@@ -4,6 +4,7 @@
 #include "command.h"
 #include "command_graph.h"
 #include "recorders.h"
+#include "split.h"
 #include "task.h"
 #include "task_manager.h"
 
@@ -35,53 +36,6 @@ void distributed_graph_generator::add_buffer(const buffer_id bid, const int dims
 	m_buffer_states.at(bid).replicated_regions.update_region(subrange<3>({}, range), node_bitset{}.set());
 }
 
-// We simply split in the first dimension for now
-static std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks, const int dims) {
-#ifndef NDEBUG
-    assert(num_chunks > 0);
-    for(int d = 0; d < dims; ++d) {
-        assert(granularity[d] > 0);
-        assert(full_chunk.range[d] % granularity[d] == 0);
-    }
-#endif
-
-    // Due to split granularity requirements or if num_workers > global_size[0],
-    // we may not be able to create the requested number of chunks.
-    const auto actual_num_chunks = std::min(num_chunks, full_chunk.range[0] / granularity[0]);
-
-    // If global range is not divisible by (actual_num_chunks * granularity),
-    // assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
-    const auto small_chunk_size_dim0 = full_chunk.range[0] / (actual_num_chunks * granularity[0]) * granularity[0];
-    const auto large_chunk_size_dim0 = small_chunk_size_dim0 + granularity[0];
-    const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * actual_num_chunks) / granularity[0];
-    assert(num_large_chunks * large_chunk_size_dim0 + (actual_num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);
-
-    std::vector<chunk<3>> result(actual_num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
-    for(auto i = 0u; i < num_large_chunks; ++i) {
-        result[i].range[0] = large_chunk_size_dim0;
-        result[i].offset[0] += i * large_chunk_size_dim0;
-    }
-    for(auto i = num_large_chunks; i < actual_num_chunks; ++i) {
-        result[i].range[0] = small_chunk_size_dim0;
-        result[i].offset[0] += num_large_chunks * large_chunk_size_dim0 + (i - num_large_chunks) * small_chunk_size_dim0;
-    }
-
-#ifndef NDEBUG
-    size_t total_range_dim0 = 0;
-    for(size_t i = 0; i < result.size(); ++i) {
-        total_range_dim0 += result[i].range[0];
-        if(i == 0) {
-            assert(result[i].offset[0] == full_chunk.offset[0]);
-        } else {
-            assert(result[i].offset[0] == result[i - 1].offset[0] + result[i - 1].range[0]);
-        }
-    }
-    assert(total_range_dim0 == full_chunk.range[0]);
-#endif
-
-    return result;
-}
-
 using buffer_requirements_map = std::unordered_map<buffer_id, std::unordered_map<access_mode, region<3>>>;
 
 static buffer_requirements_map get_buffer_requirements_for_mapped_access(const task& tsk, subrange<3> sr, const range<3> global_size)
@@ -179,7 +133,13 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
         }
         return chunks;
     }
-    if(tsk.has_variable_split()) { return split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions()); }
+    if(tsk.has_variable_split()) {
+        if(tsk.get_hint<experimental::hints::split_1d>() != nullptr) {
+            // no-op, keeping this for documentation purposes
+        }
+        if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
+        return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
+    }
     return std::vector<chunk<3>>{full_chunk};
 })();
 assert(chunks.size() <= num_chunks); // We may have created less than requested

src/split.cc

Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
+#include "split.h"
+
+#include <array>
+#include <tuple>
+
+#include "grid.h"
+
+namespace {
+
+using namespace celerity;
+using namespace celerity::detail;
+
+[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
+    region<3> reconstructed_chunk;
+    for(auto& chnk : split) {
+        assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
+        reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
+    }
+    assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
+}
+
+template <int Dims>
+std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
+    const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
+    range<Dims> small_chunk_size{zeros};
+    range<Dims> large_chunk_size{zeros};
+    range<Dims> num_large_chunks{zeros};
+    for(int d = 0; d < Dims; ++d) {
+        const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
+        small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
+        large_chunk_size[d] = small_chunk_size[d] + granularity[d];
+        num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
+    }
+    return {small_chunk_size, large_chunk_size, num_large_chunks};
+}
+
+/**
+ * Given a factorization of `num_chunks` (i.e., `f0 * f1 = num_chunks`), try to find the assignment of factors to
+ * dimensions that produces more chunks under the given constraints. If they are tied, try to find the assignment
+ * that results in a "nicer" split according to some heuristics (see below).
+ *
+ * The single argument `factor` specifies both factors, as `f0 = factor` and `f1 = num_chunks / factor`.
+ *
+ * @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
+ * (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
+ */
+std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
+    assert(num_chunks % factor == 0);
+    const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
+    const size_t f0 = factor;
+    const size_t f1 = num_chunks / factor;
+
+    // Decide in which direction to split by first checking which
+    // factor assignment produces more chunks under the given constraints.
+    const std::array<size_t, 2> split_0_1 = {std::min(f0, max_chunks[0]), std::min(f1, max_chunks[1])};
+    const std::array<size_t, 2> split_1_0 = {std::min(f1, max_chunks[0]), std::min(f0, max_chunks[1])};
+    const auto count0 = split_0_1[0] * split_0_1[1];
+    const auto count1 = split_1_0[0] * split_1_0[1];
+
+    if(count0 > count1) { return split_0_1; }
+    if(count0 < count1) { return split_1_0; }
+
+    // If we're tied for the number of chunks we can create, try some heuristics to decide.
+
+    // If domain is square(-ish), prefer splitting along slower dimension.
+    // (These bounds have been chosen arbitrarily!)
+    const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
+    if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }
+
+    // For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
+    const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
+    const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
+    return circ0 < circ1 ? split_0_1 : split_1_0;
+
+    // TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
+    // i.e., how balanced the workload is.
+}
+
+} // namespace
+
+namespace celerity::detail {
+
+std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
+#ifndef NDEBUG
+    assert(num_chunks > 0);
+    for(int d = 0; d < 3; ++d) {
+        assert(granularity[d] > 0);
+        assert(full_chunk.range[d] % granularity[d] == 0);
+    }
+#endif
+
+    // Due to split granularity requirements or if num_workers > global_size[0],
+    // we may not be able to create the requested number of chunks.
+    const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
+    const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);
+
+    std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
+    for(auto i = 0u; i < num_large_chunks[0]; ++i) {
+        result[i].range[0] = large_chunk_size[0];
+        result[i].offset[0] += i * large_chunk_size[0];
+    }
+    for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
+        result[i].range[0] = small_chunk_size[0];
+        result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
+    }
+
+#ifndef NDEBUG
+    sanity_check_split(full_chunk, result);
+#endif
+
+    return result;
+}
+
+// TODO: Make the split dimensions configurable for 3D chunks?
+std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
+#ifndef NDEBUG
+    assert(num_chunks > 0);
+    for(int d = 0; d < 3; ++d) {
+        assert(granularity[d] > 0);
+        assert(full_chunk.range[d] % granularity[d] == 0);
+    }
+#endif
+
+    // Factorize num_chunks
+    // We start out with an initial guess of `factor = floor(sqrt(num_chunks))` (the other one is implicitly given by `num_chunks / factor`),
+    // and work our way down, keeping track of the best factorization we've found so far, until we find a factorization that produces
+    // the requested number of chunks, or until we reach (1, num_chunks), i.e., a 1D split.
+    size_t factor = std::floor(std::sqrt(num_chunks));
+    std::array<size_t, 2> best_chunk_counts = {0, 0};
+    while(factor >= 1) {
+        while(factor > 1 && num_chunks % factor != 0) {
+            factor--;
+        }
+        // The returned counts are at most (factor, num_chunks / factor), however may be less if constrained by the split granularity.
+        const auto chunk_counts = assign_split_factors_2d(full_chunk, granularity, factor, num_chunks);
+        if(chunk_counts[0] * chunk_counts[1] > best_chunk_counts[0] * best_chunk_counts[1]) { best_chunk_counts = chunk_counts; }
+        if(chunk_counts[0] * chunk_counts[1] == num_chunks) { break; }
+        factor--;
+    }
+    const auto actual_num_chunks = best_chunk_counts;
+    const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);
+
+    std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
+    id<3> offset = full_chunk.offset;
+
+    for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
+        range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
+        for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
+            chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
+            auto& chnk = result[j * actual_num_chunks[1] + i];
+            chnk.offset = offset;
+            chnk.range[0] = chunk_size[0];
+            chnk.range[1] = chunk_size[1];
+            offset[1] += chunk_size[1];
+        }
+        offset[0] += chunk_size[0];
+        offset[1] = full_chunk.offset[1];
+    }
+
+#ifndef NDEBUG
+    sanity_check_split(full_chunk, result);
+#endif
+
+    return result;
+}
+} // namespace celerity::detail
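
To make the factor search concrete: for num_chunks = 4 the initial guess factor = floor(sqrt(4)) = 2 already divides 4, and on a 128x128 domain the resulting 2x2 grid produces the requested number of chunks, so every chunk is 64x64 (exactly what the new granularity test below checks). Hand-tracing the code for num_chunks = 6 on the same domain suggests factors (2, 3); since the domain is square, the squareishness heuristic puts the larger factor on the slower dimension, giving a 3x2 grid with row heights 43, 43, and 42. A minimal sketch of the confirmed 4-chunk case, reusing the assumed `chunk<3>` construction from the code above:

#include <cassert>
#include "split.h"

using namespace celerity;
using namespace celerity::detail;

int main() {
    const chunk<3> full{id<3>{0, 0, 0}, range<3>{128, 128, 1}, range<3>{128, 128, 1}};
    // Request 4 chunks: the factor search settles on a 2x2 grid of 64x64 chunks.
    const auto chunks = split_2d(full, range<3>{1, 1, 1}, 4);
    assert(chunks.size() == 4);
    for(const auto& c : chunks) {
        assert((c.range == range<3>{64, 64, 1}));
    }
    return 0;
}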

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ set(TEST_TARGETS
   runtime_tests
   runtime_deprecation_tests
   sycl_tests
+  split_tests
   task_graph_tests
   task_ring_buffer_tests
   test_utils_tests

test/graph_gen_granularity_tests.cc

Lines changed: 10 additions & 0 deletions
@@ -74,6 +74,16 @@ TEST_CASE("distributed_graph_generator respects split constraints", "[distribute
 	CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_b).get_raw(1)[0])->get_execution_range().range == range<3>{96, 1, 1});
 }
 
+TEST_CASE("distributed_graph_generator creates 2-dimensional chunks when providing the split_2d hint", "[distributed_graph_generator][split][task-hints]") {
+    const size_t num_nodes = 4;
+    dist_cdag_test_context dctx(num_nodes);
+    const auto tid_a = dctx.device_compute<class UKN(task)>(range<2>{128, 128}).hint(experimental::hints::split_2d{}).submit();
+    REQUIRE(dctx.query(tid_a).count() == 4);
+    for(node_id nid = 0; nid < 4; ++nid) {
+        CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_a).get_raw(nid)[0])->get_execution_range().range == range<3>{64, 64, 1});
+    }
+}
+
 template <int Dims>
 class simple_task;
 
test/hint_tests.cc

Lines changed: 17 additions & 0 deletions
@@ -55,3 +55,20 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "hints can ensure combinations wit
         CHECK_THROWS_WITH(experimental::hint(cgh, my_hint{1336}), "not leet enough");
     });
 }
+
+TEST_CASE_METHOD(test_utils::runtime_fixture, "split_1d and split_2d hints cannot be combined", "[task-hints]") {
+    celerity::runtime::init(nullptr, nullptr);
+    auto& tm = detail::runtime::get_instance().get_task_manager();
+    SECTION("1d then 2d") {
+        test_utils::add_compute_task<class UKN(hint_task)>(tm, [&](handler& cgh) {
+            CHECK_NOTHROW(experimental::hint(cgh, experimental::hints::split_1d{}));
+            CHECK_THROWS_WITH(experimental::hint(cgh, experimental::hints::split_2d{}), "Cannot combine split_1d and split_2d hints");
+        });
+    }
+    SECTION("2d then 1d") {
+        test_utils::add_compute_task<class UKN(hint_task)>(tm, [&](handler& cgh) {
+            CHECK_NOTHROW(experimental::hint(cgh, experimental::hints::split_2d{}));
+            CHECK_THROWS_WITH(experimental::hint(cgh, experimental::hints::split_1d{}), "Cannot combine split_1d and split_2d hints");
+        });
+    }
+}
