
Commit 404e325

Add support for 2D tiled splits, split hints
This adds two new task hints, `split_1d` and `split_2d`, which influence how tasks are split into chunks. The latter uses a new splitting function of the same name. All splitting-related functionality is moved into a separate file, and tests for both 1D and 2D splitting are included.
1 parent 2908bc1 commit 404e325
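
For context, a minimal sketch of how a user could opt into the new 2D split from a command group, assuming the usual `distr_queue`/`handler` submission flow; the kernel name, ranges, and kernel body are placeholders:

#include <celerity.h>

int main() {
    celerity::distr_queue q;
    q.submit([](celerity::handler& cgh) {
        // Request a 2D tiled split for this task; without the hint the default 1D split is used.
        celerity::experimental::hint(cgh, celerity::experimental::hints::split_2d{});
        cgh.parallel_for<class my_stencil>(celerity::range<2>{1024, 1024}, [](celerity::item<2> itm) {
            // kernel body omitted
        });
    });
    return 0;
}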

10 files changed, +550 -49 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ Versioning](http://semver.org/spec/v2.0.0.html).
 - `distr_queue::fence` and `buffer_snapshot` are now stable, subsuming the `experimental::` APIs of the same name (#225)
 - Celerity now warns at runtime when a task declares reads from uninitialized buffers or writes with overlapping ranges between nodes (#224)
 - Introduce new `experimental::hint` API for providing the runtime with additional information on how to execute a task (#227)
+- Introduce new `experimental::hints::split_1d` and `experimental::hints::split_2d` task hints for controlling how a task is split into chunks (#227)
 
 ### Changed
 
CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -194,6 +194,7 @@ set(SOURCES
   src/recorders.cc
   src/runtime.cc
   src/scheduler.cc
+  src/split.cc
   src/task.cc
   src/task_manager.cc
   src/user_bench.cc

include/hint.h

Lines changed: 28 additions & 1 deletion
@@ -32,4 +32,31 @@ class hint_base {
 
 } // namespace celerity::detail
 
-namespace celerity::experimental::hints {}; // namespace celerity::experimental::hints
+namespace celerity::experimental::hints {
+
+/**
+ * Suggests that the task should be split into 1D chunks.
+ * This is currently the default behavior.
+ */
+class split_1d : public detail::hint_base {
+  private:
+    void validate(const hint_base& other) const override;
+};
+
+/**
+ * Suggests that the task should be split into 2D chunks.
+ */
+class split_2d : public detail::hint_base {
+  private:
+    void validate(const hint_base& other) const override;
+};
+
+inline void split_1d::validate(const hint_base& other) const {
+    if(dynamic_cast<const split_2d*>(&other) != nullptr) { throw std::runtime_error("Cannot combine split_1d and split_2d hints"); }
+}
+
+inline void split_2d::validate(const hint_base& other) const {
+    if(dynamic_cast<const split_1d*>(&other) != nullptr) { throw std::runtime_error("Cannot combine split_1d and split_2d hints"); }
+}
+
+}; // namespace celerity::experimental::hints
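
The same `validate` pattern can presumably be reused for other hints (compare the custom `my_hint` used in test/hint_tests.cc below): derive from `detail::hint_base` and veto incompatible combinations. A hedged sketch under that assumption; the hint name and the conflict rule are made up for illustration:

#include <stdexcept>
#include "hint.h"

// Hypothetical hint that, for illustration, declares itself incompatible with split_2d.
class my_custom_hint : public celerity::detail::hint_base {
  private:
    void validate(const celerity::detail::hint_base& other) const override {
        if(dynamic_cast<const celerity::experimental::hints::split_2d*>(&other) != nullptr) {
            throw std::runtime_error("Cannot combine my_custom_hint and split_2d hints");
        }
    }
};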

include/split.h

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <vector>
+
+#include "ranges.h"
+
+namespace celerity::detail {
+
+std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
+std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
+
+} // namespace celerity::detail
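
These are internal (`detail`) helpers rather than user-facing API, but a small sketch helps illustrate the remainder handling: splitting 100 work items into 3 chunks with unit granularity yields one larger chunk of 34 items followed by two chunks of 33. The `chunk<3>` aggregate initialization mirrors the usage in src/split.cc below; the includes and printing are assumptions:

#include <cstdio>
#include "split.h"

using namespace celerity;
using namespace celerity::detail;

int main() {
    // 100 work items, unit granularity, 3 requested chunks.
    const chunk<3> full{id<3>{0, 0, 0}, range<3>{100, 1, 1}, range<3>{100, 1, 1}};
    const auto chunks = split_1d(full, range<3>{1, 1, 1}, 3);
    for(const auto& c : chunks) {
        // Expected offset/range pairs: (0, 34), (34, 33), (67, 33).
        std::printf("offset=%zu range=%zu\n", c.offset[0], c.range[0]);
    }
    return 0;
}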

src/distributed_graph_generator.cc

Lines changed: 8 additions & 48 deletions
@@ -4,6 +4,7 @@
 #include "command.h"
 #include "command_graph.h"
 #include "recorders.h"
+#include "split.h"
 #include "task.h"
 #include "task_manager.h"
 
@@ -35,53 +36,6 @@ void distributed_graph_generator::add_buffer(const buffer_id bid, const int dims
 	m_buffer_states.at(bid).replicated_regions.update_region(subrange<3>({}, range), node_bitset{}.set());
 }
 
-// We simply split in the first dimension for now
-static std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks, const int dims) {
-#ifndef NDEBUG
-    assert(num_chunks > 0);
-    for(int d = 0; d < dims; ++d) {
-        assert(granularity[d] > 0);
-        assert(full_chunk.range[d] % granularity[d] == 0);
-    }
-#endif
-
-    // Due to split granularity requirements or if num_workers > global_size[0],
-    // we may not be able to create the requested number of chunks.
-    const auto actual_num_chunks = std::min(num_chunks, full_chunk.range[0] / granularity[0]);
-
-    // If global range is not divisible by (actual_num_chunks * granularity),
-    // assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
-    const auto small_chunk_size_dim0 = full_chunk.range[0] / (actual_num_chunks * granularity[0]) * granularity[0];
-    const auto large_chunk_size_dim0 = small_chunk_size_dim0 + granularity[0];
-    const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * actual_num_chunks) / granularity[0];
-    assert(num_large_chunks * large_chunk_size_dim0 + (actual_num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);
-
-    std::vector<chunk<3>> result(actual_num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
-    for(auto i = 0u; i < num_large_chunks; ++i) {
-        result[i].range[0] = large_chunk_size_dim0;
-        result[i].offset[0] += i * large_chunk_size_dim0;
-    }
-    for(auto i = num_large_chunks; i < actual_num_chunks; ++i) {
-        result[i].range[0] = small_chunk_size_dim0;
-        result[i].offset[0] += num_large_chunks * large_chunk_size_dim0 + (i - num_large_chunks) * small_chunk_size_dim0;
-    }
-
-#ifndef NDEBUG
-    size_t total_range_dim0 = 0;
-    for(size_t i = 0; i < result.size(); ++i) {
-        total_range_dim0 += result[i].range[0];
-        if(i == 0) {
-            assert(result[i].offset[0] == full_chunk.offset[0]);
-        } else {
-            assert(result[i].offset[0] == result[i - 1].offset[0] + result[i - 1].range[0]);
-        }
-    }
-    assert(total_range_dim0 == full_chunk.range[0]);
-#endif
-
-    return result;
-}
-
 using buffer_requirements_map = std::unordered_map<buffer_id, std::unordered_map<access_mode, region<3>>>;
 
 static buffer_requirements_map get_buffer_requirements_for_mapped_access(const task& tsk, subrange<3> sr, const range<3> global_size)
@@ -179,7 +133,13 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
         }
         return chunks;
     }
-    if(tsk.has_variable_split()) { return split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions()); }
+    if(tsk.has_variable_split()) {
+        if(tsk.get_hint<experimental::hints::split_1d>() != nullptr) {
+            // no-op, keeping this for documentation purposes
+        }
+        if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
+        return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
+    }
     return std::vector<chunk<3>>{full_chunk};
 })();
 assert(chunks.size() <= num_chunks); // We may have created less than requested

src/split.cc

Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
+#include "split.h"
+
+#include <array>
+#include <tuple>
+
+#include "grid.h"
+
+namespace {
+
+using namespace celerity;
+using namespace celerity::detail;
+
+[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
+    region<3> reconstructed_chunk;
+    for(auto& chnk : split) {
+        assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
+        reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
+    }
+    assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
+}
+
+template <int Dims>
+std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
+    const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
+    range<Dims> small_chunk_size{zeros};
+    range<Dims> large_chunk_size{zeros};
+    range<Dims> num_large_chunks{zeros};
+    for(int d = 0; d < Dims; ++d) {
+        const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
+        small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
+        large_chunk_size[d] = small_chunk_size[d] + granularity[d];
+        num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
+    }
+    return {small_chunk_size, large_chunk_size, num_large_chunks};
+}
+
+/**
+ * Given a factorization of `num_chunks` (i.e., `f0 * f1 = num_chunks`), try to find the assignment of factors to
+ * dimensions that produces more chunks under the given constraints. If they are tied, try to find the assignment
+ * that results in a "nicer" split according to some heuristics (see below).
+ *
+ * The single argument `factor` specifies both factors, as `f0 = factor` and `f1 = num_chunks / factor`.
+ *
+ * @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
+ * (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
+ */
+std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
+    assert(num_chunks % factor == 0);
+    const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
+    const size_t f0 = factor;
+    const size_t f1 = num_chunks / factor;
+
+    // Decide in which direction to split by first checking which
+    // factor assignment produces more chunks under the given constraints.
+    const std::array<size_t, 2> split_0_1 = {std::min(f0, max_chunks[0]), std::min(f1, max_chunks[1])};
+    const std::array<size_t, 2> split_1_0 = {std::min(f1, max_chunks[0]), std::min(f0, max_chunks[1])};
+    const auto count0 = split_0_1[0] * split_0_1[1];
+    const auto count1 = split_1_0[0] * split_1_0[1];
+
+    if(count0 > count1) { return split_0_1; }
+    if(count0 < count1) { return split_1_0; }
+
+    // If we're tied for the number of chunks we can create, try some heuristics to decide.
+
+    // If domain is square(-ish), prefer splitting along slower dimension.
+    // (These bounds have been chosen arbitrarily!)
+    const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
+    if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }
+
+    // For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
+    const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
+    const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
+    return circ0 < circ1 ? split_0_1 : split_1_0;
+
+    // TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
+    // i.e., how balanced the workload is.
+}
+
+} // namespace
+
+namespace celerity::detail {
+
+std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
+#ifndef NDEBUG
+    assert(num_chunks > 0);
+    for(int d = 0; d < 3; ++d) {
+        assert(granularity[d] > 0);
+        assert(full_chunk.range[d] % granularity[d] == 0);
+    }
+#endif
+
+    // Due to split granularity requirements or if num_workers > global_size[0],
+    // we may not be able to create the requested number of chunks.
+    const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
+    const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);
+
+    std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
+    for(auto i = 0u; i < num_large_chunks[0]; ++i) {
+        result[i].range[0] = large_chunk_size[0];
+        result[i].offset[0] += i * large_chunk_size[0];
+    }
+    for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
+        result[i].range[0] = small_chunk_size[0];
+        result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
+    }
+
+#ifndef NDEBUG
+    sanity_check_split(full_chunk, result);
+#endif
+
+    return result;
+}
+
+// TODO: Make the split dimensions configurable for 3D chunks?
+std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
+#ifndef NDEBUG
+    assert(num_chunks > 0);
+    for(int d = 0; d < 3; ++d) {
+        assert(granularity[d] > 0);
+        assert(full_chunk.range[d] % granularity[d] == 0);
+    }
+#endif
+
+    // Factorize num_chunks
+    // We start out with an initial guess of `factor = floor(sqrt(num_chunks))` (the other one is implicitly given by `num_chunks / factor`),
+    // and work our way down, keeping track of the best factorization we've found so far, until we find a factorization that produces
+    // the requested number of chunks, or until we reach (1, num_chunks), i.e., a 1D split.
+    size_t factor = std::floor(std::sqrt(num_chunks));
+    std::array<size_t, 2> best_chunk_counts = {0, 0};
+    while(factor >= 1) {
+        while(factor > 1 && num_chunks % factor != 0) {
+            factor--;
+        }
+        // The returned counts are at most (factor, num_chunks / factor), however may be less if constrained by the split granularity.
+        const auto chunk_counts = assign_split_factors_2d(full_chunk, granularity, factor, num_chunks);
+        if(chunk_counts[0] * chunk_counts[1] > best_chunk_counts[0] * best_chunk_counts[1]) { best_chunk_counts = chunk_counts; }
+        if(chunk_counts[0] * chunk_counts[1] == num_chunks) { break; }
+        factor--;
+    }
+    const auto actual_num_chunks = best_chunk_counts;
+    const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);
+
+    std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
+    id<3> offset = full_chunk.offset;
+
+    for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
+        range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
+        for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
+            chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
+            auto& chnk = result[j * actual_num_chunks[1] + i];
+            chnk.offset = offset;
+            chnk.range[0] = chunk_size[0];
+            chnk.range[1] = chunk_size[1];
+            offset[1] += chunk_size[1];
+        }
+        offset[0] += chunk_size[0];
+        offset[1] = full_chunk.offset[1];
+    }
+
+#ifndef NDEBUG
+    sanity_check_split(full_chunk, result);
+#endif
+
+    return result;
+}
+} // namespace celerity::detail
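
To make the factor search concrete: for num_chunks = 4 the initial guess factor = floor(sqrt(4)) = 2 already divides 4, and on a 128x128 domain the resulting 2x2 grid produces the requested number of chunks, so every chunk is 64x64 (exactly what the new granularity test below checks). Hand-tracing the code for num_chunks = 6 on the same domain suggests factors (2, 3); since the domain is square, the squareishness heuristic puts the larger factor on the slower dimension, giving a 3x2 grid with row heights 43, 43, and 42. A minimal sketch of the confirmed 4-chunk case, reusing the assumed `chunk<3>` construction from the code above:

#include <cassert>
#include "split.h"

using namespace celerity;
using namespace celerity::detail;

int main() {
    const chunk<3> full{id<3>{0, 0, 0}, range<3>{128, 128, 1}, range<3>{128, 128, 1}};
    // Request 4 chunks: the factor search settles on a 2x2 grid of 64x64 chunks.
    const auto chunks = split_2d(full, range<3>{1, 1, 1}, 4);
    assert(chunks.size() == 4);
    for(const auto& c : chunks) {
        assert((c.range == range<3>{64, 64, 1}));
    }
    return 0;
}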

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ set(TEST_TARGETS
   runtime_tests
   runtime_deprecation_tests
   sycl_tests
+  split_tests
   task_graph_tests
   task_ring_buffer_tests
   test_utils_tests

test/graph_gen_granularity_tests.cc

Lines changed: 10 additions & 0 deletions
@@ -74,6 +74,16 @@ TEST_CASE("distributed_graph_generator respects split constraints", "[distribute
 	CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_b).get_raw(1)[0])->get_execution_range().range == range<3>{96, 1, 1});
 }
 
+TEST_CASE("distributed_graph_generator creates 2-dimensional chunks when providing the split_2d hint", "[distributed_graph_generator][split][task-hints]") {
+    const size_t num_nodes = 4;
+    dist_cdag_test_context dctx(num_nodes);
+    const auto tid_a = dctx.device_compute<class UKN(task)>(range<2>{128, 128}).hint(experimental::hints::split_2d{}).submit();
+    REQUIRE(dctx.query(tid_a).count() == 4);
+    for(node_id nid = 0; nid < 4; ++nid) {
+        CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_a).get_raw(nid)[0])->get_execution_range().range == range<3>{64, 64, 1});
+    }
+}
+
 template <int Dims>
 class simple_task;
 
test/hint_tests.cc

Lines changed: 17 additions & 0 deletions
@@ -55,3 +55,20 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "hints can ensure combinations wit
         CHECK_THROWS_WITH(experimental::hint(cgh, my_hint{1336}), "not leet enough");
     });
 }
+
+TEST_CASE_METHOD(test_utils::runtime_fixture, "split_1d and split_2d hints cannot be combined", "[task-hints]") {
+    celerity::runtime::init(nullptr, nullptr);
+    auto& tm = detail::runtime::get_instance().get_task_manager();
+    SECTION("1d then 2d") {
+        test_utils::add_compute_task<class UKN(hint_task)>(tm, [&](handler& cgh) {
+            CHECK_NOTHROW(experimental::hint(cgh, experimental::hints::split_1d{}));
+            CHECK_THROWS_WITH(experimental::hint(cgh, experimental::hints::split_2d{}), "Cannot combine split_1d and split_2d hints");
+        });
+    }
+    SECTION("2d then 1d") {
+        test_utils::add_compute_task<class UKN(hint_task)>(tm, [&](handler& cgh) {
+            CHECK_NOTHROW(experimental::hint(cgh, experimental::hints::split_2d{}));
+            CHECK_THROWS_WITH(experimental::hint(cgh, experimental::hints::split_1d{}), "Cannot combine split_1d and split_2d hints");
+        });
+    }
+}
