-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[Offload] Add olLinkProgram
#148648
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/RossBrunton/link1
Are you sure you want to change the base?
[Offload] Add olLinkProgram
#148648
Conversation
A version of `olCreateProgram` that inputs many bitcode files and links them together before loading them.
@llvm/pr-subscribers-offload Author: Ross Brunton (RossBrunton) ChangesA version of Full diff: https://github.com/llvm/llvm-project/pull/148648.diff 11 Files Affected:
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index 0476fa1f7c27a..3dae37f288ff7 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -25,6 +25,34 @@ def : Function {
let returns = [];
}
+def : Struct {
+ let name = "ol_program_link_buffer_t";
+ let desc = "An image to link with `olLinkProgram`.";
+ let members = [
+ StructMember<"void *", "Address", "base address of memory image">,
+ StructMember<"size_t", "Size", "size in bytes of memory image">,
+ ];
+}
+
+def : Function {
+ let name = "olLinkProgram";
+ let desc = "Compile and link multiple bitcode images into a single binary.";
+ let details = [
+ "No caching is performed; multiple calls to `olLinkProgram` with the same images will result in multiple linking operations",
+ ];
+ let params = [
+ Param<"ol_device_handle_t", "Device", "handle of the device to link for", PARAM_IN>,
+ Param<"ol_program_link_buffer_t *", "Images", "a pointer to an array of `ImagesSize` entries, one for each image to link", PARAM_IN>,
+ Param<"size_t", "ImagesSize", "the number of elements in `Images`", PARAM_IN>,
+ Param<"ol_program_handle_t*", "Program", "output handle for the created program", PARAM_OUT>
+ ];
+ let returns = [
+ Return<"OL_ERRC_INVALID_SIZE", ["`ImagesSize == 0`"]>,
+ Return<"OL_ERRC_INVALID_BINARY", ["Any image is not in the bitcode format"]>,
+ Return<"OL_ERRC_UNSUPPORTED", ["Linking is not supported for this device and `ImagesSize` > 1"]>,
+ ];
+}
+
def : Function {
let name = "olDestroyProgram";
let desc = "Destroy the program and free all underlying resources.";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 17a2b00cb7140..14af015460c8c 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -650,6 +650,39 @@ Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
return Error::success();
}
+Error olLinkProgram_impl(ol_device_impl_t *Device,
+ ol_program_link_buffer_t *Images, size_t ImagesSize,
+ ol_program_handle_t *Program) {
+ std::vector<__tgt_device_image> DevImages;
+ for (size_t I = 0; I < ImagesSize; I++) {
+ auto &ProgData = Images[I];
+ DevImages.push_back({ProgData.Address,
+ utils::advancePtr(ProgData.Address, ProgData.Size),
+ nullptr, nullptr});
+ }
+
+ auto LinkResult =
+ Device->Device->jitLinkBinary(Device->Device->Plugin, DevImages);
+ if (!LinkResult)
+ return LinkResult.takeError();
+
+ ol_program_handle_t Prog =
+ new ol_program_impl_t(nullptr, nullptr, *LinkResult);
+
+ auto Res =
+ Device->Device->loadBinary(Device->Device->Plugin, &Prog->DeviceImage);
+ if (!Res) {
+ delete Prog;
+ return Res.takeError();
+ }
+ assert(*Res != nullptr && "loadBinary returned nullptr");
+
+ Prog->Image = *Res;
+ *Program = Prog;
+
+ return Error::success();
+}
+
Error olDestroyProgram_impl(ol_program_handle_t Program) {
auto &Device = Program->Image->getDevice();
if (auto Err = Device.unloadBinary(Program->Image))
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
index 1d6280a0af141..08b82c4aefb8d 100644
--- a/offload/plugins-nextgen/common/include/JIT.h
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -55,6 +55,10 @@ struct JITEngine {
process(const __tgt_device_image &Image,
target::plugin::GenericDeviceTy &Device);
+ /// Link and compile multiple bitcode images into a single binary
+ Expected<__tgt_device_image> link(std::vector<__tgt_device_image> &Images,
+ target::plugin::GenericDeviceTy &Device);
+
private:
/// Compile the bitcode image \p Image and generate the binary image that can
/// be loaded to the target device of the triple \p Triple architecture \p
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 7824257d28e1f..79e021cc64f3b 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -749,6 +749,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Load the binary image into the device and return the target table.
Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
const __tgt_device_image *TgtImage);
+ /// Link and compile multiple bitcode images into a single image.
+ Expected<__tgt_device_image>
+ jitLinkBinary(GenericPluginTy &Plugin,
+ std::vector<__tgt_device_image> InputImages);
virtual Expected<DeviceImageTy *>
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 835dcc0da2ec9..2cf6ddbfdff0b 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -327,3 +327,44 @@ JITEngine::process(const __tgt_device_image &Image,
return &Image;
}
+
+Expected<__tgt_device_image>
+JITEngine::link(std::vector<__tgt_device_image> &Images,
+ target::plugin::GenericDeviceTy &Device) {
+ const std::string &ComputeUnitKind = Device.getComputeUnitKind();
+ ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind];
+
+ PostProcessingFn PostProcessing =
+ [&Device](llvm::SmallVector<std::unique_ptr<MemoryBuffer>> &&MB)
+ -> Expected<std::unique_ptr<MemoryBuffer>> {
+ return Device.doJITPostProcessing(std::move(MB));
+ };
+
+ std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex);
+
+ llvm::SmallVector<std::unique_ptr<MemoryBuffer>> Buffers;
+ size_t Index = 0;
+ for (auto &I : Images) {
+ if (!isImageBitcode(I))
+ return error::createOffloadError(
+ error::ErrorCode::INVALID_BINARY,
+ "binary %i provided to link operation is not bitcode", Index);
+ Index++;
+
+ auto ObjMBOrErr = getOrCreateObjFile(I, CUI.Context, ComputeUnitKind);
+ if (!ObjMBOrErr)
+ return ObjMBOrErr.takeError();
+ Buffers.push_back(std::move(*ObjMBOrErr));
+ }
+
+ auto ImageMBOrErr = PostProcessing(std::move(Buffers));
+ if (!ImageMBOrErr)
+ return ImageMBOrErr.takeError();
+
+ auto &ImageMB = CUI.JITImages.emplace_back(std::move(*ImageMBOrErr));
+ __tgt_device_image JITedImage{};
+ JITedImage.ImageStart = const_cast<char *>(ImageMB->getBufferStart());
+ JITedImage.ImageEnd = const_cast<char *>(ImageMB->getBufferEnd());
+
+ return JITedImage;
+}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..9e2234dcc148b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -903,6 +903,13 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
return deinitImpl();
}
+
+Expected<__tgt_device_image>
+GenericDeviceTy::jitLinkBinary(GenericPluginTy &Plugin,
+ std::vector<__tgt_device_image> InputImages) {
+ return Plugin.getJIT().link(InputImages, *this);
+}
+
Expected<DeviceImageTy *>
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
const __tgt_device_image *InputTgtImage) {
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index d76338612210d..e13ded4f8b1aa 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -32,7 +32,8 @@ add_offload_unittest("platform"
add_offload_unittest("program"
program/olCreateProgram.cpp
- program/olDestroyProgram.cpp)
+ program/olDestroyProgram.cpp
+ program/olLinkProgram.cpp)
add_offload_unittest("queue"
queue/olCreateQueue.cpp
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 11c8ccbd6c7c5..c3e07724086fe 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -8,6 +8,8 @@ add_offload_test_device_code(localmem_static.c localmem_static)
add_offload_test_device_code(global.c global)
add_offload_test_device_code(global_ctor.c global_ctor)
add_offload_test_device_code(global_dtor.c global_dtor)
+add_offload_test_device_code(link_a.c link_a)
+add_offload_test_device_code(link_b.c link_b)
add_custom_target(offload_device_binaries DEPENDS
foo.bin
@@ -19,5 +21,7 @@ add_custom_target(offload_device_binaries DEPENDS
global.bin
global_ctor.bin
global_dtor.bin
+ link_a.bin
+ link_b.bin
)
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/link_a.c b/offload/unittests/OffloadAPI/device_code/link_a.c
new file mode 100644
index 0000000000000..7feb92189c018
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/link_a.c
@@ -0,0 +1,11 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+uint32_t global;
+
+extern uint32_t funky();
+
+__gpu_kernel void link_a(uint32_t *out) {
+ out[0] = funky();
+ out[1] = global;
+}
diff --git a/offload/unittests/OffloadAPI/device_code/link_b.c b/offload/unittests/OffloadAPI/device_code/link_b.c
new file mode 100644
index 0000000000000..82f41fd8a0218
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/link_b.c
@@ -0,0 +1,10 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern uint32_t global[64];
+
+[[gnu::visibility("default")]]
+uint32_t funky() {
+ global[0] = 100;
+ return 200;
+}
diff --git a/offload/unittests/OffloadAPI/program/olLinkProgram.cpp b/offload/unittests/OffloadAPI/program/olLinkProgram.cpp
new file mode 100644
index 0000000000000..122d0156e6e0c
--- /dev/null
+++ b/offload/unittests/OffloadAPI/program/olLinkProgram.cpp
@@ -0,0 +1,99 @@
+//===------- Offload API tests - olCreateProgram --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olLinkProgramTest = OffloadQueueTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLinkProgramTest);
+
+TEST_P(olLinkProgramTest, SuccessSingle) {
+
+ std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+ ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
+
+ ol_program_link_buffer_t Buffers[1] = {
+ {const_cast<char *>(DeviceBin->getBufferStart()),
+ DeviceBin->getBufferSize()},
+ };
+
+ ol_program_handle_t Program;
+ ASSERT_SUCCESS(olLinkProgram(Device, Buffers, 1, &Program));
+ ASSERT_NE(Program, nullptr);
+
+ ASSERT_SUCCESS(olDestroyProgram(Program));
+}
+
+TEST_P(olLinkProgramTest, SuccessBuild) {
+ std::unique_ptr<llvm::MemoryBuffer> ABin;
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary("link_a", Device, ABin));
+ std::unique_ptr<llvm::MemoryBuffer> BBin;
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary("link_b", Device, BBin));
+
+ ol_program_link_buffer_t Buffers[2] = {
+ {const_cast<char *>(ABin->getBufferStart()), ABin->getBufferSize()},
+ {const_cast<char *>(BBin->getBufferStart()), BBin->getBufferSize()},
+ };
+
+ ol_program_handle_t Program;
+ auto LinkResult = olLinkProgram(Device, Buffers, 2, &Program);
+ if (LinkResult && LinkResult->Code == OL_ERRC_UNSUPPORTED)
+ GTEST_SKIP() << "Linking unsupported: " << LinkResult->Details;
+ ASSERT_SUCCESS(LinkResult);
+ ASSERT_NE(Program, nullptr);
+
+ ol_symbol_handle_t Kernel;
+ ASSERT_SUCCESS(
+ olGetSymbol(Program, "link_a", OL_SYMBOL_KIND_KERNEL, &Kernel));
+
+ void *Mem;
+ ASSERT_SUCCESS(
+ olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 2 * sizeof(uint32_t), &Mem));
+ struct {
+ void *Mem;
+ } Args{Mem};
+ ol_kernel_launch_size_args_t LaunchArgs{};
+ LaunchArgs.GroupSize = {1, 1, 1};
+ LaunchArgs.NumGroups = {1, 1, 1};
+ LaunchArgs.Dimensions = 1;
+
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olWaitQueue(Queue));
+
+ uint32_t *Data = (uint32_t *)Mem;
+ ASSERT_EQ(Data[0], 200);
+ ASSERT_EQ(Data[1], 100);
+
+ ASSERT_SUCCESS(olMemFree(Mem));
+ ASSERT_SUCCESS(olDestroyProgram(Program));
+}
+
+TEST_P(olLinkProgramTest, InvalidNotBitcode) {
+ char FakeElf[] =
+ "\177ELF0000000000000000000000000000000000000000000000000000"
+ "00000000000000000000000000000000000000000000000000000000000";
+
+ ol_program_link_buffer_t Buffers[1] = {
+ {FakeElf, sizeof(FakeElf)},
+ };
+
+ ol_program_handle_t Program;
+ ASSERT_ERROR(OL_ERRC_INVALID_BINARY,
+ olLinkProgram(Device, Buffers, 1, &Program));
+}
+
+TEST_P(olLinkProgramTest, InvalidSize) {
+ ol_program_link_buffer_t Buffers[0] = {};
+
+ ol_program_handle_t Program;
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olLinkProgram(Device, Buffers, 0, &Program));
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm kind of iffy here, I'm not sure if it's the runtime's job to be a linker. The createProgram
interface already should handle 'JIT' compilation by detecting magics bits for LLVM-IR, ELF, PTX, or SPIR-V or whatever, but it's expected that the user provides a 'program' to run. What's the use-case for having the offloading runtime doing the compiler's job?
@jhuber6 I'm looking at implementing a version of urProgramLink for liboffload, which is used by SYCL. More generally though, linking is very backend dependant, and liboffload already has logic for invoking the appropriate tool (lld/ptxas/etc.). I think it's worthwhile exposing this interface to offload consumers so they don't have to fiddle around with implementing their own wrappers around the linker. In theory, liboffload could even set optimisation flags that it knows it can consume as well. Implementers can also dynamically compute what kernels they want to use at runtime and link those into a single module rather than unconditionally loading a huge image to the device. If users do want to compile offline though, why not just have the hypothetical |
My gut reaction is that if people need a compiler they could just invoke |
@jhuber6 That would mean that any program that wants to do dynamic linking would need to ship a full clang install alongside IMO, since this code already exists in liboffload, it makes sense to expose that to the user as a convenience. |
This seems like something we should discuss at one of the weekly meetings, since it has pretty large implications for what this library is supposed to do. |
A version of
olCreateProgram
that inputs many bitcode files and linksthem together before loading them.