Skip to content

[UR][SYCL] Implement USM prefetch from device to host in SYCL runtime and UR #19437

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 32 commits into
base: sycl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9ae0c55
Initial non-ABI breaking impl in runtime
ianayl Jul 2, 2025
59ae777
Initial UR implementation, + opencl adapter
ianayl Jul 3, 2025
6e6fe08
Add migration flags to memorymanager
ianayl Jul 7, 2025
fb827be
Add CUDA adapter impl
ianayl Jul 7, 2025
c42c233
More preliminary adapter support
ianayl Jul 9, 2025
a5ed168
Merge branch 'sycl' of https://github.com/intel/llvm into 2way-prefet…
ianayl Jul 14, 2025
41d9a6f
Update USM testing
ianayl Jul 14, 2025
742a636
Revise UR impl to not error, add graph testing
ianayl Jul 17, 2025
c160e42
Merge branch 'sycl' into 2way-prefetch-2
ianayl Jul 17, 2025
6654b6e
Fix bug
ianayl Jul 17, 2025
7141dea
Merge branch 'sycl' of https://github.com/intel/llvm into 2way-prefet…
ianayl Jul 18, 2025
96059fc
Fix bug in enqueue function header
ianayl Jul 18, 2025
b427472
update ur testing
ianayl Jul 18, 2025
4f09c40
fix build issue in new command buffer ur test
ianayl Jul 21, 2025
e3b9e9e
Fix bug
ianayl Jul 21, 2025
ba1f9f6
Fix memory leak
ianayl Jul 21, 2025
294702c
Disable opencl adapter
ianayl Jul 22, 2025
6dbf10a
Disable opencl enqueue function grpah tests
ianayl Jul 22, 2025
a2263f6
ammend test
ianayl Jul 22, 2025
1d16e60
Add breaking changes preview hotpath
ianayl Jul 23, 2025
64bec80
formatting
ianayl Jul 23, 2025
c3cfc1f
Merge branch 'sycl' into 2way-prefetch-2
ianayl Jul 23, 2025
0a45ea3
reenable queue test
ianayl Jul 23, 2025
0f4ed1f
Add unittesting to ensure the runtime calls the UR with the right args
ianayl Jul 24, 2025
236f70b
clang-format
ianayl Jul 24, 2025
6de51cc
Remove overcomplicated prefetch alternatives
ianayl Jul 25, 2025
16d40d9
Fix testing
ianayl Jul 25, 2025
160af9d
Apply suggestions for graph tests
ianayl Jul 25, 2025
ddb53e5
use new syntax for malloc_shared
ianayl Jul 25, 2025
8de6a9a
clang-format
ianayl Jul 25, 2025
e601af9
remove import from handler
ianayl Jul 25, 2025
ebc89db
Merge branch 'sycl' of https://github.com/intel/llvm into 2way-prefet…
ianayl Jul 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <sycl/detail/common.hpp>
#include <sycl/event.hpp>
#include <sycl/ext/oneapi/experimental/enqueue_types.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/handler.hpp>
Expand Down Expand Up @@ -369,15 +370,27 @@ void fill(sycl::queue Q, T *Ptr, const T &Pattern, size_t Count,
CodeLoc);
}

inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes) {
CGH.prefetch(Ptr, NumBytes);
inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes,
prefetch_type Type = prefetch_type::device) {
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
CGH.prefetch(Ptr, NumBytes, Type);
#else
if (Type == prefetch_type::device) {
// Incase an older libsycl.so is used, don't call prefetch function overload
// with new prefetch_type parameter:
CGH.prefetch(Ptr, NumBytes);
} else {
CGH.prefetch(Ptr, NumBytes, Type);
}
#endif
}

inline void prefetch(queue Q, void *Ptr, size_t NumBytes,
prefetch_type Type = prefetch_type::device,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
submit(
std::move(Q), [&](handler &CGH) { prefetch(CGH, Ptr, NumBytes); },
std::move(Q), [&](handler &CGH) { prefetch(CGH, Ptr, NumBytes, Type); },
CodeLoc);
}

Expand Down
33 changes: 33 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//==--------------- enqueue_types.hpp ---- SYCL enqueue types --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <string>

namespace sycl {
inline namespace _V1 {
namespace ext::oneapi::experimental {

/// @brief Indicates the destination device for USM data to be prefetched to.
enum class prefetch_type { device, host };

inline std::string prefetchTypeToString(prefetch_type value) {
switch (value) {
case sycl::ext::oneapi::experimental::prefetch_type::device:
return "prefetch_type::device";
case sycl::ext::oneapi::experimental::prefetch_type::host:
return "prefetch_type::host";
default:
return "prefetch_type::unknown";
}
}

} // namespace ext::oneapi::experimental
} // namespace _V1
} // namespace sycl
12 changes: 12 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ namespace ext ::oneapi ::experimental {
template <typename, typename> class work_group_memory;
template <typename, typename> class dynamic_work_group_memory;
struct image_descriptor;
enum class prefetch_type;

__SYCL_EXPORT void async_free(sycl::handler &h, void *ptr);
__SYCL_EXPORT void *async_malloc(sycl::handler &h, sycl::usm::alloc kind,
size_t size);
Expand Down Expand Up @@ -2627,6 +2629,16 @@ class __SYCL_EXPORT handler {
/// \param Count is a number of bytes to be prefetched.
void prefetch(const void *Ptr, size_t Count);

/// Provides hints to the runtime library that data should be made available
/// on a device earlier than Unified Shared Memory would normally require it
/// to be available.
///
/// \param Ptr is a USM pointer to the memory to be prefetched to the device.
/// \param Count is a number of bytes to be prefetched.
/// \param Type is type of prefetch, i.e. fetch to device or fetch to host.
void prefetch(const void *Ptr, size_t Count,
ext::oneapi::experimental::prefetch_type Type);

/// Provides additional information to the underlying runtime about how
/// different allocations are used.
///
Expand Down
11 changes: 8 additions & 3 deletions sycl/source/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,19 @@ class CGFillUSM : public CG {
class CGPrefetchUSM : public CG {
void *MDst;
size_t MLength;
ext::oneapi::experimental::prefetch_type MPrefetchType;

public:
CGPrefetchUSM(void *DstPtr, size_t Length, CG::StorageInitHelper CGData,
ext::oneapi::experimental::prefetch_type PrefetchType,
detail::code_location loc = {})
: CG(CGType::PrefetchUSM, std::move(CGData), std::move(loc)),
MDst(DstPtr), MLength(Length) {}
void *getDst() { return MDst; }
size_t getLength() { return MLength; }
MDst(DstPtr), MLength(Length), MPrefetchType(PrefetchType) {}
void *getDst() const { return MDst; }
size_t getLength() const { return MLength; }
ext::oneapi::experimental::prefetch_type getPrefetchType() const {
return MPrefetchType;
}
};

/// "Advise USM" command group class.
Expand Down
8 changes: 6 additions & 2 deletions sycl/source/detail/graph/node_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
#include <sycl/detail/cg_types.hpp> // for CGType
#include <sycl/detail/kernel_desc.hpp> // for kernel_param_kind_t

#include <sycl/ext/oneapi/experimental/graph/node.hpp> // for node
#include <sycl/ext/oneapi/experimental/enqueue_types.hpp> // for prefetchType
#include <sycl/ext/oneapi/experimental/graph/node.hpp> // for node

#include <cstring>
#include <fstream>
Expand Down Expand Up @@ -655,7 +656,10 @@ class node_impl : public std::enable_shared_from_this<node_impl> {
sycl::detail::CGPrefetchUSM *Prefetch =
static_cast<sycl::detail::CGPrefetchUSM *>(MCommandGroup.get());
Stream << "Dst: " << Prefetch->getDst()
<< " Length: " << Prefetch->getLength() << "\\n";
<< " Length: " << Prefetch->getLength() << " PrefetchType: "
<< sycl::ext::oneapi::experimental::prefetchTypeToString(
Prefetch->getPrefetchType())
<< "\\n";
}
break;
case sycl::detail::CGType::AdviseUSM:
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/detail/handler_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <detail/cg.hpp>
#include <detail/kernel_bundle_impl.hpp>
#include <memory>
#include <sycl/ext/oneapi/experimental/enqueue_types.hpp>

namespace sycl {
inline namespace _V1 {
Expand Down Expand Up @@ -91,6 +92,10 @@ class handler_impl {
/// property.
bool MIsDeviceImageScoped = false;

/// Direction of USM prefetch / destination device.
sycl::ext::oneapi::experimental::prefetch_type MPrefetchType =
sycl::ext::oneapi::experimental::prefetch_type::device;

// Program scope pipe information.

// Pipe name that uniquely identifies a pipe.
Expand Down
28 changes: 19 additions & 9 deletions sycl/source/detail/memory_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -922,13 +922,18 @@ void MemoryManager::fill_usm(void *Mem, queue_impl &Queue, size_t Length,
DepEvents.size(), DepEvents.data(), OutEvent);
}

void MemoryManager::prefetch_usm(void *Mem, queue_impl &Queue, size_t Length,
std::vector<ur_event_handle_t> DepEvents,
ur_event_handle_t *OutEvent) {
void MemoryManager::prefetch_usm(
void *Mem, queue_impl &Queue, size_t Length,
std::vector<ur_event_handle_t> DepEvents, ur_event_handle_t *OutEvent,
sycl::ext::oneapi::experimental::prefetch_type Dest) {
adapter_impl &Adapter = Queue.getAdapter();
Adapter.call<UrApiKind::urEnqueueUSMPrefetch>(Queue.getHandleRef(), Mem,
Length, 0u, DepEvents.size(),
DepEvents.data(), OutEvent);
ur_usm_migration_flags_t MigrationFlag =
(Dest == sycl::ext::oneapi::experimental::prefetch_type::device)
? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE
: UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST;
Adapter.call<UrApiKind::urEnqueueUSMPrefetch>(
Queue.getHandleRef(), Mem, Length, MigrationFlag, DepEvents.size(),
DepEvents.data(), OutEvent);
}

void MemoryManager::advise_usm(const void *Mem, queue_impl &Queue,
Expand Down Expand Up @@ -1539,11 +1544,16 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer(
sycl::detail::context_impl *Context,
ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length,
std::vector<ur_exp_command_buffer_sync_point_t> Deps,
ur_exp_command_buffer_sync_point_t *OutSyncPoint) {
ur_exp_command_buffer_sync_point_t *OutSyncPoint,
sycl::ext::oneapi::experimental::prefetch_type Dest) {
adapter_impl &Adapter = Context->getAdapter();
ur_usm_migration_flags_t MigrationFlag =
(Dest == sycl::ext::oneapi::experimental::prefetch_type::device)
? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE
: UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST;
Adapter.call<UrApiKind::urCommandBufferAppendUSMPrefetchExp>(
CommandBuffer, Mem, Length, ur_usm_migration_flags_t(0), Deps.size(),
Deps.data(), 0u, nullptr, OutSyncPoint, nullptr, nullptr);
CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), Deps.data(), 0,
nullptr, OutSyncPoint, nullptr, nullptr);
}

void MemoryManager::ext_oneapi_advise_usm_cmd_buffer(
Expand Down
14 changes: 10 additions & 4 deletions sycl/source/detail/memory_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <detail/sycl_mem_obj_i.hpp>
#include <sycl/access/access.hpp>
#include <sycl/detail/export.hpp>
#include <sycl/ext/oneapi/experimental/enqueue_types.hpp> // for prefetch_type
#include <sycl/id.hpp>
#include <sycl/property_list.hpp>
#include <sycl/range.hpp>
Expand Down Expand Up @@ -146,9 +147,12 @@ class MemoryManager {
std::vector<ur_event_handle_t> DepEvents,
ur_event_handle_t *OutEvent);

static void prefetch_usm(void *Ptr, queue_impl &Queue, size_t Len,
std::vector<ur_event_handle_t> DepEvents,
ur_event_handle_t *OutEvent);
static void
prefetch_usm(void *Ptr, queue_impl &Queue, size_t Len,
std::vector<ur_event_handle_t> DepEvents,
ur_event_handle_t *OutEvent,
sycl::ext::oneapi::experimental::prefetch_type Dest =
sycl::ext::oneapi::experimental::prefetch_type::device);

static void advise_usm(const void *Ptr, queue_impl &Queue, size_t Len,
ur_usm_advice_flags_t Advice,
Expand Down Expand Up @@ -245,7 +249,9 @@ class MemoryManager {
sycl::detail::context_impl *Context,
ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length,
std::vector<ur_exp_command_buffer_sync_point_t> Deps,
ur_exp_command_buffer_sync_point_t *OutSyncPoint);
ur_exp_command_buffer_sync_point_t *OutSyncPoint,
sycl::ext::oneapi::experimental::prefetch_type Dest =
sycl::ext::oneapi::experimental::prefetch_type::device);

static void ext_oneapi_advise_usm_cmd_buffer(
sycl::detail::context_impl *Context,
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2985,7 +2985,8 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() {
if (auto Result = callMemOpHelper(
MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer,
&MQueue->getContextImpl(), MCommandBuffer, Prefetch->getDst(),
Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint);
Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint,
Prefetch->getPrefetchType());
Result != UR_RESULT_SUCCESS)
return Result;

Expand Down Expand Up @@ -3296,7 +3297,8 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
if (auto Result = callMemOpHelper(
MemoryManager::prefetch_usm, Prefetch->getDst(), *MQueue,
Prefetch->getLength(), std::move(RawEvents), Event);
Prefetch->getLength(), std::move(RawEvents), Event,
Prefetch->getPrefetchType());
Result != UR_RESULT_SUCCESS)
return Result;

Expand Down
16 changes: 14 additions & 2 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <sycl/stream.hpp>

#include <sycl/ext/oneapi/bindless_images_memory.hpp>
#include <sycl/ext/oneapi/experimental/enqueue_types.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
#include <sycl/ext/oneapi/memcpy2d.hpp>
Expand Down Expand Up @@ -719,8 +720,9 @@ event handler::finalize() {
MCodeLoc));
break;
case detail::CGType::PrefetchUSM:
CommandGroup.reset(new detail::CGPrefetchUSM(
MDstPtr, MLength, std::move(impl->CGData), MCodeLoc));
CommandGroup.reset(
new detail::CGPrefetchUSM(MDstPtr, MLength, std::move(impl->CGData),
impl->MPrefetchType, MCodeLoc));
break;
case detail::CGType::AdviseUSM:
CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice,
Expand Down Expand Up @@ -1473,6 +1475,16 @@ void handler::prefetch(const void *Ptr, size_t Count) {
throwIfActionIsCreated();
MDstPtr = const_cast<void *>(Ptr);
MLength = Count;
impl->MPrefetchType = ext::oneapi::experimental::prefetch_type::device;
setType(detail::CGType::PrefetchUSM);
}

void handler::prefetch(const void *Ptr, size_t Count,
ext::oneapi::experimental::prefetch_type Type) {
throwIfActionIsCreated();
MDstPtr = const_cast<void *>(Ptr);
MLength = Count;
impl->MPrefetchType = Type;
setType(detail::CGType::PrefetchUSM);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// REQUIRES: aspect-usm_shared_allocations
//
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
//
// UNSUPPORTED: opencl
// UNSUPPORTED-INTENDED: OpenCL currently does not support command buffers
//
// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// Tests prefetch functionality in enqueue functions

#include "../graph_common.hpp"
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>

static constexpr int N = 100;
static constexpr int Pattern = 42;

int main() {
queue Q{};

int *Src = malloc_shared<int>(N, Q);
int *Dst = malloc_shared<int>(N, Q);
for (int i = 0; i < N; i++)
Src[i] = Pattern;

{
exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}};

Graph.begin_recording(Q);

// Test submitting host-to-device prefetch
event TestH2D = exp_ext::submit_with_event(
Q, [&](handler &CGH) { exp_ext::prefetch(CGH, Src, sizeof(int) * N); });

exp_ext::submit(Q, [&](handler &CGH) {
CGH.depends_on(TestH2D);
exp_ext::parallel_for(CGH, range<1>(N),
[=](id<1> i) { Dst[i] = Src[i] * 2; });
});

Graph.end_recording();

auto GraphExec = Graph.finalize();

exp_ext::execute_graph(Q, GraphExec);
Q.wait_and_throw();
}

// Check host-to-device prefetch results
for (int i = 0; i < N; i++)
assert(check_value(i, Pattern * 2, Dst[i], "Dst"));

{
exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}};

Graph.begin_recording(Q);

// Test submitting device-to-host prefetch
event TestD2H = exp_ext::submit_with_event(Q, [&](handler &CGH) {
exp_ext::parallel_for(CGH, range<1>(N),
[=](id<1> i) { Dst[i] = Src[i] + 1; });
});

exp_ext::submit(Q, [&](handler &CGH) {
CGH.depends_on(TestD2H);
exp_ext::prefetch(CGH, Dst, sizeof(int) * N,
exp_ext::prefetch_type::host);
});

Graph.end_recording();

auto GraphExec = Graph.finalize();

exp_ext::execute_graph(Q, GraphExec);
Q.wait_and_throw();
}

// Check device-to-host prefetch results
for (int i = 0; i < N; i++)
assert(check_value(i, Pattern + 1, Dst[i], "Dst"));

free(Src, Q);
free(Dst, Q);

return 0;
}
Loading
Loading