Skip to content

Commit

Permalink
[L0] Add initial USM alloc enqueue API
Browse files Browse the repository at this point in the history
Co-authored-by: Michał Staniewski <[email protected]>
  • Loading branch information
kswiecicki and staniewzki committed Jan 14, 2025
1 parent 363bc63 commit f270330
Show file tree
Hide file tree
Showing 5 changed files with 385 additions and 95 deletions.
9 changes: 9 additions & 0 deletions source/adapters/level_zero/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,15 @@ struct ur_context_handle_t_ : _ur_object {
// Allocation-tracking proxy pools for direct allocations. No pooling used.
ur_usm_pool_handle_t_ ProxyPool;

// USM pools for async allocations.
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
AsyncDeviceMemPools;
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
AsyncSharedMemPools;
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
AsyncSharedReadOnlyMemPools;
umf::pool_unique_handle_t AsyncHostMemPool;

// Map associating pools created with urUsmPoolCreate and internal pools
std::list<ur_usm_pool_handle_t> UsmPoolHandles{};

Expand Down
2 changes: 1 addition & 1 deletion source/adapters/level_zero/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1329,7 +1329,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
Device->Platform->ContextsMutex, std::defer_lock);

if (IndirectAccessTrackingEnabled) {
// We are going to submit kernels for execution. If indirect access flag is
// We are going to submit kernels for execution. If indirect access flag is
// set for a kernel then we need to make a snapshot of existing memory
// allocations in all contexts in the platform. We need to lock the mutex
// guarding the list of contexts in the platform to prevent creation of new
Expand Down
321 changes: 227 additions & 94 deletions source/adapters/level_zero/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,124 +588,257 @@ ur_result_t urUSMReleaseExp(ur_context_handle_t Context, void *HostPtr) {
return UR_RESULT_SUCCESS;
}

/// Kind of USM allocation requested from the asynchronous allocation helpers.
/// USMAllocHelper uses it to choose which of the context's async pools serves
/// the request.
enum class USMAllocType {
  Host = 0,   ///< served from Context->AsyncHostMemPool
  Device = 1, ///< served from the per-device Context->AsyncDeviceMemPools
  Shared = 2  ///< served from the per-device Context->AsyncSharedMemPools
};

/// Allocates \p Size bytes of USM from one of the context's internal pools
/// reserved for asynchronous allocations.
///
/// \param Context context whose async pools are used and, when indirect access
///                tracking is enabled, whose MemAllocs map records the result.
/// \param Device  device whose ZeDevice handle selects the per-device pool for
///                Device/Shared allocations; also used to reach the platform.
/// \param Size    requested size in bytes.
/// \param RetMem  [out] receives the pointer returned by the pool (nullptr if
///                the pool failed to allocate). Not written when no pool is
///                found.
/// \param Type    kind of USM to allocate.
/// \return UR_RESULT_SUCCESS on success, UR_RESULT_ERROR_INVALID_VALUE when no
///         pool exists for the requested type/device, or the UMF allocation
///         error translated by umf2urResult.
static ur_result_t USMAllocHelper(ur_context_handle_t Context,
                                  ur_device_handle_t Device, size_t Size,
                                  void **RetMem, USMAllocType Type) {
  auto &Platform = Device->Platform;

  // TODO: Should alignment be passed in 'ur_exp_async_usm_alloc_properties_t'?
  uint32_t Alignment = 0;

  // Exactly one of the two locks is taken: the platform-wide contexts mutex
  // when indirect access tracking is enabled, the context mutex (shared)
  // otherwise.
  std::shared_lock<ur_shared_mutex> ContextLock(Context->Mutex,
                                                std::defer_lock);
  std::unique_lock<ur_shared_mutex> IndirectAccessTrackingLock(
      Platform->ContextsMutex, std::defer_lock);
  if (IndirectAccessTrackingEnabled) {
    IndirectAccessTrackingLock.lock();
  } else {
    ContextLock.lock();
  }

  // Pick the pool that serves this allocation type.
  umf_memory_pool_handle_t hPoolInternal = nullptr;
  switch (Type) {
  case USMAllocType::Host:
    hPoolInternal = Context->AsyncHostMemPool.get();
    break;
  case USMAllocType::Device: {
    auto It = Context->AsyncDeviceMemPools.find(Device->ZeDevice);
    if (It == Context->AsyncDeviceMemPools.end()) {
      return UR_RESULT_ERROR_INVALID_VALUE;
    }
    hPoolInternal = It->second.get();
  } break;
  case USMAllocType::Shared: {
    auto It = Context->AsyncSharedMemPools.find(Device->ZeDevice);
    if (It == Context->AsyncSharedMemPools.end()) {
      return UR_RESULT_ERROR_INVALID_VALUE;
    }
    hPoolInternal = It->second.get();
  } break;
  }

  // An empty host pool handle or an unexpected Type value leaves the pool
  // unset; report it the same way as a missing per-device pool instead of
  // handing a null pool to UMF.
  if (hPoolInternal == nullptr) {
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  *RetMem = umfPoolAlignedMalloc(hPoolInternal, Size, Alignment);
  if (*RetMem == nullptr) {
    auto umfRet = umfPoolGetLastAllocationError(hPoolInternal);
    return umf2urResult(umfRet);
  }

  if (IndirectAccessTrackingEnabled) {
    // The context reference is taken only once the allocation has succeeded,
    // so none of the error returns above can leak a reference. The contexts
    // mutex is still held here, so the retain and the bookkeeping below happen
    // under the same lock as the allocation.
    // NOTE(review): presumably the extra reference keeps the context alive
    // while the allocation is tracked - confirm against the free path. If the
    // retain itself could fail, *RetMem would have to be returned to the pool.
    UR_CALL(ur::level_zero::urContextRetain(Context));

    // Keep track of all memory allocations in the context
    Context->MemAllocs.emplace(std::piecewise_construct,
                               std::forward_as_tuple(*RetMem),
                               std::forward_as_tuple(Context));
  }

  return UR_RESULT_SUCCESS;
}

/// Shared implementation of the asynchronous USM allocation entry points.
///
/// The memory is allocated on the host side during this call (via
/// USMAllocHelper) and written to \p RetMem. The event tied to the call is
/// signaled from the queue's command list after the events in the wait list.
///
/// \param Queue               queue the allocation is enqueued on; its context
///                            and device select the pool.
/// \param Pool                currently ignored.
/// \param Size                requested size in bytes.
/// \param Properties          currently ignored.
/// \param NumEventsInWaitList number of entries in \p EventWaitList.
/// \param EventWaitList       events the signaled event depends on.
/// \param RetMem              [out] receives the allocated pointer.
/// \param OutEvent            [out][optional] receives the event; when null an
///                            internal event is created instead.
/// \param Type                kind of USM to allocate; also selects the
///                            command type recorded on the event.
/// \return UR_RESULT_SUCCESS or the first error reported by a callee.
static ur_result_t enqueueUSMAllocHelper(
    ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size,
    const ur_exp_enqueue_usm_alloc_properties_t *Properties,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
    void **RetMem, ur_event_handle_t *OutEvent, USMAllocType Type) {
  std::ignore = Pool;
  std::ignore = Properties;

  std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

  bool UseCopyEngine = false;
  _ur_ze_event_list_t TmpWaitList;
  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

  bool OkToBatch = true;
  // Get a new command list to be used on this call
  ur_command_list_ptr_t CommandList{};
  UR_CALL(Queue->Context->getAvailableCommandList(
      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
      OkToBatch, nullptr /*ForcedCmdQueue*/));

  // Use the caller's event slot when provided, otherwise an internal event.
  ur_event_handle_t InternalEvent{};
  bool IsInternal = OutEvent == nullptr;
  ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;

  ur_command_t CommandType = UR_COMMAND_FORCE_UINT32;
  switch (Type) {
  case USMAllocType::Host:
    CommandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
    break;
  case USMAllocType::Device:
    CommandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
    break;
  case USMAllocType::Shared:
    CommandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
    break;
  }
  UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                       IsInternal, false));
  ze_event_handle_t ZeEvent = (*Event)->ZeEvent;
  (*Event)->WaitList = TmpWaitList;

  // Allocate USM memory. This happens before anything is appended to the
  // command list, so a failed allocation leaves the list untouched.
  // NOTE(review): on failure the event created above is returned to the
  // caller without ever being signaled - confirm callers discard it.
  auto Ret = USMAllocHelper(Queue->Context, Queue->Device, Size, RetMem, Type);
  if (Ret) {
    return Ret;
  }

  // Make the signal below wait for the dependencies, as urEnqueueUSMFreeExp
  // does; without this the wait list is retained but never honored.
  const auto &ZeCommandList = CommandList->first;
  const auto &WaitList = (*Event)->WaitList;
  if (WaitList.Length) {
    ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
               (ZeCommandList, WaitList.Length, WaitList.ZeEventList));
  }

  // Signal that USM allocation event was finished
  ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

  UR_CALL(Queue->executeCommandList(CommandList, false, OkToBatch));

  return UR_RESULT_SUCCESS;
}

// Entry point for the experimental asynchronous USM device allocation.
// NOTE(review): this span is taken from a rendered diff without +/- markers,
// so two versions of the function are interleaved line by line: a stub
// (hQueue/pPool/size/... parameters, the std::ignore statements and the
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE return) and an implementation
// (Queue/Pool/Size/... parameters and the call to enqueueUSMAllocHelper).
// Presumably the stub lines are the removed side of the diff - confirm
// against the repository before treating this text as compilable code.
ur_result_t urEnqueueUSMDeviceAllocExp(
ur_queue_handle_t hQueue, ///< [in] handle of the queue object
ur_usm_pool_handle_t
pPool, ///< [in][optional] handle of the USM memory pool
const size_t size, ///< [in] minimum size in bytes of the USM memory object
ur_queue_handle_t Queue, ///< [in] handle of the queue object
ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
const size_t Size, ///< [in] minimum size in bytes of the USM memory object
///< to be allocated
const ur_exp_enqueue_usm_alloc_properties_t
*pProperties, ///< [in][optional] pointer to the enqueue asynchronous
///< USM allocation properties
uint32_t numEventsInWaitList, ///< [in] size of the event wait list
*Properties, ///< [in][optional] pointer to the enqueue async alloc
///< properties
uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
const ur_event_handle_t
*phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
void **ppMem, ///< [out] pointer to USM memory object
ur_event_handle_t
*phEvent ///< [out][optional] return an event object that identifies the
///< asynchronous USM device allocation
*EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
void **Mem, ///< [out] pointer to USM memory object
ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
///< identifies the async alloc
) {
std::ignore = hQueue;
std::ignore = pPool;
std::ignore = size;
std::ignore = pProperties;
std::ignore = numEventsInWaitList;
std::ignore = phEventWaitList;
std::ignore = ppMem;
std::ignore = phEvent;
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// Forwards every argument unchanged and selects the device allocation type.
return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
NumEventsInWaitList, EventWaitList, Mem,
OutEvent, USMAllocType::Device);
}

// Entry point for the experimental asynchronous USM shared allocation.
// NOTE(review): this span is taken from a rendered diff without +/- markers,
// so two versions of the function are interleaved line by line: a stub
// (hQueue/pPool/size/... parameters, the std::ignore statements and the
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE return) and an implementation
// (Queue/Pool/Size/... parameters and the call to enqueueUSMAllocHelper).
// Presumably the stub lines are the removed side of the diff - confirm
// against the repository before treating this text as compilable code.
ur_result_t urEnqueueUSMSharedAllocExp(
ur_queue_handle_t hQueue, ///< [in] handle of the queue object
ur_usm_pool_handle_t
pPool, ///< [in][optional] handle of the USM memory pool
const size_t size, ///< [in] minimum size in bytes of the USM memory object
ur_queue_handle_t Queue, ///< [in] handle of the queue object
ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
const size_t Size, ///< [in] minimum size in bytes of the USM memory object
///< to be allocated
const ur_exp_enqueue_usm_alloc_properties_t
*pProperties, ///< [in][optional] pointer to the enqueue asynchronous
///< USM allocation properties
uint32_t numEventsInWaitList, ///< [in] size of the event wait list
*Properties, ///< [in][optional] pointer to the enqueue async alloc
///< properties
uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
const ur_event_handle_t
*phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
void **ppMem, ///< [out] pointer to USM memory object
ur_event_handle_t
*phEvent ///< [out][optional] return an event object that identifies the
///< asynchronous USM shared allocation
*EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
void **Mem, ///< [out] pointer to USM memory object
ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
///< identifies the async alloc
) {
std::ignore = hQueue;
std::ignore = pPool;
std::ignore = size;
std::ignore = pProperties;
std::ignore = numEventsInWaitList;
std::ignore = phEventWaitList;
std::ignore = ppMem;
std::ignore = phEvent;
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// Forwards every argument unchanged and selects the shared allocation type.
return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
NumEventsInWaitList, EventWaitList, Mem,
OutEvent, USMAllocType::Shared);
}

// Entry point for the experimental asynchronous USM host allocation.
// NOTE(review): this span is taken from a rendered diff without +/- markers,
// so two versions of the function are interleaved line by line: a stub
// (hQueue/pPool/size/... parameters, the std::ignore statements and the
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE return) and an implementation
// (Queue/Pool/Size/... parameters and the call to enqueueUSMAllocHelper).
// Presumably the stub lines are the removed side of the diff - confirm
// against the repository before treating this text as compilable code.
ur_result_t urEnqueueUSMHostAllocExp(
ur_queue_handle_t hQueue, ///< [in] handle of the queue object
ur_usm_pool_handle_t
pPool, ///< [in][optional] handle of the USM memory pool
const size_t size, ///< [in] minimum size in bytes of the USM memory object
ur_queue_handle_t Queue, ///< [in] handle of the queue object
ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
const size_t Size, ///< [in] minimum size in bytes of the USM memory object
///< to be allocated
const ur_exp_enqueue_usm_alloc_properties_t
*pProperties, ///< [in][optional] pointer to the enqueue asynchronous
///< USM allocation properties
uint32_t numEventsInWaitList, ///< [in] size of the event wait list
*Properties, ///< [in][optional] pointer to the enqueue async alloc
///< properties
uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
const ur_event_handle_t
*phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
void **ppMem, ///< [out] pointer to USM memory object
ur_event_handle_t
*phEvent ///< [out][optional] return an event object that identifies the
///< asynchronous USM host allocation
*EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
void **Mem, ///< [out] pointer to USM memory object
ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
///< identifies the async alloc
) {
std::ignore = hQueue;
std::ignore = pPool;
std::ignore = size;
std::ignore = pProperties;
std::ignore = numEventsInWaitList;
std::ignore = phEventWaitList;
std::ignore = ppMem;
std::ignore = phEvent;
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// Forwards every argument unchanged and selects the host allocation type.
return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
NumEventsInWaitList, EventWaitList, Mem,
OutEvent, USMAllocType::Host);
}

// Entry point for the experimental asynchronous USM free: waits for the
// events in the wait list, frees Mem through USMFreeHelper and then signals
// the event associated with the call.
// NOTE(review): this span is taken from a rendered diff without +/- markers,
// so two versions of the function are interleaved line by line: a stub
// (hQueue/pPool/pMem/... parameters, the std::ignore statements on those
// names and the UR_RESULT_ERROR_UNSUPPORTED_FEATURE return) and an
// implementation (Queue/Pool/Mem/... parameters and everything from
// `std::ignore = Pool;` onwards). Presumably the stub lines are the removed
// side of the diff - confirm against the repository before treating this
// text as compilable code.
ur_result_t urEnqueueUSMFreeExp(
ur_queue_handle_t hQueue, ///< [in] handle of the queue object
ur_usm_pool_handle_t
pPool, ///< [in][optional] handle of the USM memory pool
void *pMem, ///< [in] pointer to USM memory object
uint32_t numEventsInWaitList, ///< [in] size of the event wait list
ur_queue_handle_t Queue, ///< [in] handle of the queue object
ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
void *Mem, ///< [in] pointer to USM memory object
uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
const ur_event_handle_t
*phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
ur_event_handle_t *phEvent ///< [out][optional] return an event object that
///< identifies the asynchronous USM deallocation
*EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
///< pointer to a list of events that must be complete
///< before the kernel execution. If nullptr, the
///< numEventsInWaitList must be 0, indicating no wait
///< events.
ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
///< identifies the async free
) {
std::ignore = hQueue;
std::ignore = pPool;
std::ignore = pMem;
std::ignore = numEventsInWaitList;
std::ignore = phEventWaitList;
std::ignore = phEvent;
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// The pool argument is not used by this implementation.
std::ignore = Pool;

std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

bool UseCopyEngine = false;
_ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

// Batching is disabled for this call (the alloc helper passes true instead).
bool OkToBatch = false;
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
OkToBatch, nullptr /*ForcedCmdQueue*/));

// Use the caller's event slot when provided, otherwise an internal event.
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
bool IsInternal = OutEvent == nullptr;
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;

UR_CALL(createEventAndAssociateQueue(Queue, Event,
UR_COMMAND_ENQUEUE_USM_FREE_EXP,
CommandList, IsInternal, false));
ZeEvent = (*Event)->ZeEvent;
(*Event)->WaitList = TmpWaitList;

// Make the command list wait for the dependencies before anything else.
const auto &ZeCommandList = CommandList->first;
const auto &WaitList = (*Event)->WaitList;
if (WaitList.Length) {
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
(ZeCommandList, WaitList.Length, WaitList.ZeEventList));
}

// Wait for commands execution until USM can be freed
UR_CALL(Queue->executeCommandList(CommandList, true, OkToBatch)); // Blocking

// Free USM memory
// NOTE(review): when USMFreeHelper fails the function returns before the
// signal below is appended, so the event created above is never signaled.
auto Ret = USMFreeHelper(Queue->Context, Mem);
if (Ret) {
return Ret;
}

// Signal that USM free event was finished
// NOTE(review): the same CommandList was already passed to
// executeCommandList above; appending to it and executing it a second time
// presumably relies on the list still accepting commands - confirm this
// holds for every command list kind getAvailableCommandList can return.
ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

UR_CALL(Queue->executeCommandList(CommandList, false, OkToBatch));

return UR_RESULT_SUCCESS;
}
} // namespace ur::level_zero

Expand Down
8 changes: 8 additions & 0 deletions test/adapters/level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ function(add_adapter_tests adapter)

add_dependencies(test-adapter-${adapter}
generate_device_binaries kernel_names_header)

add_adapter_test(level_zero_enqueue_alloc
FIXTURE KERNELS
SOURCES
enqueue_alloc.cpp
ENVIRONMENT
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero>\""
)
endif()

if(NOT WIN32 AND NOT UR_STATIC_ADAPTER_L0)
Expand Down
Loading

0 comments on commit f270330

Please sign in to comment.