From eb6487d5b27db993e6567ebe64de48c6d3f90c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 14 Jan 2025 13:24:55 +0000 Subject: [PATCH 1/4] [CUDA][HIP] Fix kernel arguments being overriden when added out of order In the Cuda and Hip adapter, when kernel arguments are added out of order (e.g. argument at index 1 is added before argument at index 0), the existing arguments are currently being overwritten. This happens because some of the argument sizes might not be known when adding them out of order and the code relies on those sizes to choose where to store the argument. This commit avoids this issue by storing the arguments in the same order that they are added and accessing them using pointer offsets. --- source/adapters/cuda/kernel.hpp | 31 ++- source/adapters/hip/kernel.hpp | 65 +++--- .../update/local_memory_update.cpp | 220 ++++++++++++++++++ .../kernel/urKernelSetArgLocal.cpp | 99 ++++++++ 4 files changed, 377 insertions(+), 38 deletions(-) diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index d1b3b61244..a6194e9a57 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -68,6 +68,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -101,6 +103,7 @@ struct ur_kernel_handle_t_ { /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { + // Expand storage to accommodate this Index if needed. if (Index + 2 > Indices.size()) { // Move implicit offset argument index with the end Indices.resize(Index + 2, Indices.back()); @@ -109,14 +112,21 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(Indices[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -177,10 +187,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index c6d30e81ad..61dd89cc99 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -63,6 +63,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -95,22 +97,30 @@ struct ur_kernel_handle_t_ { /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { + // Expand storage to accommodate this Index if needed. if (Index + 2 > Indices.size()) { - // Move implicit offset argument Index with the end + // Move implicit offset argument index with the end Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(Indices[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -151,20 +161,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. + // Iterate over all existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. + void updateLocalArgOffset(size_t StartIndex) { const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -179,14 +180,26 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 559a61e3ad..6f309b6933 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1094,3 +1094,223 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } + +struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + } +}; + +struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); + +// Test updating A,X,Y parameters to new values and local memory to larger +// values when the kernel arguments were added out of order. +TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array + new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} \ No newline at end of file diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 688724ec09..f5fc0019ae 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -237,3 +237,102 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { Validate(output, X, Y, A, global_size, new_local_size); } + +// Tests that adding arguments out of order (e.g. index 1 before index 0) works. +struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); +TEST_P(urKernelSetArgLocalOutOfOrder, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +} From e3dcfc3c5726f39e94ae4a12f0ef2325e7970d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 21 Jan 2025 14:20:16 +0000 Subject: [PATCH 2/4] Rename Indices member variable to ArgPointers --- source/adapters/cuda/command_buffer.cpp | 6 ++-- source/adapters/cuda/enqueue.cpp | 6 ++-- source/adapters/cuda/kernel.hpp | 37 +++++++++++++------------ source/adapters/hip/command_buffer.cpp | 6 ++-- source/adapters/hip/enqueue.cpp | 4 +-- source/adapters/hip/kernel.hpp | 34 ++++++++++++----------- test/adapters/cuda/kernel_tests.cpp | 6 ++-- 7 files changed, 52 insertions(+), 47 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 05c20a6614..37018dde6c 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -523,7 +523,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUDA_KERNEL_NODE_PARAMS NodeParams = {}; NodeParams.func = CuFunc; NodeParams.gridDimX = BlocksPerGrid[0]; @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDimY = ThreadsPerBlock[1]; NodeParams.blockDimZ = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); // Create and add an new kernel node to the Cuda graph UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph, @@ -1398,7 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimZ = ThreadsPerBlock[2]; Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize(); Params.kernelParams = - const_cast(KernelCommandHandle->Kernel->getArgIndices().data()); + const_cast(KernelCommandHandle->Kernel->getArgPointers().data()); CUgraphNode Node = KernelCommandHandle->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 2a4a2cf54f..71c4340456 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,7 +492,7 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgIndices = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, @@ -680,7 +680,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUlaunchConfig launch_config; launch_config.gridDimX = BlocksPerGrid[0]; @@ -696,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, - const_cast(ArgIndices.data()), + const_cast(ArgPointers.data()), nullptr)); if (phEvent) { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index a6194e9a57..f299714b02 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -66,8 +66,8 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; /// Position in the Storage array where the next argument should added. size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has @@ -92,21 +92,23 @@ struct ur_kernel_handle_t_ { std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { - // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + // Place the implicit offset index at the end of the ArgPointers + // collection. + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { // Expand storage to accommodate this Index if needed. - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); @@ -117,13 +119,13 @@ struct ur_kernel_handle_t_ { if (ParamSizes[Index] == 0) { ParamSizes[Index] = Size; std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; + ArgPointers[Index] = &Storage[InsertPos]; AlignedLocalMemSize[Index] = LocalSize; InsertPos += Size; } // Otherwise, update the existing argument. else { - std::memcpy(Indices[Index], Arg, Size); + std::memcpy(ArgPointers[Index], Arg, Size); AlignedLocalMemSize[Index] = LocalSize; assert(Size == ParamSizes[Index]); } @@ -138,7 +140,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -168,10 +170,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -187,7 +190,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -235,7 +238,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -306,7 +309,7 @@ struct ur_kernel_handle_t_ { /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API uint32_t getNumArgs() const noexcept { - return static_cast(Args.Indices.size() - 1); + return static_cast(Args.ArgPointers.size() - 1); } void setKernelArg(int Index, size_t Size, const void *Arg) { @@ -321,8 +324,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 09c59bb9f7..887eb75287 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -378,7 +378,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); hipKernelNodeParams NodeParams; NodeParams.func = HIPFunc; NodeParams.gridDim.x = BlocksPerGrid[0]; @@ -388,7 +388,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDim.y = ThreadsPerBlock[1]; NodeParams.blockDim.z = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); NodeParams.extra = nullptr; // Create and add an new kernel node to the HIP graph @@ -1098,7 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.z = ThreadsPerBlock[2]; Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); Params.kernelParams = - const_cast(hCommand->Kernel->getArgIndices().data()); + const_cast(hCommand->Kernel->getArgPointers().data()); hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 8c7c1c617d..849369de4b 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - auto ArgIndices = hKernel->getArgIndices(); + auto ArgPointers = hKernel->getArgPointers(); // If migration of mem across buffer is needed, an event must be associated // with this command, implicitly if phEvent is nullptr @@ -322,7 +322,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], - hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); + hKernel->getLocalSize(), HIPStream, ArgPointers.data(), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 61dd89cc99..5ec51e7fa4 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -61,8 +61,8 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; /// Position in the Storage array where the next argument should added. size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has @@ -87,20 +87,21 @@ struct ur_kernel_handle_t_ { arguments() { // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { // Expand storage to accommodate this Index if needed. - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); @@ -111,13 +112,13 @@ struct ur_kernel_handle_t_ { if (ParamSizes[Index] == 0) { ParamSizes[Index] = Size; std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; + ArgPointers[Index] = &Storage[InsertPos]; AlignedLocalMemSize[Index] = LocalSize; InsertPos += Size; } // Otherwise, update the existing argument. else { - std::memcpy(Indices[Index], Arg, Size); + std::memcpy(ArgPointers[Index], Arg, Size); AlignedLocalMemSize[Index] = LocalSize; assert(Size == ParamSizes[Index]); } @@ -132,7 +133,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -161,10 +162,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -180,7 +182,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -219,7 +221,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -276,7 +278,7 @@ struct ur_kernel_handle_t_ { /// offset. Note this only returns the current known number of arguments, /// not the real one required by the kernel, since this cannot be queried /// from the HIP Driver API - uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + uint32_t getNumArgs() const noexcept { return Args.ArgPointers.size() - 1; } void setKernelArg(int Index, size_t Size, const void *Arg) { Args.addArg(Index, Size, Arg); @@ -290,8 +292,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } diff --git a/test/adapters/cuda/kernel_tests.cpp b/test/adapters/cuda/kernel_tests.cpp index 0f7f3351fe..7b83459c5f 100644 --- a/test/adapters/cuda/kernel_tests.cpp +++ b/test/adapters/cuda/kernel_tests.cpp @@ -153,7 +153,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSimple) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); @@ -175,7 +175,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); ASSERT_EQ(storedValue, number); @@ -183,7 +183,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int otherNumber = 934; ASSERT_SUCCESS( urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber)); - const auto kernelArgs2 = kernel->getArgIndices(); + const auto kernelArgs2 = kernel->getArgPointers(); ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA); storedValue = *static_cast(kernelArgs2[0]); ASSERT_EQ(storedValue, otherNumber); From f98229f8bc1ff4b11a960e06799f2d182bb9b89c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 22 Jan 2025 13:49:54 +0000 Subject: [PATCH 3/4] Fix formatting --- .../update/local_memory_update.cpp | 401 +++++++++--------- .../kernel/urKernelSetArgLocal.cpp | 180 ++++---- 2 files changed, 289 insertions(+), 292 deletions(-) diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 6f309b6933..d55094a52c 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1096,119 +1096,117 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { } struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { - virtual void SetUp() override { - program_name = "saxpy_usm_local_mem"; - UUR_RETURN_ON_FATAL_FAILURE( - urUpdatableCommandBufferExpExecutionTest::SetUp()); - - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - GTEST_SKIP() - << "Local memory argument update not supported on Level Zero."; - } - - // HIP has extra args for local memory so we define an offset for arg - // indices here for updating - hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; - ur_device_usm_access_capability_flags_t shared_usm_flags; - ASSERT_SUCCESS( - uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); - if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { - GTEST_SKIP() << "Shared USM is not supported."; - } - - const size_t allocation_size = - sizeof(uint32_t) * global_size * local_size; - for (auto &shared_ptr : shared_ptrs) { - ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, - allocation_size, &shared_ptr)); - ASSERT_NE(shared_ptr, nullptr); - - std::vector pattern(allocation_size); - uur::generateMemFillPattern(pattern); - std::memcpy(shared_ptr, pattern.data(), allocation_size); - } - - std::array index_order{}; - if (backend != UR_PLATFORM_BACKEND_HIP) { - index_order = {3, 2, 4, 5, 1, 0}; - } else { - index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - } - size_t current_index = 0; - - // Index 3 is A - ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], - sizeof(A), nullptr, &A)); - // Index 2 is output - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[0])); - - // Index 4 is X - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[1])); - // Index 5 is Y - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[2])); - - // Index 1 is local_mem_b arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_b_size, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } - - // Index 0 is local_mem_a arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_a_size, nullptr)); - - // Hip has extra args for local mem at index 1-3 - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } }; struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE( - LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); - - // Append kernel command to command-buffer and close command-buffer - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, - nullptr, nullptr, &command_handle)); - ASSERT_NE(command_handle, nullptr); - - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); - } + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, + nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); - void TearDown() override { - if (command_handle) { - EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); - } + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } - UUR_RETURN_ON_FATAL_FAILURE( - LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); } - ur_exp_command_buffer_command_handle_t command_handle = nullptr; + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; }; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); @@ -1216,101 +1214,100 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); // Test updating A,X,Y parameters to new values and local memory to larger // values when the kernel arguments were added out of order. TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { - // Run command-buffer prior to update and verify output - ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, - nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - uint32_t *output = (uint32_t *)shared_ptrs[0]; - uint32_t *X = (uint32_t *)shared_ptrs[1]; - uint32_t *Y = (uint32_t *)shared_ptrs[2]; - Validate(output, X, Y, A, global_size, local_size); - - // Update inputs - std::array - new_input_descs; - std::array - new_value_descs; - - size_t new_local_size = local_size * 4; - size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); - - // New local_mem_a at index 0 - new_value_descs[0] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype - nullptr, // pNext - 0, // argIndex - new_local_mem_a_size, // argSize - nullptr, // pProperties - nullptr, // hArgValue - }; - - // New local_mem_b at index 1 - new_value_descs[1] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype - nullptr, // pNext - 1 + hip_arg_offset, // argIndex - local_mem_b_size, // argSize - nullptr, // pProperties - nullptr, // hArgValue - }; - - // New A at index 3 - uint32_t new_A = 33; - new_value_descs[2] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype - nullptr, // pNext - 3 + (2 * hip_arg_offset), // argIndex - sizeof(new_A), // argSize - nullptr, // pProperties - &new_A, // hArgValue - }; - - // New X at index 4 - new_input_descs[0] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype - nullptr, // pNext - 4 + (2 * hip_arg_offset), // argIndex - nullptr, // pProperties - &shared_ptrs[3], // pArgValue - }; - - // New Y at index 5 - new_input_descs[1] = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype - nullptr, // pNext - 5 + (2 * hip_arg_offset), // argIndex - nullptr, // pProperties - &shared_ptrs[4], // pArgValue - }; - - // Update kernel inputs - ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype - nullptr, // pNext - kernel, // hNewKernel - 0, // numNewMemObjArgs - new_input_descs.size(), // numNewPointerArgs - new_value_descs.size(), // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - new_input_descs.data(), // pNewPointerArgList - new_value_descs.data(), // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize - }; - - // Update kernel and enqueue command-buffer again - ASSERT_SUCCESS( - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); - ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, - nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - // Verify that update occurred correctly - uint32_t *new_output = (uint32_t *)shared_ptrs[0]; - uint32_t *new_X = (uint32_t *)shared_ptrs[3]; - uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; - Validate(new_output, new_X, new_Y, new_A, global_size, local_size); + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } \ No newline at end of file diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index f5fc0019ae..f056d025bc 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -240,99 +240,99 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { // Tests that adding arguments out of order (e.g. index 1 before index 0) works. struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { - void SetUp() override { - program_name = "saxpy_usm_local_mem"; - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); - - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - - // HIP has extra args for local memory so we define an offset for arg indices here for updating - hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; - ur_device_usm_access_capability_flags_t shared_usm_flags; - ASSERT_SUCCESS( - uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); - if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { - GTEST_SKIP() << "Shared USM is not supported."; - } - - const size_t allocation_size = - sizeof(uint32_t) * global_size * local_size; - for (auto &shared_ptr : shared_ptrs) { - ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, - allocation_size, &shared_ptr)); - ASSERT_NE(shared_ptr, nullptr); - - std::vector pattern(allocation_size); - uur::generateMemFillPattern(pattern); - std::memcpy(shared_ptr, pattern.data(), allocation_size); - } - - std::array index_order{}; - if (backend != UR_PLATFORM_BACKEND_HIP) { - index_order = {3, 2, 4, 5, 1, 0}; - } else { - index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - } - size_t current_index = 0; - - // Index 3 is A - ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], - sizeof(A), nullptr, &A)); - // Index 2 is output - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[0])); - - // Index 4 is X - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[1])); - // Index 5 is Y - ASSERT_SUCCESS(urKernelSetArgPointer( - kernel, index_order[current_index++], nullptr, shared_ptrs[2])); - - // Index 1 is local_mem_b arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_b_size, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } - - // Index 0 is local_mem_a arg - ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], - local_mem_a_size, nullptr)); - - // Hip has extra args for local mem at index 1-3 - if (backend == UR_PLATFORM_BACKEND_HIP) { - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - ASSERT_SUCCESS(urKernelSetArgValue( - kernel, index_order[current_index++], sizeof(hip_local_offset), - nullptr, &hip_local_offset)); - } + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } }; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); TEST_P(urKernelSetArgLocalOutOfOrder, Success) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - uint32_t *output = (uint32_t *)shared_ptrs[0]; - uint32_t *X = (uint32_t *)shared_ptrs[1]; - uint32_t *Y = (uint32_t *)shared_ptrs[2]; - Validate(output, X, Y, A, global_size, local_size); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); } From 9de10cd9547db008ef4347f86dc0bf9198a8fb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 22 Jan 2025 14:43:42 +0000 Subject: [PATCH 4/4] Rename variable ArgIndices to ArgPointers --- source/adapters/cuda/enqueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 71c4340456..540ebb86fa 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,11 +492,11 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgPointers(); + auto &ArgPointers = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, - CuStream, const_cast(ArgIndices.data()), nullptr)); + CuStream, const_cast(ArgPointers.data()), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record());