diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 05c20a6614..37018dde6c 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -523,7 +523,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUDA_KERNEL_NODE_PARAMS NodeParams = {}; NodeParams.func = CuFunc; NodeParams.gridDimX = BlocksPerGrid[0]; @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDimY = ThreadsPerBlock[1]; NodeParams.blockDimZ = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); // Create and add an new kernel node to the Cuda graph UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph, @@ -1398,7 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimZ = ThreadsPerBlock[2]; Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize(); Params.kernelParams = - const_cast(KernelCommandHandle->Kernel->getArgIndices().data()); + const_cast(KernelCommandHandle->Kernel->getArgPointers().data()); CUgraphNode Node = KernelCommandHandle->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 2a4a2cf54f..540ebb86fa 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,11 +492,11 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, - CuStream, const_cast(ArgIndices.data()), nullptr)); + CuStream, const_cast(ArgPointers.data()), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -680,7 +680,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUlaunchConfig launch_config; launch_config.gridDimX = BlocksPerGrid[0]; @@ -696,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, - const_cast(ArgIndices.data()), + const_cast(ArgPointers.data()), nullptr)); if (phEvent) { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index d1b3b61244..f299714b02 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -66,8 +66,10 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -90,33 +92,43 @@ struct ur_kernel_handle_t_ { std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { - // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + // Place the implicit offset index at the end of the ArgPointers + // collection. + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { - if (Index + 2 > Indices.size()) { + // Expand storage to accommodate this Index if needed. + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + ArgPointers[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(ArgPointers[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -128,7 +140,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -158,10 +170,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -177,10 +190,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -228,7 +238,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -299,7 +309,7 @@ struct ur_kernel_handle_t_ { /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API uint32_t getNumArgs() const noexcept { - return static_cast(Args.Indices.size() - 1); + return static_cast(Args.ArgPointers.size() - 1); } void setKernelArg(int Index, size_t Size, const void *Arg) { @@ -314,8 +324,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 09c59bb9f7..887eb75287 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -378,7 +378,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); hipKernelNodeParams NodeParams; NodeParams.func = HIPFunc; NodeParams.gridDim.x = BlocksPerGrid[0]; @@ -388,7 +388,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDim.y = ThreadsPerBlock[1]; NodeParams.blockDim.z = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); NodeParams.extra = nullptr; // Create and add an new kernel node to the HIP graph @@ -1098,7 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.z = ThreadsPerBlock[2]; Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); Params.kernelParams = - const_cast(hCommand->Kernel->getArgIndices().data()); + const_cast(hCommand->Kernel->getArgPointers().data()); hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 8c7c1c617d..849369de4b 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - auto ArgIndices = hKernel->getArgIndices(); + auto ArgPointers = hKernel->getArgPointers(); // If migration of mem across buffer is needed, an event must be associated // with this command, implicitly if phEvent is nullptr @@ -322,7 +322,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], - hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); + hKernel->getLocalSize(), HIPStream, ArgPointers.data(), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index c6d30e81ad..5ec51e7fa4 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -61,8 +61,10 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -85,32 +87,41 @@ struct ur_kernel_handle_t_ { arguments() { // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { - if (Index + 2 > Indices.size()) { - // Move implicit offset argument Index with the end - Indices.resize(Index + 2, Indices.back()); + // Expand storage to accommodate this Index if needed. + if (Index + 2 > ArgPointers.size()) { + // Move implicit offset argument index with the end + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + ArgPointers[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(ArgPointers[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -122,7 +133,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -151,20 +162,12 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + // Iterate over each existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. + void updateLocalArgOffset(size_t StartIndex) { + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -179,14 +182,26 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index @@ -206,7 +221,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -263,7 +278,7 @@ struct ur_kernel_handle_t_ { /// offset. Note this only returns the current known number of arguments, /// not the real one required by the kernel, since this cannot be queried /// from the HIP Driver API - uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + uint32_t getNumArgs() const noexcept { return Args.ArgPointers.size() - 1; } void setKernelArg(int Index, size_t Size, const void *Arg) { Args.addArg(Index, Size, Arg); @@ -277,8 +292,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } diff --git a/test/adapters/cuda/kernel_tests.cpp b/test/adapters/cuda/kernel_tests.cpp index 0f7f3351fe..7b83459c5f 100644 --- a/test/adapters/cuda/kernel_tests.cpp +++ b/test/adapters/cuda/kernel_tests.cpp @@ -153,7 +153,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSimple) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); @@ -175,7 +175,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); ASSERT_EQ(storedValue, number); @@ -183,7 +183,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int otherNumber = 934; ASSERT_SUCCESS( urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber)); - const auto kernelArgs2 = kernel->getArgIndices(); + const auto kernelArgs2 = kernel->getArgPointers(); ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA); storedValue = *static_cast(kernelArgs2[0]); ASSERT_EQ(storedValue, otherNumber); diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 559a61e3ad..d55094a52c 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1094,3 +1094,220 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } + +struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } +}; + +struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, + nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); + +// Test updating A,X,Y parameters to new values and local memory to larger +// values when the kernel arguments were added out of order. +TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} \ No newline at end of file diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 688724ec09..f056d025bc 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -237,3 +237,102 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { Validate(output, X, Y, A, global_size, new_local_size); } + +// Tests that adding arguments out of order (e.g. index 1 before index 0) works. +struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, index_order[current_index++], + nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(hip_local_offset), nullptr, + &hip_local_offset)); + } + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); +TEST_P(urKernelSetArgLocalOutOfOrder, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +}