From 7e4668b27883f67e5bcfd4a38c9a55ac792e77d2 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 22 Nov 2024 17:24:35 -0800 Subject: [PATCH 01/31] Use DeviceInterface for debugging --- src/models/debugging.cpp | 15 +++++++-------- src/models/model.h | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/models/debugging.cpp b/src/models/debugging.cpp index 597ded8de..67e0df3b1 100644 --- a/src/models/debugging.cpp +++ b/src/models/debugging.cpp @@ -3,15 +3,7 @@ #include "../generators.h" #include "utils.h" #include - -#if USE_CUDA -#include "../cuda/cuda_common.h" -#endif - -#if USE_DML -#include "../dml/dml_helpers.h" #include "model.h" -#endif namespace Generators { static constexpr size_t c_value_count = 10; // Dump this many values from the start of a tensor @@ -92,6 +84,12 @@ void DumpTensor(const Model& model, std::ostream& stream, OrtValue* value, bool break; case OrtMemoryInfoDeviceType_GPU: { stream << "GPU\r\n"; + auto type = type_info->GetElementType(); + auto tensor_span = std::span{const_cast(value)->GetTensorMutableData(), SizeOf(type) * element_count}; + auto device_span = model.p_device_->WrapMemory(tensor_span); + DumpValues(stream, type, device_span.CopyDeviceToCpu().data(), element_count); + break; +#if 0 #if USE_CUDA auto type = type_info->GetElementType(); size_t element_size = SizeOf(type); @@ -120,6 +118,7 @@ void DumpTensor(const Model& model, std::ostream& stream, OrtValue* value, bool DumpValues(stream, type, cpu_copy.get(), element_count); #else stream << "Unexpected, using GPU memory but not compiled with CUDA or DML?"; +#endif #endif break; } diff --git a/src/models/model.h b/src/models/model.h index 03d973afa..7f29c6b28 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -144,7 +144,7 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr session_options_; cudaStream_t cuda_stream_{}; - DeviceInterface* p_device_{}; + mutable DeviceInterface* p_device_{}; DeviceType device_type_{DeviceType::CPU}; Ort::Allocator& allocator_cpu_{Ort::Allocator::GetWithDefaultOptions()}; Ort::Allocator* allocator_device_{}; // Can be CUDA or CPU based on the DeviceType in the model From 3823664cfe1a390b1bd8089fc23bfe3c2a51ac22 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Sun, 15 Dec 2024 23:52:16 -0800 Subject: [PATCH 02/31] Summary: Remove #ifdefs for providers and go through device interface. Details: Add a DML DeviceInterface and DML DeviceBuffer handler. Remove #if blocks that are doing memory copies between device/cpu memory and use the DeviceSpan interface. 
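As an illustration only (not part of the diff below), the device-agnostic copy pattern this change moves to looks roughly like the sketch here. ByteWrapTensor, DeviceSpan::CopyFrom and the CopyThroughCpu fallback are all introduced in this patch; the helper name CopyToCpu and its caller are hypothetical, and the include is assumed to be the project-internal header that declares them:

    #include "generators.h"  // sketch assumes this project header declares ByteWrapTensor, DeviceSpan, GetDeviceInterface

    // Hedged sketch: copy a device-resident OrtValue into a CPU OrtValue of the
    // same byte size with no provider #ifdefs. Each provider's DeviceBuffer
    // decides whether the copy becomes cudaMemcpyAsync, a DML CopyBufferRegion,
    // or a plain memcpy; mismatched devices fall back to CopyThroughCpu.
    void CopyToCpu(Generators::DeviceInterface& device, OrtValue& device_value, OrtValue& cpu_value) {
      auto source = Generators::ByteWrapTensor(device, device_value);
      auto dest = Generators::ByteWrapTensor(*Generators::GetDeviceInterface(Generators::DeviceType::CPU), cpu_value);
      dest.CopyFrom(source);  // spans must be the same size; see DeviceSpan::CopyFrom below
    }

This mirrors what the ort_genai_c.cpp hunk in this patch does for OgaGenerator_GetOutput, just lifted out as a standalone sketch.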
--- src/beam_search_scorer.cpp | 2 +- src/cpu/interface.cpp | 18 ++- src/cuda/interface.cpp | 65 ++++------- src/cuda/interface.h | 4 +- src/cuda/search_cuda.cpp | 2 +- src/dml/interface.cpp | 172 +++++++++++++++++++++++++++++ src/dml/interface.h | 11 ++ src/generators.cpp | 26 ++++- src/generators.h | 25 ++++- src/models/captured_graph_pool.cpp | 15 +-- src/models/extra_inputs.cpp | 46 +------- src/models/input_ids.cpp | 64 ++--------- src/models/input_ids.h | 6 - src/models/kv_cache.cpp | 14 +-- src/models/logits.cpp | 102 ++--------------- src/models/logits.h | 5 - src/models/model.cpp | 94 +++++----------- src/models/model.h | 5 - src/models/position_inputs.cpp | 5 +- src/models/utils.cpp | 6 + src/models/whisper.cpp | 6 +- src/ort_genai_c.cpp | 36 +----- src/python/python.cpp | 43 +------- src/search.cpp | 11 +- src/search.h | 3 +- src/smartptrs.h | 24 +++- test/c_api_tests.cpp | 3 + test/sampling_benchmark.cpp | 3 + test/sampling_tests.cpp | 3 + 29 files changed, 365 insertions(+), 454 deletions(-) create mode 100644 src/dml/interface.cpp create mode 100644 src/dml/interface.h diff --git a/src/beam_search_scorer.cpp b/src/beam_search_scorer.cpp index d2f056038..b9cbceffe 100644 --- a/src/beam_search_scorer.cpp +++ b/src/beam_search_scorer.cpp @@ -67,7 +67,7 @@ BeamSearchScorer::BeamSearchScorer(const GeneratorParams& parameters) // Space to store intermediate sequence size_t const per_beam = (max_length_ * (max_length_ + 1)) / 2; - hypothesis_buffer_ = device.Allocate(batch_beam_size * per_beam, true); + hypothesis_buffer_ = device.Allocate(batch_beam_size * per_beam); memset(next_beam_scores_.Span().data(), 0, next_beam_scores_.Span().size_bytes()); diff --git a/src/cpu/interface.cpp b/src/cpu/interface.cpp index 420ba8f74..b27882668 100644 --- a/src/cpu/interface.cpp +++ b/src/cpu/interface.cpp @@ -7,6 +7,7 @@ namespace Generators { +static Ort::Allocator* ort_allocator_{}; const char* label_cpu = "cpu"; struct CpuMemory final : DeviceBuffer { @@ -30,18 +31,23 @@ struct CpuMemory final : DeviceBuffer { void CopyDeviceToCpu() override {} // Nothing to do, device is also CPU void CopyCpuToDevice() override {} // Nothing to do, device is also CPU void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { - if (GetType() == label_cpu) - memcpy(p_device_ + begin_dest, source.p_device_ + begin_source, size_in_bytes); - else - throw std::runtime_error("CpuMemory::CopyFromDevice not implemented for " + std::string(source.GetType())); + CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes); + } + + void Zero() override { + memset(p_device_, 0, size_in_bytes_); } bool owned_; }; struct CpuInterface : DeviceInterface { - std::shared_ptr AllocateBase(size_t size, bool cpu_accessible) override { - // cpu_accessible is ignored, as with the cpu, the device is also the cpu + void InitAllocator(Ort::Allocator& allocator) override { + assert(!ort_allocator_); + ort_allocator_ = &allocator; + } + + std::shared_ptr AllocateBase(size_t size) override { return std::make_shared(size); } diff --git a/src/cuda/interface.cpp b/src/cuda/interface.cpp index b225604f1..0b708c94d 100644 --- a/src/cuda/interface.cpp +++ b/src/cuda/interface.cpp @@ -11,33 +11,9 @@ namespace Generators { +GenaiInterface* gp_genai{}; +Ort::Allocator* ort_allocator_{}; const char* label_cuda = "cuda"; -const char* label_cuda_cpu = "cuda_cpu"; - -struct HostMemory final : DeviceBuffer { - HostMemory(size_t size) { - size_in_bytes_ = size; - ::cudaHostAlloc(&p_device_, 
size, 0); - p_cpu_ = p_device_; // CPU & GPU both access the same memory here - } - - ~HostMemory() override { - ::cudaFreeHost(p_device_); - } - - const char* GetType() const override { return label_cuda_cpu; } - void AllocateCpu() override {} // Nothing to do, device is also CPU - void CopyDeviceToCpu() override {} // Nothing to do, device is also CPU - void CopyCpuToDevice() override {} // Nothing to do, device is also CPU - void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { - if (source.GetType() == label_cuda_cpu) - ::memcpy(p_cpu_ + begin_dest, source.p_cpu_ + begin_source, size_in_bytes); - else if (source.GetType() == label_cuda) - ::cudaMemcpyAsync(p_device_ + begin_dest, source.p_device_ + begin_source, size_in_bytes, ::cudaMemcpyDeviceToHost, GetStream()); - else - throw std::runtime_error("Cuda HostMemory::CopyFromDevice not implemented for " + std::string(source.GetType())); - } -}; struct GpuMemory final : DeviceBuffer { GpuMemory(size_t size) : owned_{true} { @@ -66,21 +42,24 @@ struct GpuMemory final : DeviceBuffer { void CopyDeviceToCpu() override { AllocateCpu(); - ::cudaMemcpy(p_cpu_, p_device_, size_in_bytes_, ::cudaMemcpyDeviceToHost); + ::cudaMemcpyAsync(p_cpu_, p_device_, size_in_bytes_, ::cudaMemcpyDeviceToHost, GetStream()); + ::cudaStreamSynchronize(GetStream()); } void CopyCpuToDevice() override { assert(p_cpu_); - ::cudaMemcpy(p_device_, p_cpu_, size_in_bytes_, ::cudaMemcpyHostToDevice); + ::cudaMemcpyAsync(p_device_, p_cpu_, size_in_bytes_, ::cudaMemcpyHostToDevice, GetStream()); } - void CopyFrom(size_t begin_source, DeviceBuffer& source, size_t begin_dest, size_t size_in_bytes) override { - if (source.GetType() == label_cuda_cpu) - ::cudaMemcpyAsync(p_device_ + begin_source, source.p_device_ + begin_dest, size_in_bytes, ::cudaMemcpyHostToDevice, GetStream()); - else if (source.GetType() == label_cuda) - ::cudaMemcpyAsync(p_device_ + begin_source, source.p_device_ + begin_dest, size_in_bytes, ::cudaMemcpyDeviceToDevice, GetStream()); + void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { + if (source.GetType() == label_cuda) + ::cudaMemcpyAsync(p_device_ + begin_dest, source.p_device_ + begin_source, size_in_bytes, ::cudaMemcpyDeviceToDevice, GetStream()); else - throw std::runtime_error("Cuda GpuMemory::CopyFromDevice not implemented for " + std::string(source.GetType())); + gp_genai->CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes); + } + + void Zero() override { + ::cudaMemsetAsync(p_device_, 0, size_in_bytes_, GetStream()); } bool owned_; // If we own the memory, we delete it on destruction @@ -94,9 +73,12 @@ struct CudaInterfaceImpl : CudaInterface { ~CudaInterfaceImpl() { } - std::shared_ptr AllocateBase(size_t size, bool cpu_accessible) override { - if (cpu_accessible) - return std::make_shared(size); + void InitAllocator(Ort::Allocator& allocator) override { + assert(!ort_allocator_); + ort_allocator_ = &allocator; + } + + std::shared_ptr AllocateBase(size_t size) override { return std::make_shared(size); } @@ -180,18 +162,10 @@ struct CudaInterfaceImpl : CudaInterface { return ::cudaMemcpyAsync(dst, src, count, kind, stream); } - cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) override { - return ::cudaMemcpy(dst, src, count, kind); - } - cudaError_t cudaMemsetAsync(void* ptr, int value, size_t count, cudaStream_t stream) override { return ::cudaMemsetAsync(ptr, value, count, 
stream); } - cudaError_t cudaMemset(void* ptr, int value, size_t count) override { - return ::cudaMemset(ptr, value, count); - } - private: cuda_stream_holder cuda_stream_; }; @@ -201,7 +175,6 @@ std::unique_ptr g_cuda_device; DeviceInterface& GetCudaDeviceInterface() { return *g_cuda_device; } cudaStream_t GetStream() { return g_cuda_device->GetCudaStream(); } -GenaiInterface* gp_genai{}; LogItems& GetLogItems() { return gp_genai->GetLogItems(); } std::ostream& operator<<(std::ostream& stream, SGR sgr_code) { return gp_genai->operator_leftshift(stream, sgr_code); } std::ostream& Log(std::string_view label, std::string_view text) { return gp_genai->Log(label, text); } diff --git a/src/cuda/interface.h b/src/cuda/interface.h index d664277cc..24e6fb94f 100644 --- a/src/cuda/interface.h +++ b/src/cuda/interface.h @@ -7,6 +7,8 @@ struct GenaiInterface { virtual void HeapFree(void*) = 0; #endif + virtual void CopyThroughCpu(Generators::DeviceBuffer& dest, size_t begin_dest, Generators::DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) = 0; + virtual Generators::LogItems& GetLogItems() = 0; virtual std::ostream& operator_leftshift(std::ostream& stream, Generators::SGR sgr_code) = 0; virtual std::ostream& Log(std::string_view label, std::string_view text = {}) = 0; @@ -42,9 +44,7 @@ struct CudaInterface : DeviceInterface { virtual void LaunchFinalizeCrossQK(cudaStream_t stream, int iteration_number, int context_decoding_len, int batch_size, int num_beams, int max_length, int num_alignment_heads, int frames_of_k, const float* cross_qk_buffer_data, float* cross_qk_output, int num_return_sequences, const int* cache_indir_data) = 0; virtual cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) = 0; - virtual cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) = 0; virtual cudaError_t cudaMemsetAsync(void* ptr, int value, size_t count, cudaStream_t stream) = 0; - virtual cudaError_t cudaMemset(void* ptr, int value, size_t count) = 0; }; #endif } // namespace Generators diff --git a/src/cuda/search_cuda.cpp b/src/cuda/search_cuda.cpp index fa80f67dc..8d7bf6fe0 100644 --- a/src/cuda/search_cuda.cpp +++ b/src/cuda/search_cuda.cpp @@ -31,8 +31,8 @@ Search_Cuda::Search_Cuda(const GeneratorParams& params) GreedySearch_Cuda::GreedySearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { next_tokens_buffer_ = params.p_device->Allocate(params.search.batch_size); + next_tokens_buffer_.Zero(); next_tokens_ = gpu_span(next_tokens_buffer_.Span()); - cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_->cuda_stream); unsigned long long random_seed; if (params_->search.random_seed != -1) diff --git a/src/dml/interface.cpp b/src/dml/interface.cpp new file mode 100644 index 000000000..61a57d663 --- /dev/null +++ b/src/dml/interface.cpp @@ -0,0 +1,172 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "../generators.h" +#include "../search.h" +#include "../cpu/interface.h" +#include "interface.h" +#include + +#include +#include "dml_provider_factory.h" +#include "../dml/dml_helpers.h" +#include "../dml/dml_execution_context.h" +#include "../dml/dml_pooled_upload_heap.h" +#include "../dml/dml_readback_heap.h" + +std::string CurrentModulePath(); + +namespace Generators { +namespace Dml { // If this was in a shared library it wouldn't need to be in its own namespace + +Ort::Allocator* ort_allocator_{}; +const char* label_dml = "dml"; + +wil::unique_hmodule smart_directml_dll_; +DmlObjects dml_objects_; +const OrtDmlApi* dml_api_{}; +std::unique_ptr dml_pooled_upload_heap_; +std::unique_ptr dml_execution_context_; +std::unique_ptr dml_readback_heap_; +ComPtr dml_device_; + +struct GpuMemory final : DeviceBuffer { + GpuMemory(size_t size) : owned_{true} { + size_in_bytes_ = size; + p_device_ = static_cast(ort_allocator_->Alloc(size_in_bytes_)); + Ort::ThrowOnError(dml_api_->GetD3D12ResourceFromAllocation(ort_allocator_, p_device_, &gpu_resource_)); + } + + GpuMemory(void* p, size_t size) : owned_{false} { + size_in_bytes_ = size; + p_device_ = static_cast(p); + Ort::ThrowOnError(dml_api_->GetD3D12ResourceFromAllocation(ort_allocator_, p_device_, &gpu_resource_)); + } + + ~GpuMemory() override { + if (owned_) + ort_allocator_->Free(p_device_); + if (p_cpu_) + free(p_cpu_); + } + + const char* GetType() const override { return label_dml; } + + void AllocateCpu() override { + if (!p_cpu_) + p_cpu_ = static_cast(malloc(size_in_bytes_)); + } + + void CopyDeviceToCpu() override { + AllocateCpu(); + dml_readback_heap_->ReadbackFromGpu(std::span(p_cpu_, size_in_bytes_), gpu_resource_.Get(), 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + } + + void CopyCpuToDevice() override { + assert(p_cpu_); + auto source = std::span(p_cpu_, size_in_bytes_); + dml_pooled_upload_heap_->BeginUploadToGpu(gpu_resource_.Get(), 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, source); + } + + void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { + if (source.GetType() == label_dml) { + auto& source_gpu = dynamic_cast(source); + dml_execution_context_->CopyBufferRegion( + gpu_resource_.Get(), + begin_dest, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + source_gpu.gpu_resource_.Get(), + begin_source, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + size_in_bytes); + } else + CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes); + } + + void Zero() override { + // TODO: Implement a zeroing that runs directly on DML vs going through CPU + AllocateCpu(); + memset(p_cpu_, 0, size_in_bytes_); + CopyCpuToDevice(); + } + + ComPtr gpu_resource_; + bool owned_; // If we own the memory, we delete it on destruction +}; + +struct DmlInterfaceImpl : DeviceInterface { + + DmlInterfaceImpl(LUID* p_device_luid) { + Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&dml_api_))); + if (!dml_api_) { + throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); + } + + dml_objects_ = DmlHelpers::CreateDmlObjects(CurrentModulePath(), p_device_luid); + + constexpr auto directml_dll = "DirectML.dll"; + smart_directml_dll_ = wil::unique_hmodule{LoadLibraryEx(directml_dll, nullptr, 0)}; + if (!smart_directml_dll_) + throw std::runtime_error("DirectML.dll not found"); + + auto dml_create_device1_fn = reinterpret_cast(GetProcAddress(smart_directml_dll_.get(), "DMLCreateDevice1")); + THROW_LAST_ERROR_IF(!dml_create_device1_fn); + 
THROW_IF_FAILED(dml_create_device1_fn(dml_objects_.d3d12_device.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(&dml_device_))); + + Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&dml_api_))); + } + + void InitAllocator(Ort::Allocator& allocator) override { + assert(!ort_allocator_); + ort_allocator_ = &allocator; + + dml_execution_context_ = std::make_unique( + dml_objects_.d3d12_device.Get(), + dml_device_.Get(), + dml_objects_.command_queue.Get(), + *ort_allocator_, + dml_api_); + + dml_pooled_upload_heap_ = std::make_unique(dml_objects_.d3d12_device.Get(), dml_execution_context_.get()); + dml_readback_heap_ = std::make_unique(dml_objects_.d3d12_device.Get(), dml_execution_context_.get()); + } + + std::shared_ptr AllocateBase(size_t size) override { + return std::make_shared(size); + } + + std::shared_ptr WrapMemoryBase(void* p, size_t size) override { + return std::make_shared(p, size); + } + + std::unique_ptr CreateGreedy(const GeneratorParams& params) override { + return GetCpuInterface()->CreateGreedy(params); + } + + std::unique_ptr CreateBeam(const GeneratorParams& params) override { + return GetCpuInterface()->CreateBeam(params); + } + + void Synchronize() override { + } +}; + +} // namespace Dml + +std::unique_ptr g_dml_device; + +void InitDmlInterface(LUID* p_device_luid) { + if (!g_dml_device) + g_dml_device = std::make_unique(p_device_luid); +} + +void SetDmlProvider(OrtSessionOptions& session_options) { + Ort::ThrowOnError(Dml::dml_api_->SessionOptionsAppendExecutionProvider_DML1(&session_options, Dml::dml_device_.Get(), Dml::dml_objects_.command_queue.Get())); +} + +DeviceInterface* GetDmlInterface() { + assert(g_dml_device); + return g_dml_device.get(); +} + +} // namespace Generators diff --git a/src/dml/interface.h b/src/dml/interface.h new file mode 100644 index 000000000..c70faf721 --- /dev/null +++ b/src/dml/interface.h @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +namespace Generators { + +void InitDmlInterface(LUID* p_device_luid); +void SetDmlProvider(OrtSessionOptions& options); + +DeviceInterface* GetDmlInterface(); + +} \ No newline at end of file diff --git a/src/generators.cpp b/src/generators.cpp index 8c9ce39b3..ad34d1979 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -8,6 +8,7 @@ #include "search.h" #include "cpu/interface.h" #include "cuda/interface.h" +#include "dml/interface.h" #if USE_CUDA #include "models/kernels.h" #endif @@ -89,18 +90,31 @@ OrtEnv& GetOrtEnv() { return *GetOrtGlobals()->env_; } +// Fallback to copy between two separate device buffers by going through CPU memory (slow unless we're the CPU device) +void CopyThroughCpu(DeviceBuffer& dest, size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) { + source.CopyDeviceToCpu(); + auto source_span = std::span(source.p_cpu_+begin_source, size_in_bytes); + dest.AllocateCpu(); + std::copy(source_span.begin(), source_span.end(), dest.p_cpu_ + begin_dest); + dest.CopyCpuToDevice(); +} + struct GenaiInterfaceImpl : GenaiInterface { #if _WIN32 void* HeapAllocate(size_t size) override { return std::malloc(size); } void HeapFree(void* p) override { std::free(p); } #endif + void CopyThroughCpu(DeviceBuffer& dest, size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { + return Generators::CopyThroughCpu(dest, begin_dest, source, begin_source, size_in_bytes); + } + Generators::LogItems& GetLogItems() override { return g_log; } std::ostream& operator_leftshift(std::ostream& stream, Generators::SGR sgr_code) override { return stream << sgr_code; } std::ostream& Log(std::string_view label, std::string_view text = {}) override { return Log(label, text); } - void DumpSpan(std::ostream& stream, std::span values) override { return DumpSpan(stream, values); } - void DumpSpan(std::ostream& stream, std::span values) override { return DumpSpan(stream, values); } + void DumpSpan(std::ostream& stream, std::span values) override { return Generators::DumpSpan(stream, values); } + void DumpSpan(std::ostream& stream, std::span values) override { return Generators::DumpSpan(stream, values); } void Sequences_AfterAppendNextTokens(Sequences* p_this, DeviceSpan next_tokens, size_t batch_beam_size) override { return p_this->AfterAppendNextTokens(next_tokens, batch_beam_size); } void Sequences_RewindTo(Sequences* p_this, size_t new_length) override { return p_this->RewindTo(new_length); } @@ -136,6 +150,7 @@ CudaInterface* GetCudaInterface() { return cuda_interface; } + namespace cuda { void LaunchInt32ToInt64(const int32_t* input, int64_t* output, int count, cudaStream_t stream) { GetCudaInterface()->Int32ToInt64(input, output, count, stream); } void LaunchFp16ToFp32(const uint16_t* input, float* output, int count, cudaStream_t stream) { GetCudaInterface()->Fp16ToFp32(input, output, count, stream); } @@ -160,6 +175,7 @@ void LaunchFinalizeCrossQK(cudaStream_t stream, int iteration_number, int } // namespace cuda #endif + std::string to_string(DeviceType device_type) { switch (device_type) { case DeviceType::CPU: @@ -182,6 +198,10 @@ DeviceInterface* GetDeviceInterface(DeviceType type) { #if USE_CUDA case DeviceType::CUDA: return GetCudaInterface(); +#endif +#if USE_DML + case DeviceType::DML: + return GetDmlInterface(); #endif } } @@ -427,7 +447,5 @@ DeviceSpan Generator::GetSequence(size_t index) const { #if USE_CUDA cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t 
stream) { return Generators::GetCudaInterface()->cudaMemcpyAsync(dst, src, count, kind, stream); } -cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return Generators::GetCudaInterface()->cudaMemcpy(dst, src, count, kind); } cudaError_t cudaMemsetAsync(void* ptr, int value, size_t count, cudaStream_t stream) { return Generators::GetCudaInterface()->cudaMemsetAsync(ptr, value, count, stream); } -cudaError_t cudaMemset(void* ptr, int value, size_t count) { return Generators::GetCudaInterface()->cudaMemset(ptr, value, count); } #endif diff --git a/src/generators.h b/src/generators.h index 3e4561b14..7275aa976 100644 --- a/src/generators.h +++ b/src/generators.h @@ -49,9 +49,24 @@ struct Tokenizer; template DeviceSpan WrapTensor(DeviceInterface& device, OrtValue& value) { - return device.WrapMemory(std::span{value.GetTensorMutableData(), value.GetTensorTypeAndShapeInfo()->GetElementCount()}); + auto info = value.GetTensorTypeAndShapeInfo(); + assert(info->GetElementType() == Ort::TypeToTensorType>); + return device.WrapMemory(std::span{value.GetTensorMutableData(), info->GetElementCount()}); } +DeviceSpan ByteWrapTensor(DeviceInterface& device, OrtValue& value); + +template +struct OrtTensor { + OrtTensor(std::unique_ptr ort_value, DeviceInterface& device) + : ort_value_{std::move(ort_value)}, device_span_{WrapTensor(device, *ort_value_)} {} + + operator OrtValue*() { return ort_value_.get(); } + + std::unique_ptr ort_value_; + DeviceSpan device_span_; +}; + // OgaSequences are a vector of int32 vectors using TokenSequences = std::vector>; @@ -60,6 +75,7 @@ enum struct DeviceType { CUDA, DML, WEBGPU, + MAX }; std::string to_string(DeviceType device_type); @@ -136,10 +152,7 @@ struct OrtGlobals { OrtGlobals(); std::unique_ptr env_; -#if USE_CUDA - std::unique_ptr memory_info_cuda_; - std::unique_ptr allocator_cuda_; -#endif + std::unique_ptr allocator_device_[static_cast(DeviceType::MAX)]; private: OrtGlobals(const OrtGlobals&) = delete; void operator=(const OrtGlobals&) = delete; @@ -155,6 +168,8 @@ std::shared_ptr CreateGeneratorParams(const Model& model); std::shared_ptr CreateGeneratorParams(const Config& config); // For benchmarking purposes only std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params); +void CopyThroughCpu(DeviceBuffer& dest, size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes); + float Float16ToFloat32(uint16_t v); // v is a IEEE 752-2008 binary16 format, 1 sign bit, 5 bit exponent, 10 bit fraction void top_k_indices(std::span top_k, std::span inputs); diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 84c8bec11..baaa9243e 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -19,7 +19,7 @@ void CapturedGraphInfoRecycler::operator()(CapturedGraphInfo* captured_graph_inf } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA)) { return nullptr; } @@ -48,12 +48,6 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, size_t max_beam_batch_size = static_cast(params.search.num_beams) * params.max_batch_size; new_captured_graph->sb_input_ids_ = std::make_unique(allocator_device_, max_beam_batch_size); -#if USE_DML - if 
(model.device_type_ == DeviceType::DML) { - new_captured_graph->sb_input_ids_int32_ = std::make_unique(allocator_device_, max_beam_batch_size); - } -#endif - // Create the static buffers for the cache int layer_count = config_->model.decoder.num_hidden_layers; new_captured_graph->sb_kv_caches_.reserve(layer_count * 2); @@ -70,13 +64,6 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, // Create the static buffer for the attention mask, if needed if (session_info_->HasInput(config_->model.decoder.inputs.attention_mask)) { new_captured_graph->sb_attention_mask_ = std::make_unique(allocator_device_, max_beam_batch_size); - -#if USE_DML - // DML currently needs an additional static buffer for the mask - if (model.device_type_ == DeviceType::DML) { - new_captured_graph->sb_attention_mask_next_ = std::make_unique(allocator_device_, max_beam_batch_size); - } -#endif } auto output_type = session_info_->GetOutputDataType(config_->model.decoder.outputs.logits); diff --git a/src/models/extra_inputs.cpp b/src/models/extra_inputs.cpp index 6827042d3..a4bab8ce7 100644 --- a/src/models/extra_inputs.cpp +++ b/src/models/extra_inputs.cpp @@ -28,11 +28,6 @@ ExtraInputs::ExtraInputs(State& state) } } -#pragma warning(push) -#pragma warning(disable : 4065) // switch statement contains 'default' but no 'case' labels -#pragma warning(disable : 4189) // local variable is initialized but not referenced -#pragma warning(disable : 4702) // unreachable code - void ExtraInputs::Add() { // Add extra user inputs for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) { @@ -42,44 +37,11 @@ void ExtraInputs::Add() { // Copy the data from the CPU-backed ORT value to the static buffers for (int i = 0; i < sb_extra_inputs_.size(); ++i) { - auto type_and_shape_info = extra_inputs_[i]->GetTensorTypeAndShapeInfo(); - auto shape = type_and_shape_info->GetShape(); - auto element_count = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()); - auto copy_size_in_bytes = element_count * SizeOf(type_and_shape_info->GetElementType()); - - switch (model_.device_type_) { -#if USE_DML - case DeviceType::DML: { - ComPtr target_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, extra_inputs_[i]->GetTensorMutableRawData(), &target_resource)); - - auto source = std::span(state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorData(), copy_size_in_bytes); - - model_.GetDmlUploadHeap()->BeginUploadToGpu( - target_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - source); - } break; -#endif - -#if USE_CUDA - case DeviceType::CUDA: { - cudaMemcpyAsync( - extra_inputs_[i]->GetTensorMutableRawData(), - state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorMutableRawData(), - copy_size_in_bytes, - cudaMemcpyHostToDevice, - model_.cuda_stream_); - } break; -#endif - - default: - throw std::runtime_error("Unsupported device for graph capture"); - } + auto tensor = ByteWrapTensor(*model_.p_device_, *extra_inputs_[i]); + auto source = std::span{state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorData(), tensor.size()}; + copy(source, tensor.CpuSpan()); + tensor.CopyCpuToDevice(); } } -#pragma warning(pop) - } // namespace Generators diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 71f051bfc..47233f3d8 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -13,12 +13,6 @@ InputIDs::InputIDs(State& state) if (state_.GetCapturedGraphInfo()) { sb_input_ids_ = 
state_.GetCapturedGraphInfo()->sb_input_ids_.get(); - -#if USE_DML - if (model_.device_type_ == DeviceType::DML) { - sb_input_ids_int32_ = state_.GetCapturedGraphInfo()->sb_input_ids_int32_.get(); - } -#endif } if (model_.session_info_->HasInput(model_.config_->model.decoder.inputs.current_sequence_length) && @@ -56,29 +50,26 @@ void InputIDs::Add() { } void InputIDs::Update(DeviceSpan& new_tokens) { - const auto get_unpadded_sequence_length = [](std::span input_ids, - int32_t pad_token_id) { - int32_t seq_length = 0; + auto new_tokens_cpu = new_tokens.CopyDeviceToCpu(); + + const auto get_unpadded_sequence_length = [](std::span input_ids, int32_t pad_token_id) { for (int32_t i = 0; i < input_ids.size(); i++) { - if (input_ids[i] == pad_token_id) { - break; - } - seq_length++; + if (input_ids[i] == pad_token_id) + return i; } - return seq_length; + return static_cast(input_ids.size()); }; if (current_sequence_length_ && past_sequence_length_) { if (state_.params_->BatchBeamSize() != 1) { throw std::runtime_error("Batch size must be 1 for current_sequence_length and past_sequence_length inputs"); } - auto new_sequence_length = get_unpadded_sequence_length(new_tokens.CpuSpan(), model_.config_->model.pad_token_id); + auto new_sequence_length = get_unpadded_sequence_length(new_tokens_cpu, model_.config_->model.pad_token_id); *current_sequence_length_->GetTensorMutableData() += new_sequence_length; *past_sequence_length_->GetTensorMutableData() += new_sequence_length; } - // Resize input_ids shape based on new_tokens - // For beam search + // For beam search, resize input_ids shape based on new_tokens size_t sequence_length = static_cast(new_tokens.size()) / state_.params_->BatchBeamSize(); if (is_prompt_ && state_.params_->search.num_beams > 1) sequence_length = static_cast(new_tokens.size()) / state_.params_->search.batch_size; @@ -87,20 +78,8 @@ void InputIDs::Update(DeviceSpan& new_tokens) { shape_[1] = sequence_length; if (!sb_input_ids_) { value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); - -#if USE_DML - if (model_.device_type_ == DeviceType::DML) { - value_int32_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); - } -#endif } else { value_ = sb_input_ids_->CreateTensorOnStaticBuffer(shape_, type_); - -#if USE_DML - if (model_.device_type_ == DeviceType::DML) { - value_int32_ = sb_input_ids_int32_->CreateTensorOnStaticBuffer(shape_, Ort::TypeToTensorType); - } -#endif } state_.inputs_[input_index_] = value_.get(); @@ -121,33 +100,8 @@ void InputIDs::Update(DeviceSpan& new_tokens) { #endif } break; - case DeviceType::DML: { -#if USE_DML - ComPtr source_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, value_int32_->GetTensorMutableRawData(), &source_resource)); - - auto source = std::span( - reinterpret_cast(new_tokens.CpuSpan().data()), - new_tokens.CpuSpan().size_bytes()); - - model_.GetDmlUploadHeap()->BeginUploadToGpu( - source_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - source); - - DmlHelpers::DmlCastInputToOutput( - model_.GetDmlExecutionContext(), - *model_.allocator_device_, - *value_int32_, - value_, - model_.GetDmlDevice(), - model_.GetOrtDmlApi(), - input_ids_cast_command_list_state_); -#endif - } break; default: { - // CPU, WEBGPU + // CPU, DML, WEBGPU auto* data = value_->GetTensorMutableData(); auto next_tokens = new_tokens.Span(); for (int b = 0; b < shape_[0]; b++) { diff --git a/src/models/input_ids.h b/src/models/input_ids.h index 
dd364212f..02af3a98a 100644 --- a/src/models/input_ids.h +++ b/src/models/input_ids.h @@ -35,12 +35,6 @@ struct InputIDs { // Used for decoding runs with cuda graphs. StaticBuffer* sb_input_ids_{}; -#if USE_DML - std::unique_ptr value_int32_; - StaticBuffer* sb_input_ids_int32_{}; - DmlReusedCommandListState input_ids_cast_command_list_state_{}; -#endif - std::unique_ptr current_sequence_length_; std::unique_ptr past_sequence_length_; }; diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 5cb66ade3..4fa3243e2 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -220,23 +220,13 @@ KV_Cache::KV_Cache(State& state) } } - auto kv_cache_size_bytes = SizeOf(type_) * shape_[0] * shape_[1] * shape_[2] * shape_[3]; try { for (int i = 0; i < layer_count_ * 2; ++i) { presents_.push_back( sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_) : sb_kv_caches_[i]->CreateTensorOnStaticBuffer(shape_, type_)); -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - cudaMemsetAsync(presents_.back()->GetTensorMutableRawData(), 0, kv_cache_size_bytes, model_.cuda_stream_); - } else -#endif - { - if (model_.device_type_ == DeviceType::CPU) { - // FIXME: this is a device ternsor and we can only use memset for cpu. Revisit for other EPs. - memset(presents_.back()->GetTensorMutableRawData(), 0, kv_cache_size_bytes); - } - } + // Zero the memory so we don't leak any data from the previous run + ByteWrapTensor(*model_.p_device_, *presents_.back()).Zero(); } } catch (const Ort::Exception&) { std::ostringstream oss; diff --git a/src/models/logits.cpp b/src/models/logits.cpp index d4821c4b1..ae7d9bdcd 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -37,9 +37,6 @@ Logits::Logits(State& state) input_sequence_lengths.resize(state_.params_->search.batch_size); } -#pragma warning(push) -#pragma warning(disable : 4189) // local variable is initialized but not referenced - DeviceSpan Logits::Get() { size_t element_count = shape_[0] * shape_[1] * shape_[2]; @@ -50,7 +47,6 @@ DeviceSpan Logits::Get() { const size_t seq_length = shape_[1]; const size_t vocab_size = shape_[2]; const size_t num_beams = state_.params_->search.num_beams; - const size_t element_count_last_token = shape_[0] * shape_[2]; // create new OrtValue for logits_of_last_token and use output_last_tokens_ to hold it output_last_tokens_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_last, type_); @@ -60,54 +56,20 @@ DeviceSpan Logits::Get() { logits_of_last_token = output_last_tokens_.get(); - size_t element_size = type_ == Ort::TypeToTensorType ? 
4 : 2; + size_t element_size = SizeOf(type_); size_t vocab_index = 0; // Simpler math to have this index go up by vocab_size for every logit chunk we process + auto logits_raw = ByteWrapTensor(*state_.params_->p_device, *output_raw_); + auto logits_last_tokens = ByteWrapTensor(*state_.params_->p_device, *logits_of_last_token); + for (int batch_index = 0; batch_index < state_.params_->search.batch_size; batch_index++) { // Find the first non pad token from the end size_t token_index = input_sequence_lengths[batch_index] - 1; for (int beam_index = 0; beam_index < num_beams; beam_index++) { - switch (model_.device_type_) { - case DeviceType::DML: { -#if USE_DML - ComPtr source_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, output_raw_->GetTensorMutableRawData(), &source_resource)); - - ComPtr target_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, logits_of_last_token->GetTensorMutableRawData(), &target_resource)); - - uint64_t source_offset = (vocab_index * seq_length + token_index * vocab_size) * element_size; - uint64_t target_offset = vocab_index * element_size; - uint64_t size_in_bytes = vocab_size * element_size; - - model_.GetDmlExecutionContext()->CopyBufferRegion( - target_resource.Get(), - target_offset, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - source_resource.Get(), - source_offset, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - size_in_bytes); -#endif - } break; - - default: { - // CPU, CUDA, WEBGPU - auto logits_raw = std::span{output_raw_->GetTensorMutableData(), element_count * element_size}; - auto logits_last_tokens = std::span{logits_of_last_token->GetTensorMutableData(), element_count_last_token * element_size}; - auto target = logits_last_tokens.subspan(vocab_index * element_size, vocab_size * element_size); - auto source = logits_raw.subspan((vocab_index * seq_length + token_index * vocab_size) * element_size, vocab_size * element_size); - if (model_.device_type_ == DeviceType::CUDA) -#if USE_CUDA - CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_->cuda_stream); -#else - throw std::runtime_error("Unexpected CUDA device usage"); -#endif - else - copy(source, target); - } break; - } + auto target = logits_last_tokens.subspan(vocab_index * element_size, vocab_size * element_size); + auto source = logits_raw.subspan((vocab_index * seq_length + token_index * vocab_size) * element_size, vocab_size * element_size); + target.CopyFrom(source); vocab_index += vocab_size; } } @@ -117,35 +79,10 @@ DeviceSpan Logits::Get() { // Convert from float16 to float32 if necessary if (type_ == Ort::TypeToTensorType) { -#if USE_DML - if (model_.device_type_ == DeviceType::DML) { - DmlHelpers::DmlCastInputToOutput( - model_.GetDmlExecutionContext(), - *model_.allocator_device_, - *logits_of_last_token, - logits_of_last_token_fp32_, - model_.GetDmlDevice(), - model_.GetOrtDmlApi(), - logits_cast_command_list_state_); - - logits_of_last_token = logits_of_last_token_fp32_.get(); - } else -#endif - { ConvertFp16ToFp32(*model_.allocator_device_, *logits_of_last_token, logits_of_last_token_fp32_, model_.device_type_, model_.cuda_stream_); logits_of_last_token = logits_of_last_token_fp32_.get(); - } } - assert(shape_[1] == 1); - -#if USE_DML - // DML doesn't support on-device scoring yet, so we need to download some data to the CPU - if (model_.device_type_ == DeviceType::DML) { - value32_cpu_ = 
OrtValue::CreateTensor(model_.allocator_cpu_, shape_last); - } -#endif - if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) logits_ = WrapTensor(*state_.params_->p_device, *logits_of_last_token); @@ -162,36 +99,11 @@ DeviceSpan Logits::Get() { return logits_; } #endif -#if USE_DML - if (model_.device_type_ == DeviceType::DML) { - // DML doesn't support on-device scoring yet, so we transfer the data to the CPU - ComPtr gpu_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation( - model_.allocator_device_, - logits_of_last_token->GetTensorMutableData(), - &gpu_resource)); - auto cpu_tensor = value32_cpu_->GetTensorMutableData(); - - model_.GetDmlReadbackHeap()->ReadbackFromGpu( - std::span(reinterpret_cast(cpu_tensor), element_count * sizeof(float)), - gpu_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - - auto batched_logits_cpu = cpu_span{cpu_tensor, element_count}; - HandleEOSArray(batched_logits_cpu); - - logits_ = WrapTensor(*state_.params_->p_device, *value32_cpu_); - return logits_; - } -#endif HandleEOSArray(logits_.Span()); return logits_; } -#pragma warning(pop) - void Logits::Update(const DeviceSpan& next_tokens, size_t new_kv_length) { if (static_cast(output_raw_.get()->GetTensorTypeAndShapeInfo()->GetShape()[1]) == new_kv_length && new_kv_length == 1) { return; diff --git a/src/models/logits.h b/src/models/logits.h index f723c48ee..187ecd8cb 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -46,11 +46,6 @@ struct Logits { #if USE_CUDA DeviceSpan cuda_eos_token_ids_; // eos_token_ids from params, but in cuda accessible memory #endif - -#if USE_DML - DmlReusedCommandListState logits_cast_command_list_state_{}; - std::unique_ptr value32_cpu_; -#endif }; } // namespace Generators diff --git a/src/models/model.cpp b/src/models/model.cpp index 4430fdb41..9270a2cc7 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -15,11 +15,7 @@ #include "multi_modal_vision_model.h" #include "decoder_only_pipeline.h" #if USE_DML -#include -#include "dml_provider_factory.h" -#include "../dml/dml_helpers.h" - -std::string CurrentModulePath(); +#include "../dml/interface.h" #endif namespace Generators { @@ -222,20 +218,20 @@ int32_t Tokenizer::TokenToTokenId(const char* token) const { return token_id; } -#if USE_CUDA // Since Python/Others can and will hold onto a generator object past the model object's lifetime we need to ensure // the allocator used is not destroyed until last. This keeps the allocator around until exit, after all other memory // has been destroyed. Without this, we will crash in the Onnxruntime BFCArena code when deleting tensors due to the // arena already being destroyed. 
-Ort::Allocator* GetCudaAllocator(OrtSession& session) { - auto& globals = *GetOrtGlobals(); - if (!globals.allocator_cuda_) { - globals.memory_info_cuda_ = OrtMemoryInfo::Create("Cuda", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); - globals.allocator_cuda_ = Ort::Allocator::Create(session, *globals.memory_info_cuda_); +Ort::Allocator* GetDeviceAllocator(OrtSession& session, DeviceType type) { + auto& device = GetOrtGlobals()->allocator_device_[static_cast(type)]; + if (!device) { + static const char* device_type_names[static_cast(DeviceType::MAX)] = {"CPU", "Cuda", "DML", "WebGPU Buffer"}; + auto memory_info = OrtMemoryInfo::Create(device_type_names[static_cast(type)], OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + device = Ort::Allocator::Create(session, *memory_info); + GetDeviceInterface(type)->InitAllocator(*device); } - return globals.allocator_cuda_.get(); + return device.get(); } -#endif SessionInfo::SessionInfo(OrtSession& session) { Add(session); @@ -290,27 +286,15 @@ Model::~Model() = default; void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) { allocator_device_ = &allocator_cpu_; -#if USE_CUDA - if (device_type_ == DeviceType::CUDA) { - allocator_device_ = GetCudaAllocator(session); - } -#endif -#if USE_DML - if (device_type_ == DeviceType::DML) { - memory_info_device_ = OrtMemoryInfo::Create("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); - dml_owned_allocator_ = Ort::Allocator::Create(session, *memory_info_device_); - allocator_device_ = dml_owned_allocator_.get(); - } -#endif + if (device_type_== DeviceType::CUDA) + allocator_device_ = GetDeviceAllocator(session, device_type_); + allocator_kvcache_ = allocator_device_; -#if USE_WEBGPU - if (device_type_ == DeviceType::WEBGPU) { - // for webgpu we only use device memory for kv_cache - memory_info_device_ = OrtMemoryInfo::Create("WebGPU_Buffer", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); - webgpu_owned_allocator_ = Ort::Allocator::Create(session, *memory_info_device_); - allocator_kvcache_ = webgpu_owned_allocator_.get(); + if (device_type_ == DeviceType::WEBGPU || device_type_ == DeviceType::DML) { + // for dml and webgpu we only use device memory for kv_cache + allocator_kvcache_ = GetDeviceAllocator(session, device_type_); } -#endif + session_info_ = std::make_unique(session); captured_graph_pool_ = std::make_shared(config_.get(), session_info_.get(), allocator_device_); } @@ -436,52 +420,19 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ #if USE_DML } else if (provider_options.name == "dml") { if (!p_dml_api_) { - auto current_module_path = CurrentModulePath(); - - bool contains_device_luid = false; LUID device_luid{}; + LUID* p_device_luid{}; for (const auto& [name, value] : provider_options.options) { if (name == "luid") { if (auto separator_position = value.find(":"); separator_position != std::string::npos) { device_luid.HighPart = std::stol(value.substr(0, separator_position)); device_luid.LowPart = std::stol(value.substr(separator_position + 1)); - contains_device_luid = true; + p_device_luid = &device_luid; } } } - if (contains_device_luid) { - dml_objects_ = DmlHelpers::CreateDmlObjects(current_module_path, &device_luid); - } else { - dml_objects_ = DmlHelpers::CreateDmlObjects(current_module_path); - } - - constexpr auto directml_dll = "DirectML.dll"; - wil::unique_hmodule smart_directml_dll(LoadLibraryEx(directml_dll, nullptr, 0)); - 
THROW_LAST_ERROR_IF(!smart_directml_dll); - - if (LoadLibraryEx(directml_dll, nullptr, 0) == NULL) { - throw std::runtime_error("DirectML.dll not found"); - } - - auto dml_create_device1_fn = reinterpret_cast(GetProcAddress(smart_directml_dll.get(), "DMLCreateDevice1")); - THROW_LAST_ERROR_IF(!dml_create_device1_fn); - THROW_IF_FAILED(dml_create_device1_fn(dml_objects_.d3d12_device.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(&dml_device_))); - - Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&p_dml_api_))); - if (!p_dml_api_) { - throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); - } - - dml_execution_context_ = std::make_unique( - dml_objects_.d3d12_device.Get(), - dml_device_.Get(), - dml_objects_.command_queue.Get(), - *allocator_device_, - p_dml_api_); - - dml_pooled_upload_heap_ = std::make_unique(dml_objects_.d3d12_device.Get(), dml_execution_context_.get()); - dml_readback_heap_ = std::make_unique(dml_objects_.d3d12_device.Get(), dml_execution_context_.get()); + InitDmlInterface(p_device_luid); } if (!disable_graph_capture) { @@ -489,7 +440,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ session_options.AddConfigEntry("ep.dml.disable_memory_arena", "1"); } - p_dml_api_->SessionOptionsAppendExecutionProvider_DML1(&session_options, dml_device_.Get(), dml_objects_.command_queue.Get()); + SetDmlProvider(session_options); if (is_primary_session_options) device_type_ = DeviceType::DML; // We use a DML allocator for input/output caches, but other tensors will use CPU tensors @@ -520,6 +471,11 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ void Model::CreateSessionOptions() { session_options_ = OrtSessionOptions::Create(); +#if 0 + ClearProviders(*config_); + SetProviderOption(*config_, "dml", {}, {}); +#endif + CreateSessionOptionsFromConfig(config_->model.decoder.session_options, *session_options_, true, false); for (auto& pipeline_model : config_->model.decoder.pipeline) { diff --git a/src/models/model.h b/src/models/model.h index 573dd5b6c..55ad960b3 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -181,14 +181,9 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr dml_execution_context_; std::unique_ptr dml_readback_heap_; ComPtr dml_device_; - std::unique_ptr dml_owned_allocator_; #endif #if USE_WEBGPU - std::unique_ptr webgpu_owned_allocator_; std::unique_ptr webgpu_io_binding_; -#endif -#if USE_DML || USE_WEBGPU - std::unique_ptr memory_info_device_; #endif std::shared_ptr captured_graph_pool_; std::map> pipeline_session_options_; diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index a6180e4ee..5d4c82684 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -219,10 +219,7 @@ void PositionInputs::CreateNextAttentionMaskTensor(int total_length) { attention_mask_shape_[1] = state_.params_->search.max_length; attention_mask_next_ = sb_attention_mask_->CreateTensorOnStaticBuffer(attention_mask_shape_, type_); if (is_first_mask_update_) { - cudaMemsetAsync(attention_mask_next_->GetTensorMutableRawData(), - 0, - (type_ == Ort::TypeToTensorType ? 
sizeof(int32_t) : sizeof(int64_t)) * attention_mask_shape_[0] * attention_mask_shape_[1], - model_.cuda_stream_); + ByteWrapTensor(*model_.p_device_, *attention_mask_next_).Zero(); } #elif USE_DML attention_mask_shape_[1] = state_.params_->search.max_length; diff --git a/src/models/utils.cpp b/src/models/utils.cpp index 7f4d43629..dd4bef813 100644 --- a/src/models/utils.cpp +++ b/src/models/utils.cpp @@ -1,9 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "../generators.h" +#include "utils.h" namespace Generators { +DeviceSpan ByteWrapTensor(DeviceInterface& device, OrtValue& value) { + auto info = value.GetTensorTypeAndShapeInfo(); + return device.WrapMemory(std::span{value.GetTensorMutableData(), info->GetElementCount() * SizeOf(info->GetElementType())}); +} + size_t SizeOf(ONNXTensorElementDataType type) { switch (type) { case Ort::TypeToTensorType: diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index 2988b9108..723b7aeda 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -272,16 +272,12 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne } if (model_.session_info_->HasInput("cache_indirection")) { -#if USE_CUDA cache_indirection_ = OrtValue::CreateTensor(*model_.allocator_device_, std::array{params_->search.batch_size, params_->search.num_beams, params_->search.max_length}); cache_indirection_index_ = inputs_.size(); input_names_.push_back("cache_indirection"); inputs_.push_back(cache_indirection_.get()); - auto data = gpu_span{cache_indirection_->GetTensorMutableData(), - static_cast(params_->BatchBeamSize()) * params_->search.max_length}; - CudaCheck() == cudaMemsetAsync(data.data(), 0, data.size_bytes(), params_->cuda_stream); -#endif + ByteWrapTensor(*model_.p_device_, *cache_indirection_).Zero(); } if (model_.session_info_->HasOutput("output_cross_qk_0")) { diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 87cf6f4c5..8b9a87852 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -331,37 +331,13 @@ OgaResult* OGA_API_CALL OgaGenerator_GetOutput(const OgaGenerator* oga_generator auto& generator = *reinterpret_cast(oga_generator); auto* ortvalue_output = generator.state_->GetOutput(name); auto type_info = ortvalue_output->GetTensorTypeAndShapeInfo(); - std::unique_ptr ortvalue_clone = OrtValue::CreateTensor(generator.model_->allocator_cpu_, - type_info->GetShape(), - type_info->GetElementType()); + auto ortvalue_clone = OrtValue::CreateTensor(generator.model_->allocator_cpu_, type_info->GetShape(), type_info->GetElementType()); + // Copy data to ortvalue_clone - auto element_size = Generators::SizeOf(type_info->GetElementType()); - auto data_size = type_info->GetElementCount() * element_size; - if (ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_GPU && generator.model_->device_type_ == Generators::DeviceType::CUDA) { -#if USE_CUDA - cudaMemcpy(ortvalue_clone->GetTensorMutableRawData(), ortvalue_output->GetTensorMutableRawData(), data_size, cudaMemcpyDeviceToHost); -#endif - } else if (ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_GPU && generator.model_->device_type_ == Generators::DeviceType::DML) { -#if USE_DML - ComPtr gpu_resource; - Ort::ThrowOnError(generator.model_->GetOrtDmlApi()->GetD3D12ResourceFromAllocation( - generator.model_->allocator_device_, - ortvalue_output->GetTensorMutableRawData(), - &gpu_resource)); - auto cpu_tensor = ortvalue_clone->GetTensorMutableRawData(); 
- generator.model_->GetDmlReadbackHeap()->ReadbackFromGpu( - std::span(reinterpret_cast(cpu_tensor), data_size), - gpu_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS); -#endif - } else if (ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_CPU) { - std::copy(static_cast(ortvalue_output->GetTensorMutableRawData()), - static_cast(ortvalue_output->GetTensorMutableRawData()) + data_size, - static_cast(ortvalue_clone->GetTensorMutableRawData())); - } else { - throw std::runtime_error("Unsupported Device type: " + std::to_string(ortvalue_output->GetTensorMemoryInfo().GetDeviceType())); - } + bool is_cpu = ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_CPU; + auto output_span = Generators::ByteWrapTensor(is_cpu ? *Generators::GetDeviceInterface(Generators::DeviceType::CPU) : *generator.model_->p_device_, *ortvalue_output); + auto copy_span = Generators::ByteWrapTensor(*Generators::GetDeviceInterface(Generators::DeviceType::CPU), *ortvalue_clone); + copy_span.CopyFrom(output_span); auto tensor = std::make_shared(std::move(ortvalue_clone)); tensor->external_owner_ = tensor; diff --git a/src/python/python.cpp b/src/python/python.cpp index 4ee8f87ed..0d825665d 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -12,10 +12,6 @@ #include "../logging.h" #include "../smartptrs.h" -#if USE_CUDA -#include "../cuda/cuda_common.h" -#endif - using namespace pybind11::literals; // If a parameter to a C++ function is an array of float16, this type will let pybind11::array_t map to numpy's float16 format @@ -143,50 +139,21 @@ pybind11::array ToNumpy(OrtValue* v, const Generators::Model& model) { auto shape = type_info->GetShape(); auto type = type_info->GetElementType(); auto element_size = Generators::SizeOf(type); - auto data = v->GetTensorMutableRawData(); - - std::unique_ptr cpu_copy; - -#if USE_DML - // TODO: DML version of this - if (v->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_GPU && model.device_type_ == Generators::DeviceType::DML) { - auto data_size = type_info->GetElementCount() * element_size; - cpu_copy = std::make_unique(data_size); - - ComPtr gpu_resource; - Ort::ThrowOnError(model.GetOrtDmlApi()->GetD3D12ResourceFromAllocation( - model.allocator_device_, - data, - &gpu_resource)); - - model.GetDmlReadbackHeap()->ReadbackFromGpu( - std::span(reinterpret_cast(cpu_copy.get()), data_size), - gpu_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - data = cpu_copy.get(); - } -#endif -#if USE_CUDA - if (v->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_GPU && model.device_type_ == Generators::DeviceType::CUDA) { - auto data_size = type_info->GetElementCount() * element_size; - cpu_copy = std::make_unique(data_size); - Generators::CudaCheck() == cudaMemcpy(cpu_copy.get(), data, data_size, cudaMemcpyDeviceToHost); - data = cpu_copy.get(); - } -#endif std::vector strides(shape.size()); { - auto size = Generators::SizeOf(type); + auto size = element_size; for (size_t i = strides.size(); i-- > 0;) { strides[i] = size; size *= shape[i]; } } + bool is_cpu = v->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_CPU; + auto device_span = Generators::ByteWrapTensor(is_cpu ? 
*Generators::GetDeviceInterface(Generators::DeviceType::CPU) : *model.p_device_, *v); + pybind11::buffer_info bufinfo{ - data, // Pointer to memory buffer + device_span.CopyDeviceToCpu().data(), // Pointer to memory buffer static_cast(element_size), // Size of underlying scalar type ToFormatDescriptor(type), // Python struct-style format descriptor static_cast(shape.size()), // Number of dimensions diff --git a/src/search.cpp b/src/search.cpp index 274a5b2dd..c4aacf333 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -2,16 +2,18 @@ #include "softmax.h" #include "search.h" #include "beam_search_scorer.h" +#include "cpu/interface.h" #include #include namespace Generators { Search_Cpu::Search_Cpu(const GeneratorParams& params) - : Search{params} { + : Search{params}, + cpu_device_{*GetCpuInterface()} { auto batch_beam_size = params.BatchBeamSize(); - sequence_lengths_ = params.p_device->Allocate(batch_beam_size); + sequence_lengths_ = cpu_device_.Allocate(batch_beam_size); } GreedySearch_Cpu::GreedySearch_Cpu(const GeneratorParams& params) @@ -26,9 +28,9 @@ GreedySearch_Cpu::GreedySearch_Cpu(const GeneratorParams& params) gen_.seed(seq); } - next_tokens_ptr_ = params.p_device->Allocate(params.search.batch_size); + next_tokens_ptr_ = cpu_device_.Allocate(params.search.batch_size); + next_tokens_ptr_.Zero(); next_tokens_ = cpu_span(next_tokens_ptr_.Span()); - memset(next_tokens_.data(), 0, next_tokens_.size_bytes()); eos_seen_buffer_ = AllocateArray(params.search.batch_size, &eos_seen_); memset(eos_seen_.data(), 0, eos_seen_.size_bytes()); @@ -368,7 +370,6 @@ DeviceSpan BeamSearch_Cpu::GetSequence(size_t index) { return beam_scorer_->GetBeamHypotheses(batch_id, beam_id); } -// TODO(aciddelgado): my question is, should this return copy or reference? A: A copy, as with DeviceSpan it's like a span DeviceSpan BeamSearch_Cpu::GetSequence(size_t batch_id, size_t beam_id) { Finalize(params_->search.num_return_sequences); return beam_scorer_->GetBeamHypotheses(batch_id, beam_id); diff --git a/src/search.h b/src/search.h index 35fc521d8..9d456368c 100644 --- a/src/search.h +++ b/src/search.h @@ -52,6 +52,8 @@ struct Search_Cpu : Search { std::span GetScores(int batch_beam_index); + DeviceInterface& cpu_device_; + DeviceSpan sequence_lengths_; // shape (beam_size*batch_size) cpu_span next_tokens_; // shape (beam_size*batch_size) @@ -82,7 +84,6 @@ struct GreedySearch_Cpu : Search_Cpu { bool PadIfAlreadyEOS(size_t batch_id); - std::unique_ptr next_tokens_buffer_; DeviceSpan next_tokens_ptr_; std::unique_ptr temp_topk_buffer_; diff --git a/src/smartptrs.h b/src/smartptrs.h index ac7037957..fbaf12ed2 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -5,6 +5,10 @@ #include #include "span.h" +namespace Ort { +struct Allocator; +} + namespace Generators { struct Search; struct Sequences; @@ -21,6 +25,7 @@ struct DeviceBuffer : std::enable_shared_from_this { virtual void CopyDeviceToCpu() = 0; // Allocates p_cpu_ if necessary and copies p_device_ memory into it virtual void CopyCpuToDevice() = 0; virtual void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) = 0; + virtual void Zero() = 0; // Zero out the device memory uint8_t* p_device_{}; uint8_t* p_cpu_{}; @@ -38,6 +43,8 @@ struct DeviceSpan { bool empty() const { return length_ == 0; } size_t size() const { return length_; } + operator DeviceSpan() const { return DeviceSpan(*p_device_memory_, begin_, length_); } + DeviceSpan subspan(size_t begin, size_t length) { return DeviceSpan(*p_device_memory_, begin_ + 
begin, length); } // Return the device accessible memory. Should only be done in device specific code, as it's not CPU accessible @@ -58,24 +65,35 @@ struct DeviceSpan { // Copy CPU memory to device memory, typically used after calling CpuSpan or CopyDeviceToCpu to update the device memory with the modifications made void CopyCpuToDevice() { p_device_memory_->CopyCpuToDevice(); } + // Zero out the device memory + void Zero() { p_device_memory_->Zero(); } + + void CopyFrom(const DeviceSpan& source) { + assert(source.size() == size()); // Spans must be the same size to copy + p_device_memory_->CopyFrom(begin_ * sizeof(T), *source.p_device_memory_, source.begin_ * sizeof(T), length_ * sizeof(T)); + } + private: DeviceSpan(DeviceBuffer& memory, size_t begin, size_t length) : p_device_memory_{memory.shared_from_this()}, begin_{begin}, length_{length} {} std::shared_ptr p_device_memory_; size_t begin_{}, length_{}; // Subspan of p_device_memory_, relative to original memory block + template + friend struct DeviceSpan; // All DeviceSpans are friends }; struct DeviceInterface { virtual ~DeviceInterface() {} + virtual void InitAllocator(Ort::Allocator& allocator) = 0; template - DeviceSpan Allocate(size_t count, bool cpu_accessible = false) { return DeviceSpan(AllocateBase(sizeof(T) * count, cpu_accessible)); } - virtual std::shared_ptr AllocateBase(size_t size, bool cpu_accessible) = 0; + DeviceSpan Allocate(size_t count) { return DeviceSpan(AllocateBase(sizeof(T) * count)); } + virtual std::shared_ptr AllocateBase(size_t size) = 0; // Wraps an existing memory block, useful for tensors. Use WrapTensor for OrtValue vs calling this directly template - DeviceSpan WrapMemory(std::span memory) { return DeviceSpan(WrapMemoryBase(memory.data(), memory.size_bytes())); } + DeviceSpan WrapMemory(std::span memory) { return DeviceSpan(WrapMemoryBase(const_cast*>(memory.data()), memory.size_bytes())); } virtual std::shared_ptr WrapMemoryBase(void* memory, size_t size) = 0; virtual std::unique_ptr CreateGreedy(const GeneratorParams& params) = 0; diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 56cd1644f..18caa0a10 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -32,6 +32,9 @@ TEST(CAPITests, Config) { #endif } +#undef USE_CUDA +#define USE_CUDA 0 + TEST(CAPITests, TokenizerCAPI) { #if TEST_PHI2 auto config = OgaConfig::Create(PHI2_PATH); diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index eb7f04cd9..c15478009 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -14,6 +14,9 @@ #define MODEL_PATH "../../test/test_models/" #endif +#undef USE_CUDA +#define USE_CUDA 0 + // Defined in sampling_tests.cpp void CreateRandomLogits(float* logits, int num_large, int vocab_size, int batch_size, std::mt19937& engine); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index a71910b15..0482f704d 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -13,6 +13,9 @@ #define MODEL_PATH "../../test/test_models/" #endif +#undef USE_CUDA +#define USE_CUDA 0 + template auto AllocateFromCpuMem(Generators::DeviceInterface& device, std::span cpu_memory) { auto memory = device.Allocate(cpu_memory.size()); From 41b462a5babaa0b0ef7b9e9107435cf7e7197b52 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 15 Jan 2025 15:34:15 -0800 Subject: [PATCH 03/31] Finish refactoring model processing Remove as many #if USE_CUDA/USE_DML as possible --- src/cpu/interface.cpp | 48 +++++++- src/cuda/beam_search_scorer_cuda.cpp | 2 +- 
src/cuda/beam_search_scorer_cuda.cuh | 1 + src/cuda/cuda_sampling.cu | 13 +- src/cuda/cuda_sampling.cuh | 2 +- src/cuda/interface.cpp | 118 +++++++++--------- src/cuda/interface.h | 24 ---- src/{models => cuda}/kernels.h | 2 - src/cuda/model_kernels.cu | 30 ----- src/cuda/search_cuda.cpp | 34 +++--- src/cuda/search_cuda.h | 3 +- src/dml/interface.cpp | 60 ++++++++- src/generators.cpp | 57 ++------- src/generators.h | 9 +- src/models/audio_processor.cpp | 2 +- src/models/captured_graph_pool.h | 5 - src/models/debugging.cpp | 34 +----- src/models/extra_inputs.cpp | 1 - src/models/input_ids.cpp | 76 +++--------- src/models/input_ids.h | 3 +- src/models/kv_cache.cpp | 119 ++++++------------ src/models/logits.cpp | 17 +-- src/models/logits.h | 2 - src/models/model.cpp | 132 ++++---------------- src/models/model.h | 31 +---- src/models/position_inputs.cpp | 169 +++----------------------- src/models/position_inputs.h | 24 ---- src/models/prompt_image_processor.cpp | 2 +- src/models/whisper.cpp | 20 ++- src/smartptrs.h | 16 ++- test/model_tests.cpp | 8 +- 31 files changed, 333 insertions(+), 731 deletions(-) rename src/{models => cuda}/kernels.h (92%) diff --git a/src/cpu/interface.cpp b/src/cpu/interface.cpp index b27882668..c6705fa3b 100644 --- a/src/cpu/interface.cpp +++ b/src/cpu/interface.cpp @@ -3,6 +3,7 @@ #include "../generators.h" #include "../search.h" +#include "../models/utils.h" #include "interface.h" namespace Generators { @@ -13,7 +14,7 @@ const char* label_cpu = "cpu"; struct CpuMemory final : DeviceBuffer { CpuMemory(size_t size) : owned_{true} { size_in_bytes_ = size; - p_cpu_ = p_device_ = new uint8_t[size_in_bytes_]; + p_cpu_ = p_device_ = static_cast(ort_allocator_->Alloc(size_in_bytes_)); } CpuMemory(void* p, size_t size) : owned_{false} { @@ -23,7 +24,7 @@ struct CpuMemory final : DeviceBuffer { ~CpuMemory() override { if (owned_) - delete[] p_device_; + ort_allocator_->Free(p_device_); } const char* GetType() const override { return label_cpu; } @@ -42,11 +43,19 @@ struct CpuMemory final : DeviceBuffer { }; struct CpuInterface : DeviceInterface { - void InitAllocator(Ort::Allocator& allocator) override { + CpuInterface() { + InitOrt(*Ort::api, Ort::Allocator::GetWithDefaultOptions()); + } + + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { assert(!ort_allocator_); ort_allocator_ = &allocator; } + Ort::Allocator& GetAllocator() override { + return *ort_allocator_; + } + std::shared_ptr AllocateBase(size_t size) override { return std::make_shared(size); } @@ -55,6 +64,39 @@ struct CpuInterface : DeviceInterface { return std::make_shared(p, size); } + bool Cast(OrtValue& input, OrtValue& output) override { + auto input_info = input.GetTensorTypeAndShapeInfo(); + auto output_info = output.GetTensorTypeAndShapeInfo(); + + auto input_type = input_info->GetElementType(); + auto output_type = output_info->GetElementType(); + + auto element_count = input_info->GetElementCount(); + if (element_count != output_info->GetElementCount()) + throw std::runtime_error("Cast - input and output element counts do not match"); + if (input_type == output_type) + throw std::runtime_error("Cast - input and output types are the same"); + + if (input_type == Ort::TypeToTensorType && output_type == Ort::TypeToTensorType) { + auto* fp32 = input.GetTensorData(); + auto* fp16 = output.GetTensorMutableData(); + for (size_t i = 0; i < element_count; i++) + fp16[i] = FastFloat32ToFloat16(fp32[i]); + } else if (input_type == Ort::TypeToTensorType && output_type == 
Ort::TypeToTensorType) { + auto* fp16 = input.GetTensorData(); + auto* fp32 = output.GetTensorMutableData(); + for (size_t i = 0; i < element_count; i++) + fp32[i] = FastFloat16ToFloat32(fp16[i]); + } else if (input_type == Ort::TypeToTensorType && output_type == Ort::TypeToTensorType) { + auto* input_data = input.GetTensorData(); + auto* output_data = output.GetTensorMutableData(); + for (size_t i = 0; i < element_count; i++) + output_data[i] = input_data[i]; + } else + throw std::runtime_error("Cast - Unimplemented cast"); + return true; + } + std::unique_ptr CreateGreedy(const GeneratorParams& params) override { return std::make_unique(params); } std::unique_ptr CreateBeam(const GeneratorParams& params) override { return std::make_unique(params); } diff --git a/src/cuda/beam_search_scorer_cuda.cpp b/src/cuda/beam_search_scorer_cuda.cpp index a321af4d7..e2295e00d 100644 --- a/src/cuda/beam_search_scorer_cuda.cpp +++ b/src/cuda/beam_search_scorer_cuda.cpp @@ -8,7 +8,7 @@ namespace Generators { BeamSearchScorer_Cuda::BeamSearchScorer_Cuda(const GeneratorParams& parameters) - : stream_{parameters.cuda_stream} { + : stream_{GetStream()} { state_cpu_ = CudaMallocHostArray(1); state_cpu_->batch_size_ = static_cast(parameters.search.batch_size); state_cpu_->num_beams_ = static_cast(parameters.search.num_beams); diff --git a/src/cuda/beam_search_scorer_cuda.cuh b/src/cuda/beam_search_scorer_cuda.cuh index 68be19fee..9e441d4bd 100644 --- a/src/cuda/beam_search_scorer_cuda.cuh +++ b/src/cuda/beam_search_scorer_cuda.cuh @@ -1,3 +1,4 @@ +#include "models/onnxruntime_api.h" #include "smartptrs.h" namespace Generators { diff --git a/src/cuda/cuda_sampling.cu b/src/cuda/cuda_sampling.cu index 8b5720479..f6be0b623 100644 --- a/src/cuda/cuda_sampling.cu +++ b/src/cuda/cuda_sampling.cu @@ -8,6 +8,7 @@ #include "span.h" #include "beam_search_topk.h" #include "cuda_sampling.cuh" +#include "models/onnxruntime_api.h" #include "smartptrs.h" #include #include @@ -297,22 +298,22 @@ __global__ void SoftmaxBlockForward(outscalar_t* output, scalar_t* input, int cl } template -void DispatchBlockwiseSoftmaxForward(cudaStream_t* stream, float* output, const float* input, int softmax_elements, +void DispatchBlockwiseSoftmaxForward(cudaStream_t stream, float* output, const float* input, int softmax_elements, int input_stride, int output_stride, int batch_count, float temperature) { dim3 grid(batch_count); constexpr int ILP = sizeof(float4) / sizeof(float); dim3 block = SoftmaxGetBlockSize(ILP, softmax_elements); if (is_log_softmax) { SoftmaxBlockForward - <<>>(output, const_cast(input), + <<>>(output, const_cast(input), softmax_elements, input_stride, output_stride, temperature); } else { SoftmaxBlockForward - <<>>(output, const_cast(input), + <<>>(output, const_cast(input), softmax_elements, input_stride, output_stride, temperature); } } -template void DispatchBlockwiseSoftmaxForward(cudaStream_t*, float*, const float*, int, int, int, int, float); +template void DispatchBlockwiseSoftmaxForward(cudaStream_t, float*, const float*, int, int, int, int, float); // Populate Kernels and Launchers @@ -521,7 +522,7 @@ void LaunchSampleKernel(SamplingData* data, cudaStream_t stream, float* scores, void SoftmaxAndSort(SamplingData* data, cudaStream_t stream, float* scores_in, float* scores_out, int* indices_out, int vocab_size, int batch_size, float temperature) { // Softmax scores std::span scores{data->scores_softmaxed.get(), static_cast(vocab_size * batch_size)}; - DispatchBlockwiseSoftmaxForward(&stream, scores.data(), 
const_cast(scores_in), vocab_size, vocab_size, vocab_size, batch_size, temperature); + DispatchBlockwiseSoftmaxForward(stream, scores.data(), const_cast(scores_in), vocab_size, vocab_size, vocab_size, batch_size, temperature); // Sort indices by scores std::span offsets_gpu{data->offsets.get(), static_cast(batch_size + 1)}; LaunchPopulateOffsets(offsets_gpu.data(), vocab_size, batch_size, stream); @@ -550,7 +551,7 @@ void LaunchGetTopKSubsetFullSort(SamplingData* data, cudaStream_t stream, float* void GetTopKSubset(SamplingData* data, cudaStream_t stream, float* scores_in, float* scores_out, int* indices_out, int vocab_size, int batch_size, int k, float temperature) { // Softmax scores std::span scores_softmaxed{data->scores_softmaxed.get(), static_cast(vocab_size * batch_size)}; - DispatchBlockwiseSoftmaxForward(&stream, scores_softmaxed.data(), const_cast(scores_in), vocab_size, vocab_size, vocab_size, batch_size, temperature); + DispatchBlockwiseSoftmaxForward(stream, scores_softmaxed.data(), const_cast(scores_in), vocab_size, vocab_size, vocab_size, batch_size, temperature); // Get top k subset #define GetTopK(max_k) \ LaunchGetTopKSubset(stream, \ diff --git a/src/cuda/cuda_sampling.cuh b/src/cuda/cuda_sampling.cuh index e6e0f184f..390ff92bc 100644 --- a/src/cuda/cuda_sampling.cuh +++ b/src/cuda/cuda_sampling.cuh @@ -25,7 +25,7 @@ void LaunchPopulateIndices(int* indices, int size, int batch_size, cudaStream_t void GetSample(SamplingData* data, cudaStream_t stream, int32_t* d_next_token, float* d_scores, int vocab_size, int batch_size, int k, float p, float temperature); template -void DispatchBlockwiseSoftmaxForward(cudaStream_t* stream, float* output, const float* input, int softmax_elements, int input_stride, int output_stride, int batch_count, float temperature = 1.0); +void DispatchBlockwiseSoftmaxForward(cudaStream_t stream, float* output, const float* input, int softmax_elements, int input_stride, int output_stride, int batch_count, float temperature = 1.0); } // namespace cuda } // namespace Generators \ No newline at end of file diff --git a/src/cuda/interface.cpp b/src/cuda/interface.cpp index 0b708c94d..4c85379b0 100644 --- a/src/cuda/interface.cpp +++ b/src/cuda/interface.cpp @@ -6,7 +6,7 @@ #include "interface.h" #include "../search.h" #include "search_cuda.h" -#include "../models/kernels.h" +#include "kernels.h" #include namespace Generators { @@ -15,10 +15,13 @@ GenaiInterface* gp_genai{}; Ort::Allocator* ort_allocator_{}; const char* label_cuda = "cuda"; +cuda_stream_holder g_stream; +cudaStream_t GetStream() { return g_stream.get(); } + struct GpuMemory final : DeviceBuffer { GpuMemory(size_t size) : owned_{true} { size_in_bytes_ = size; - ::cudaMalloc(&p_device_, size); + p_device_ = static_cast(ort_allocator_->Alloc(size)); } GpuMemory(void* p, size_t size) : owned_{false} { @@ -28,7 +31,7 @@ struct GpuMemory final : DeviceBuffer { ~GpuMemory() override { if (owned_) - ::cudaFree(p_device_); + ort_allocator_->Free(p_device_); if (p_cpu_) ::cudaFreeHost(p_cpu_); } @@ -65,19 +68,23 @@ struct GpuMemory final : DeviceBuffer { bool owned_; // If we own the memory, we delete it on destruction }; -struct CudaInterfaceImpl : CudaInterface { +struct CudaInterfaceImpl final : DeviceInterface { CudaInterfaceImpl() { - cuda_stream_.Create(); } ~CudaInterfaceImpl() { } - void InitAllocator(Ort::Allocator& allocator) override { + void InitOrt(const OrtApi& api, Ort::Allocator& allocator) override { + Ort::api = &api; assert(!ort_allocator_); ort_allocator_ = &allocator; } + 
Ort::Allocator& GetAllocator() override { + return *ort_allocator_; + } + std::shared_ptr AllocateBase(size_t size) override { return std::make_shared(size); } @@ -95,85 +102,78 @@ struct CudaInterfaceImpl : CudaInterface { } void Synchronize() override { - ::cudaStreamSynchronize(cuda_stream_.get()); - } - - cudaStream_t GetCudaStream() override { - return cuda_stream_.get(); - } - - void Int32ToInt64(const int32_t* input, int64_t* output, int count, cudaStream_t stream) override { - cuda::LaunchInt32ToInt64(input, output, count, stream); - } - - void Fp16ToFp32(const uint16_t* input, float* output, int count, cudaStream_t stream) override { - cuda::LaunchFp16ToFp32(input, output, count, stream); - } - - void Fp32ToFp16(const float* input, uint16_t* output, int count, cudaStream_t stream) override { - cuda::LaunchFp32ToFp16(input, output, count, stream); + ::cudaStreamSynchronize(GetStream()); } - void LaunchExpandAndInt32ToInt64(const int32_t* src, int64_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) override { - cuda::LaunchExpandAndInt32ToInt64(src, dst, num_beams, batch_size, sequence_length, stream); + void* GetCudaStream() override { + return GetStream(); } - void LaunchExpand(const int32_t* src, int32_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) override { - cuda::LaunchExpand(src, dst, num_beams, batch_size, sequence_length, stream); - } + bool Cast(OrtValue& input, OrtValue& output) override { + auto input_info = input.GetTensorTypeAndShapeInfo(); + auto output_info = output.GetTensorTypeAndShapeInfo(); - void Launch_UpdatePositionIds(int32_t* position_ids, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream) override { - cuda::Launch_UpdatePositionIds(position_ids, batch_beam_size, total_length, new_kv_length, stream); - } + auto input_type = input_info->GetElementType(); + auto output_type = output_info->GetElementType(); - void Launch_UpdatePositionIds(int64_t* position_ids, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream) override { - cuda::Launch_UpdatePositionIds(position_ids, batch_beam_size, total_length, new_kv_length, stream); - } + auto input_data = input.GetTensorRawData(); + auto output_data = output.GetTensorMutableRawData(); - void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) override { - cuda::Launch_UpdateAttentionMask(mask_data, old_data, batch_beam_size, new_kv_length, total_length, max_length, update_only, stream); - } + auto element_count = input_info->GetElementCount(); + if (element_count != output_info->GetElementCount()) + throw std::runtime_error("Cast - input and output element counts do not match"); + if (input_type == output_type) + throw std::runtime_error("Cast - input and output types are the same"); - void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) override { - cuda::Launch_UpdateAttentionMask(mask_data, old_data, batch_beam_size, new_kv_length, total_length, max_length, update_only, stream); + if (input_type == Ort::TypeToTensorType && output_type == Ort::TypeToTensorType) { + cuda::LaunchFp32ToFp16(reinterpret_cast(input_data), reinterpret_cast(output_data), static_cast(element_count), GetStream()); + } else if (input_type == 
Ort::TypeToTensorType && output_type == Ort::TypeToTensorType) { + cuda::LaunchFp16ToFp32(reinterpret_cast(input_data), reinterpret_cast(output_data), static_cast(element_count), GetStream()); + } else if (input_type == Ort::TypeToTensorType && output_type == Ort::TypeToTensorType) { + cuda::LaunchInt32ToInt64(reinterpret_cast(input_data), reinterpret_cast(output_data), static_cast(element_count), GetStream()); + } else + return false; + return true; } - void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) override { - cuda::LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream); + void UpdatePositionIds(void* position_ids, int batch_beam_size, int total_length, int new_kv_length, ONNXTensorElementDataType type) override { + if (type == Ort::TypeToTensorType) + cuda::Launch_UpdatePositionIds(static_cast(position_ids), batch_beam_size, total_length, new_kv_length, GetStream()); + else + cuda::Launch_UpdatePositionIds(static_cast(position_ids), batch_beam_size, total_length, new_kv_length, GetStream()); } - void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) override { - cuda::UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream); + void UpdateAttentionMask(void* mask_data, const void* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, ONNXTensorElementDataType type) override { + if (type == Ort::TypeToTensorType) + cuda::Launch_UpdateAttentionMask(static_cast(mask_data), static_cast(old_data), batch_beam_size, new_kv_length, total_length, max_length, update_only, GetStream()); + else + cuda::Launch_UpdateAttentionMask(static_cast(mask_data), static_cast(old_data), batch_beam_size, new_kv_length, total_length, max_length, update_only, GetStream()); } - void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) override { - cuda::ReorderPastStatesKernelLauncher(out_buffer, in_buffer, batch_size, num_heads, max_length, head_size, chunk_size, stream); + void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count) override { + cuda::LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, GetStream()); } - void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream, float* cross_qk_buffer_data, float** qk_layer_pointers, int token_index, int batch_beam_size, int num_layers, int num_heads, int num_alignment_heads, const int* alignment_heads, int frames, int max_length) override { - cuda::LaunchCopyCrossQKSingleDecodeStep(stream, cross_qk_buffer_data, qk_layer_pointers, token_index, batch_beam_size, num_layers, num_heads, num_alignment_heads, alignment_heads, frames, max_length); + void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length) override { + 
cuda::UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, GetStream()); } - void LaunchFinalizeCrossQK(cudaStream_t stream, int iteration_number, int context_decoding_len, int batch_size, int num_beams, int max_length, int num_alignment_heads, int frames_of_k, const float* cross_qk_buffer_data, float* cross_qk_output, int num_return_sequences, const int* cache_indir_data) override { - cuda::LaunchFinalizeCrossQK(stream, iteration_number, context_decoding_len, batch_size, num_beams, max_length, num_alignment_heads, frames_of_k, cross_qk_buffer_data, cross_qk_output, num_return_sequences, cache_indir_data); + void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size) override { + cuda::ReorderPastStatesKernelLauncher(out_buffer, in_buffer, batch_size, num_heads, max_length, head_size, chunk_size, GetStream()); } - cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) override { - return ::cudaMemcpyAsync(dst, src, count, kind, stream); + void LaunchCopyCrossQKSingleDecodeStep(float* cross_qk_buffer_data, float** qk_layer_pointers, int token_index, int batch_beam_size, int num_layers, int num_heads, int num_alignment_heads, const int* alignment_heads, int frames, int max_length) override { + cuda::LaunchCopyCrossQKSingleDecodeStep(GetStream(), cross_qk_buffer_data, qk_layer_pointers, token_index, batch_beam_size, num_layers, num_heads, num_alignment_heads, alignment_heads, frames, max_length); } - cudaError_t cudaMemsetAsync(void* ptr, int value, size_t count, cudaStream_t stream) override { - return ::cudaMemsetAsync(ptr, value, count, stream); + void LaunchFinalizeCrossQK(int iteration_number, int context_decoding_len, int batch_size, int num_beams, int max_length, int num_alignment_heads, int frames_of_k, const float* cross_qk_buffer_data, float* cross_qk_output, int num_return_sequences, const int* cache_indir_data) override { + cuda::LaunchFinalizeCrossQK(GetStream(), iteration_number, context_decoding_len, batch_size, num_beams, max_length, num_alignment_heads, frames_of_k, cross_qk_buffer_data, cross_qk_output, num_return_sequences, cache_indir_data); } - - private: - cuda_stream_holder cuda_stream_; }; -std::unique_ptr g_cuda_device; +std::unique_ptr g_cuda_device; DeviceInterface& GetCudaDeviceInterface() { return *g_cuda_device; } -cudaStream_t GetStream() { return g_cuda_device->GetCudaStream(); } LogItems& GetLogItems() { return gp_genai->GetLogItems(); } std::ostream& operator<<(std::ostream& stream, SGR sgr_code) { return gp_genai->operator_leftshift(stream, sgr_code); } @@ -212,7 +212,7 @@ void operator delete(void* p, size_t /*size*/) noexcept { Generators::gp_genai-> #endif extern "C" { -Generators::CudaInterface* GetInterface(GenaiInterface* p_genai) { +Generators::DeviceInterface* GetInterface(GenaiInterface* p_genai) { Generators::gp_genai = p_genai; Generators::g_cuda_device = std::make_unique(); return Generators::g_cuda_device.get(); diff --git a/src/cuda/interface.h b/src/cuda/interface.h index 24e6fb94f..2236e7d31 100644 --- a/src/cuda/interface.h +++ b/src/cuda/interface.h @@ -22,29 +22,5 @@ struct GenaiInterface { namespace Generators { LogItems& GetLogItems(); - -#if USE_CUDA DeviceInterface& GetCudaDeviceInterface(); - -struct CudaInterface : DeviceInterface { - virtual void Int32ToInt64(const int32_t* input, 
int64_t* output, int count, cudaStream_t stream) = 0; - virtual void Fp16ToFp32(const uint16_t* input, float* output, int count, cudaStream_t stream) = 0; - virtual void Fp32ToFp16(const float* input, uint16_t* output, int count, cudaStream_t stream) = 0; - // TODO: This can be collapsed into a single function with a template parameter - virtual void LaunchExpandAndInt32ToInt64(const int32_t* src, int64_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) = 0; - virtual void LaunchExpand(const int32_t* src, int32_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) = 0; - virtual void Launch_UpdatePositionIds(int32_t* position_ids, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream) = 0; - virtual void Launch_UpdatePositionIds(int64_t* position_ids, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream) = 0; - virtual void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) = 0; - virtual void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) = 0; - virtual void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) = 0; - virtual void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) = 0; - virtual void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) = 0; - virtual void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream, float* cross_qk_buffer_data, float** qk_layer_pointers, int token_index, int batch_beam_size, int num_layers, int num_heads, int num_alignment_heads, const int* alignment_heads, int frames, int max_length) = 0; - virtual void LaunchFinalizeCrossQK(cudaStream_t stream, int iteration_number, int context_decoding_len, int batch_size, int num_beams, int max_length, int num_alignment_heads, int frames_of_k, const float* cross_qk_buffer_data, float* cross_qk_output, int num_return_sequences, const int* cache_indir_data) = 0; - - virtual cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) = 0; - virtual cudaError_t cudaMemsetAsync(void* ptr, int value, size_t count, cudaStream_t stream) = 0; -}; -#endif } // namespace Generators diff --git a/src/models/kernels.h b/src/cuda/kernels.h similarity index 92% rename from src/models/kernels.h rename to src/cuda/kernels.h index fece7dadf..99be3a416 100644 --- a/src/models/kernels.h +++ b/src/cuda/kernels.h @@ -15,8 +15,6 @@ void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_si void LaunchFp16ToFp32(const uint16_t* fp16, float* fp32, int count, cudaStream_t stream); void LaunchFp32ToFp16(const float* fp32, uint16_t* fp16, int count, cudaStream_t stream); void LaunchInt32ToInt64(const int32_t* src, int64_t* dst, int count, cudaStream_t stream); -void LaunchExpandAndInt32ToInt64(const int32_t* src, int64_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t 
stream); -void LaunchExpand(const int32_t* src, int32_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream); template void BufferExpansionKernelLauncher(const T* input, T* output, int batch_size, int beam_width, int chunk_size, cudaStream_t stream); diff --git a/src/cuda/model_kernels.cu b/src/cuda/model_kernels.cu index 0eb316383..59b1f5431 100644 --- a/src/cuda/model_kernels.cu +++ b/src/cuda/model_kernels.cu @@ -137,36 +137,6 @@ void LaunchInt32ToInt64(const int32_t* src, int64_t* dst, int count, cudaStream_ ConvertInt32ToInt64<<>>(src, dst, count); } -__global__ void ExpandAndConvertInt32ToInt64(const int32_t* src, int64_t* dst, int num_beams, int batch_size, int sequence_length) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < num_beams * batch_size * sequence_length) { - int batch_id = idx / (num_beams * sequence_length); - int seq_id = idx % sequence_length; - dst[idx] = (int64_t)src[batch_id * sequence_length + seq_id]; - } -} - -void LaunchExpandAndInt32ToInt64(const int32_t* src, int64_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) { - int block_size = 256; - int num_blocks = (num_beams * batch_size * sequence_length + block_size - 1) / block_size; - ExpandAndConvertInt32ToInt64<<>>(src, dst, num_beams, batch_size, sequence_length); -} - -__global__ void Expand(const int32_t* src, int32_t* dst, int num_beams, int batch_size, int sequence_length) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < num_beams * batch_size * sequence_length) { - int batch_id = idx / (num_beams * sequence_length); - int seq_id = idx % sequence_length; - dst[idx] = src[batch_id * sequence_length + seq_id]; - } -} - -void LaunchExpand(const int32_t* src, int32_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) { - int block_size = 256; - int num_blocks = (num_beams * batch_size * sequence_length + block_size - 1) / block_size; - Expand<<>>(src, dst, num_beams, batch_size, sequence_length); -} - namespace { struct ReorderPastStateParams { diff --git a/src/cuda/search_cuda.cpp b/src/cuda/search_cuda.cpp index 8d7bf6fe0..a53ff37dd 100644 --- a/src/cuda/search_cuda.cpp +++ b/src/cuda/search_cuda.cpp @@ -22,7 +22,7 @@ Search_Cuda::Search_Cuda(const GeneratorParams& params) sequence_lengths_ = params.p_device->Allocate(batch_beam_size); eos_meet_buffer_ = CudaMallocArray(batch_beam_size, &eos_meet_); - cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_->cuda_stream); + cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), GetStream()); done_cpu_ = CudaMallocHostArray(1); *done_cpu_ = false; @@ -39,7 +39,7 @@ GreedySearch_Cuda::GreedySearch_Cuda(const GeneratorParams& params) random_seed = params_->search.random_seed; else random_seed = std::random_device{}(); - samplingdata_ = std::make_unique(random_seed, params_->search.batch_size, params_->config.model.vocab_size, params_->cuda_stream); + samplingdata_ = std::make_unique(random_seed, params_->search.batch_size, params_->config.model.vocab_size, GetStream()); } BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params) @@ -58,7 +58,7 @@ BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params) topk_buffer_ = CudaMallocArray(topk_buffer_size); static_assert(sizeof(float) == sizeof(int32_t)); // The topk_buffer assumes these match, fix for float16 - cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), params_->cuda_stream); + cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * 
sizeof(float), GetStream()); } BeamSearch_Cuda::~BeamSearch_Cuda() = default; @@ -84,20 +84,20 @@ DeviceSpan BeamSearch_Cuda::GetNextIndices() { } void BeamSearch_Cuda::SelectTop() { - cuda::DispatchBlockwiseSoftmaxForward(const_cast(¶ms_->cuda_stream), softmax_buffer_.get(), next_token_scores_.Span().data(), params_->config.model.vocab_size, + cuda::DispatchBlockwiseSoftmaxForward(GetStream(), softmax_buffer_.get(), next_token_scores_.Span().data(), params_->config.model.vocab_size, params_->config.model.vocab_size, params_->config.model.vocab_size, params_->BatchBeamSize()); // Copy next_token_scores to CPU auto next_token_scores_cpu = CudaMallocHostArray(params_->BatchBeamSize() * params_->config.model.vocab_size); - cudaMemcpyAsync(next_token_scores_cpu.get(), softmax_buffer_.get(), params_->BatchBeamSize() * params_->config.model.vocab_size * sizeof(float), cudaMemcpyDeviceToHost, params_->cuda_stream); - CudaCheck() == cudaStreamSynchronize(params_->cuda_stream); + cudaMemcpyAsync(next_token_scores_cpu.get(), softmax_buffer_.get(), params_->BatchBeamSize() * params_->config.model.vocab_size * sizeof(float), cudaMemcpyDeviceToHost, GetStream()); + CudaCheck() == cudaStreamSynchronize(GetStream()); auto beam_scores = beam_scorer_->GetNextScores(); // Add beam score to next token scores. Corresponding python code is like: // next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) cuda::LaunchAddProbsKernel(softmax_buffer_.get(), beam_scores.Span().data(), - params_->search.batch_size, params_->search.num_beams, params_->config.model.vocab_size, params_->cuda_stream); + params_->search.batch_size, params_->search.num_beams, params_->config.model.vocab_size, GetStream()); if (params_->search.num_beams <= 32) { constexpr size_t max_parts_of_vocab = 128; @@ -120,11 +120,11 @@ void BeamSearch_Cuda::SelectTop() { topk_next_scores_.get(), topk_next_tokens_.get(), topk_next_indices_.get(), - params_->cuda_stream); + GetStream()); } else assert(false); - CudaCheck() == cudaStreamSynchronize(params_->cuda_stream); + CudaCheck() == cudaStreamSynchronize(GetStream()); size_t size = params_->BatchBeamSize() * 2; std::span next_scores{topk_next_scores_.get(), size}; @@ -146,13 +146,13 @@ void BeamSearch_Cuda::SelectTop() { void GreedySearch_Cuda::SampleTopKTopP(int k, float p, float temperature) { std::span scores = next_token_scores_.Span(); assert(scores.size() == params_->search.batch_size * params_->config.model.vocab_size); - cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->search.batch_size), + cuda::GetSample(samplingdata_.get(), GetStream(), next_tokens_.data(), scores.data(), int(scores.size() / params_->search.batch_size), params_->search.batch_size, k, p, temperature); // Check for EOS assert(next_tokens_.size() == eos_meet_.size()); // Don't replace EOS with pad for batch_size == 1 for continuous decoding mode - cuda::Launch_CheckForEOSAndPad(next_tokens_.data(), static_cast(next_tokens_.size()), eos_meet_.data(), params_->config.model.eos_token_id, params_->search.batch_size > 1 ? params_->config.model.pad_token_id : params_->config.model.eos_token_id, done_cpu_.get(), params_->cuda_stream); + cuda::Launch_CheckForEOSAndPad(next_tokens_.data(), static_cast(next_tokens_.size()), eos_meet_.data(), params_->config.model.eos_token_id, params_->search.batch_size > 1 ? 
params_->config.model.pad_token_id : params_->config.model.eos_token_id, done_cpu_.get(), GetStream()); // Append tokens cuda::Launch_AppendNextTokensToSequences(next_tokens_buffer_.Span(), sequences_.GetSequences().Span(), params_->BatchBeamSize(), sequences_.GetSequenceLength(), sequences_.max_length_, GetStream()); @@ -207,7 +207,7 @@ std::span Search_Cuda::GetScores() { // Set user input tokens (batch_beam_size, sequence_length) void GreedySearch_Cuda::AppendTokens(DeviceSpan& next_tokens) { - cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_->cuda_stream); + cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), GetStream()); *done_cpu_ = false; auto next_tokens_gpu = next_tokens.Span(); @@ -221,7 +221,7 @@ void GreedySearch_Cuda::AppendTokens(DeviceSpan& next_tokens) { return; } - cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_->cuda_stream); + cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), GetStream()); *done_cpu_ = false; } @@ -234,12 +234,12 @@ void BeamSearch_Cuda::AppendTokens(DeviceSpan& next_tokens) { } void GreedySearch_Cuda::RewindTo(size_t index) { - cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_->cuda_stream); + cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), GetStream()); *done_cpu_ = false; if (index > 0) cuda::Launch_GetLastTokens(next_tokens_.data(), sequences_.GetSequences().Span().data(), static_cast(params_->BatchBeamSize()), static_cast(index), sequences_.max_length_, GetStream()); else - cudaMemsetAsync(next_tokens_.data(), 0, params_->search.batch_size * sizeof(int32_t), params_->cuda_stream); + cudaMemsetAsync(next_tokens_.data(), 0, params_->search.batch_size * sizeof(int32_t), GetStream()); sequences_.RewindTo(index); } @@ -247,7 +247,7 @@ void Search_Cuda::ApplyMinLength(int min_length) { if (sequences_.GetSequenceLength() >= min_length) return; - cuda::LaunchSetScoreProcessor(GetScores().data(), params_->BatchBeamSize(), params_->config.model.vocab_size, params_->config.model.eos_token_id, std::numeric_limits::lowest(), params_->cuda_stream); + cuda::LaunchSetScoreProcessor(GetScores().data(), params_->BatchBeamSize(), params_->config.model.vocab_size, params_->config.model.eos_token_id, std::numeric_limits::lowest(), GetStream()); } void Search_Cuda::ApplyRepetitionPenalty(float penalty) { @@ -256,7 +256,7 @@ void Search_Cuda::ApplyRepetitionPenalty(float penalty) { cuda::LaunchRepetitionPenaltyProcessor(sequences_.GetSequences().Span().data(), GetScores().data(), params_->search.batch_size, params_->search.num_beams, params_->config.model.vocab_size, - params_->search.max_length, GetSequenceLength(), penalty, params_->cuda_stream); + params_->search.max_length, GetSequenceLength(), penalty, GetStream()); } } // namespace Generators \ No newline at end of file diff --git a/src/cuda/search_cuda.h b/src/cuda/search_cuda.h index 2e0ec4610..bb31b4423 100644 --- a/src/cuda/search_cuda.h +++ b/src/cuda/search_cuda.h @@ -1,4 +1,5 @@ #pragma once +#include #include "search_cuda.cuh" #include "cuda_sampling.cuh" @@ -12,7 +13,7 @@ struct Search_Cuda : Search { DeviceSpan GetSequenceLengths() override { return sequence_lengths_; } bool IsDone() const { - cudaStreamSynchronize(params_->cuda_stream); + cudaStreamSynchronize(GetStream()); return *done_cpu_; } // TODO: Use an event diff --git a/src/dml/interface.cpp b/src/dml/interface.cpp index 61a57d663..459a71054 100644 --- a/src/dml/interface.cpp +++ b/src/dml/interface.cpp @@ -116,7 +116,8 @@ struct DmlInterfaceImpl : 
DeviceInterface { Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&dml_api_))); } - void InitAllocator(Ort::Allocator& allocator) override { + void InitOrt(const OrtApi& api, Ort::Allocator& allocator) override { + Ort::api = &api; assert(!ort_allocator_); ort_allocator_ = &allocator; @@ -131,6 +132,10 @@ struct DmlInterfaceImpl : DeviceInterface { dml_readback_heap_ = std::make_unique(dml_objects_.d3d12_device.Get(), dml_execution_context_.get()); } + Ort::Allocator& GetAllocator() override { + return *ort_allocator_; + } + std::shared_ptr AllocateBase(size_t size) override { return std::make_shared(size); } @@ -147,6 +152,58 @@ struct DmlInterfaceImpl : DeviceInterface { return GetCpuInterface()->CreateBeam(params); } +#if 0 + void UpdatePositionIDs() { + ComPtr target_resource; + Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, position_ids_->GetTensorMutableRawData(), &target_resource)); + + dml_update_position_ids_kernel_ = DmlIncrementValuesKernel( + model_.GetD3D12Device(), + model_.GetDmlExecutionContext(), + static_cast(position_ids_shape_[0]), + type_, + target_resource.Get()); + + // Execute the cached command list + ComPtr fence; + uint64_t completion_value; + model_.GetDmlExecutionContext()->ExecuteCommandList(dml_update_position_ids_kernel_->GetCommandList(), &fence, &completion_value); + } + + void UpdateAttentionMask(int total_length) { + ComPtr attention_mask_resource; + Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, attention_mask_->GetTensorMutableRawData(), &attention_mask_resource)); + ComPtr attention_mask_next_resource; + Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, attention_mask_next_->GetTensorMutableRawData(), &attention_mask_next_resource)); + if (is_first_mask_update_) { + dml_update_mask_kernel_ = DmlUpdateMaskKernel( + model_.GetD3D12Device(), + model_.GetDmlExecutionContext(), + static_cast(attention_mask_shape_[0]), + static_cast(attention_mask_shape_[1]), + type_, + total_length, + attention_mask_resource.Get(), + attention_mask_next_resource.Get()); + is_second_mask_update_ = true; + } else if (is_second_mask_update_) { + dml_update_mask_kernel_ = DmlUpdateMaskKernel( + model_.GetD3D12Device(), + model_.GetDmlExecutionContext(), + static_cast(attention_mask_shape_[0]), + static_cast(attention_mask_shape_[1]), + type_, + 1, + attention_mask_resource.Get(), + attention_mask_next_resource.Get()); + is_second_mask_update_ = false; + } + ComPtr fence; + uint64_t completion_value; + model_.GetDmlExecutionContext()->ExecuteCommandList(dml_update_mask_kernel_->GetCommandList(), &fence, &completion_value); + } +#endif + void Synchronize() override { } }; @@ -165,7 +222,6 @@ void SetDmlProvider(OrtSessionOptions& session_options) { } DeviceInterface* GetDmlInterface() { - assert(g_dml_device); return g_dml_device.get(); } diff --git a/src/generators.cpp b/src/generators.cpp index ad34d1979..c47f83151 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -9,9 +9,6 @@ #include "cpu/interface.h" #include "cuda/interface.h" #include "dml/interface.h" -#if USE_CUDA -#include "models/kernels.h" -#endif #if _WIN32 EXTERN_C IMAGE_DOS_HEADER __ImageBase; @@ -39,11 +36,6 @@ void ThrowErrorIfSessionTerminated(bool is_session_terminated) { namespace Generators { -#if USE_CUDA -// TODO: Remove once we remove all dependencies -void OnCudaError(cudaError_t error) { assert(false); 
} -#endif - static bool _ = (Ort::InitApi(), false); OrtGlobals::OrtGlobals() @@ -93,8 +85,12 @@ OrtEnv& GetOrtEnv() { // Fallback to copy between two separate device buffers by going through CPU memory (slow unless we're the CPU device) void CopyThroughCpu(DeviceBuffer& dest, size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) { source.CopyDeviceToCpu(); - auto source_span = std::span(source.p_cpu_+begin_source, size_in_bytes); - dest.AllocateCpu(); + auto source_span = std::span(source.p_cpu_ + begin_source, size_in_bytes); + // If we're overwriting the entire destination + if (dest.size_in_bytes_ == size_in_bytes) + dest.AllocateCpu(); + else + dest.CopyDeviceToCpu(); // Overwriting part of destination, so copy over initial contents first std::copy(source_span.begin(), source_span.end(), dest.p_cpu_ + begin_dest); dest.CopyCpuToDevice(); } @@ -120,8 +116,7 @@ struct GenaiInterfaceImpl : GenaiInterface { void Sequences_RewindTo(Sequences* p_this, size_t new_length) override { return p_this->RewindTo(new_length); } } g_genai; -#if USE_CUDA -CudaInterface* GetCudaInterface() { +DeviceInterface* GetCudaInterface() { // Load the shared library onnxruntime-genai-cuda.dll // This is a workaround to avoid linking the CUDA library to the generator library // The CUDA library is only needed for the CUDA allocator @@ -137,8 +132,8 @@ CudaInterface* GetCudaInterface() { throw std::runtime_error("Cuda interface not available."); } - Generators::CudaInterface* GetInterface(GenaiInterface * p_genai); - static CudaInterface* cuda_interface{[] { + Generators::DeviceInterface* GetInterface(GenaiInterface * p_genai); + static DeviceInterface* cuda_interface{[] { #ifdef _WIN32 auto get_cuda_fn = reinterpret_cast(GetProcAddress(reinterpret_cast(cuda_library.get()), "GetInterface")); #else @@ -150,32 +145,6 @@ CudaInterface* GetCudaInterface() { return cuda_interface; } - -namespace cuda { -void LaunchInt32ToInt64(const int32_t* input, int64_t* output, int count, cudaStream_t stream) { GetCudaInterface()->Int32ToInt64(input, output, count, stream); } -void LaunchFp16ToFp32(const uint16_t* input, float* output, int count, cudaStream_t stream) { GetCudaInterface()->Fp16ToFp32(input, output, count, stream); } -void LaunchFp32ToFp16(const float* input, uint16_t* output, int count, cudaStream_t stream) { GetCudaInterface()->Fp32ToFp16(input, output, count, stream); } -void LaunchExpandAndInt32ToInt64(const int32_t* src, int64_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) { GetCudaInterface()->LaunchExpandAndInt32ToInt64(src, dst, num_beams, batch_size, sequence_length, stream); } -void LaunchExpand(const int32_t* src, int32_t* dst, int num_beams, int batch_size, int sequence_length, cudaStream_t stream) { GetCudaInterface()->LaunchExpand(src, dst, num_beams, batch_size, sequence_length, stream); } -template <> -void Launch_UpdatePositionIds(int32_t* position_ids, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream) { GetCudaInterface()->Launch_UpdatePositionIds(position_ids, batch_beam_size, total_length, new_kv_length, stream); } -template <> -void Launch_UpdatePositionIds(int64_t* position_ids, int batch_beam_size, int total_length, int new_kv_length, cudaStream_t stream) { GetCudaInterface()->Launch_UpdatePositionIds(position_ids, batch_beam_size, total_length, new_kv_length, stream); } -template <> -void Launch_UpdateAttentionMask(int32_t* mask_data, const int32_t* old_data, int batch_beam_size, int new_kv_length, int 
total_length, int max_length, bool update_only, cudaStream_t stream) { GetCudaInterface()->Launch_UpdateAttentionMask(mask_data, old_data, batch_beam_size, new_kv_length, total_length, max_length, update_only, stream); } -template <> -void Launch_UpdateAttentionMask(int64_t* mask_data, const int64_t* old_data, int batch_beam_size, int new_kv_length, int total_length, int max_length, bool update_only, cudaStream_t stream) { GetCudaInterface()->Launch_UpdateAttentionMask(mask_data, old_data, batch_beam_size, new_kv_length, total_length, max_length, update_only, stream); } -void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size, const int32_t* eos_token_ids, int eos_token_ids_count, cudaStream_t stream) { GetCudaInterface()->LaunchHandleEOSArray(batch_logits, batch_beam_size, vocab_size, eos_token_ids, eos_token_ids_count, stream); } -void UpdateCacheIndirectionKernelLauncher(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, int beam_width, int input_seq_length, int max_seq_length, int current_length, cudaStream_t stream) { GetCudaInterface()->UpdateCacheIndirectionKernelLauncher(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, beam_width, input_seq_length, max_seq_length, current_length, stream); } -void ReorderPastStatesKernelLauncher(void* out_buffer, const void* in_buffer, int batch_size, int num_heads, int max_length, int head_size, int chunk_size, cudaStream_t stream) { GetCudaInterface()->ReorderPastStatesKernelLauncher(out_buffer, in_buffer, batch_size, num_heads, max_length, head_size, chunk_size, stream); } -template <> -void LaunchCopyCrossQKSingleDecodeStep(cudaStream_t stream, float* cross_qk_buffer_data, float** qk_layer_pointers, int token_index, int batch_beam_size, int num_layers, int num_heads, int num_alignment_heads, const int* alignment_heads, int frames, int max_length) { GetCudaInterface()->LaunchCopyCrossQKSingleDecodeStep(stream, cross_qk_buffer_data, qk_layer_pointers, token_index, batch_beam_size, num_layers, num_heads, num_alignment_heads, alignment_heads, frames, max_length); } -template <> -void LaunchFinalizeCrossQK(cudaStream_t stream, int iteration_number, int context_decoding_len, int batch_size, int num_beams, int max_length, int num_alignment_heads, int frames_of_k, const float* cross_qk_buffer_data, float* cross_qk_output, int num_return_sequences, const int* cache_indir_data) { GetCudaInterface()->LaunchFinalizeCrossQK(stream, iteration_number, context_decoding_len, batch_size, num_beams, max_length, num_alignment_heads, frames_of_k, cross_qk_buffer_data, cross_qk_output, num_return_sequences, cache_indir_data); } -} // namespace cuda -#endif - - std::string to_string(DeviceType device_type) { switch (device_type) { case DeviceType::CPU: @@ -195,10 +164,8 @@ DeviceInterface* GetDeviceInterface(DeviceType type) { default: case DeviceType::CPU: return GetCpuInterface(); -#if USE_CUDA case DeviceType::CUDA: return GetCudaInterface(); -#endif #if USE_DML case DeviceType::DML: return GetDmlInterface(); @@ -215,7 +182,6 @@ GeneratorParams::GeneratorParams(const Model& model) : config{*model.config_.get()}, p_device{model.p_device_}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_}, is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { use_cuda_graph = is_cuda_graph_enabled_; if (use_cuda_graph) { @@ -444,8 +410,3 @@ DeviceSpan Generator::GetSequence(size_t index) const { } } // namespace Generators - -#if USE_CUDA 
-cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) { return Generators::GetCudaInterface()->cudaMemcpyAsync(dst, src, count, kind, stream); } -cudaError_t cudaMemsetAsync(void* ptr, int value, size_t count, cudaStream_t stream) { return Generators::GetCudaInterface()->cudaMemsetAsync(ptr, value, count, stream); } -#endif diff --git a/src/generators.h b/src/generators.h index 7275aa976..0449e198e 100644 --- a/src/generators.h +++ b/src/generators.h @@ -23,16 +23,10 @@ #include #include #include -#if USE_CUDA -#include -#else -// If we don't include cuda_runtime.h, we define this to avoid lots of extra #ifdefs -using cudaStream_t = void*; -#endif #include "leakcheck.h" -#include "smartptrs.h" #include "models/onnxruntime_api.h" +#include "smartptrs.h" #include "models/debugging.h" #include "config.h" #include "logging.h" @@ -94,7 +88,6 @@ struct GeneratorParams : std::enable_shared_from_this, LeakChec DeviceInterface* p_device{}; DeviceType device_type{DeviceType::CPU}; - cudaStream_t cuda_stream{}; cpu_span aux_input_ids{}; // Intermediate solution to be used with SetInputs function for multimodal and whisper models diff --git a/src/models/audio_processor.cpp b/src/models/audio_processor.cpp index 3b0ac53ed..e81e357d1 100644 --- a/src/models/audio_processor.cpp +++ b/src/models/audio_processor.cpp @@ -30,7 +30,7 @@ std::unique_ptr ProcessMel(ort_extensions::OrtxObjectPtr& allocator.GetInfo(), std::span(const_cast(mel_data), input_features_value->GetTensorTypeAndShapeInfo()->GetElementCount()), shape_span); - ConvertFp32ToFp16(allocator, *input_features_fp32, input_features_value, DeviceType::CPU, nullptr); + Cast(*input_features_fp32, input_features_value, *GetDeviceInterface(DeviceType::CPU), Ort::TypeToTensorType); } return input_features_value; diff --git a/src/models/captured_graph_pool.h b/src/models/captured_graph_pool.h index 42e3be51d..4d405018e 100644 --- a/src/models/captured_graph_pool.h +++ b/src/models/captured_graph_pool.h @@ -145,11 +145,6 @@ struct CapturedGraphInfo { std::unique_ptr sb_embeddings_; std::unique_ptr key_; -#if USE_DML - std::unique_ptr sb_attention_mask_next_; - std::unique_ptr sb_input_ids_int32_; -#endif - // Generates a unique annotation ID across different captured graph objects. This is necessary because different // generators could be alive at the same time and run the same batch size but with different static buffers, so // they need to have different annotation IDs. 
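The kv_cache.cpp and input_ids.cpp hunks further down replace their #if USE_CUDA / cudaMemcpyAsync branches with the DeviceSpan additions from smartptrs.h (WrapTensor, subspan, CopyFrom, Zero), so the copy path no longer needs to know which device owns the memory and only falls back to CopyThroughCpu when the two buffers live on different devices. As a rough illustration of the pattern only, here is a small standalone C++ sketch; CpuBuffer and Span are simplified stand-ins invented for this note, not the repository's DeviceBuffer/DeviceSpan:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <memory>
    #include <vector>

    // Simplified stand-in for DeviceBuffer: owns raw bytes on one "device".
    struct CpuBuffer {
      explicit CpuBuffer(size_t bytes) : data_(bytes) {}
      uint8_t* data() { return data_.data(); }
      // Device-agnostic copy; a GPU buffer would stage through host memory
      // here (the CopyThroughCpu fallback), the CPU version is a memcpy.
      void CopyFrom(size_t dst_off, CpuBuffer& src, size_t src_off, size_t bytes) {
        std::memcpy(data_.data() + dst_off, src.data_.data() + src_off, bytes);
      }
      void Zero() { std::memset(data_.data(), 0, data_.size()); }
     private:
      std::vector<uint8_t> data_;
    };

    // Simplified stand-in for DeviceSpan<T>: a typed window into a buffer.
    template <typename T>
    struct Span {
      Span(std::shared_ptr<CpuBuffer> buf, size_t begin, size_t length)
          : buf_(std::move(buf)), begin_(begin), length_(length) {}
      size_t size() const { return length_; }
      Span subspan(size_t begin, size_t length) { return {buf_, begin_ + begin, length}; }
      // Zeroes the underlying buffer, mirroring DeviceSpan::Zero in the patch.
      void Zero() { buf_->Zero(); }
      void CopyFrom(Span source) {
        assert(source.size() == size());  // spans must match, as in DeviceSpan::CopyFrom
        buf_->CopyFrom(begin_ * sizeof(T), *source.buf_, source.begin_ * sizeof(T), length_ * sizeof(T));
      }
      T* data() { return reinterpret_cast<T*>(buf_->data()) + begin_; }
     private:
      std::shared_ptr<CpuBuffer> buf_;
      size_t begin_, length_;
    };

    int main() {
      auto present = std::make_shared<CpuBuffer>(8 * sizeof(int32_t));
      auto past = std::make_shared<CpuBuffer>(8 * sizeof(int32_t));
      Span<int32_t> present_span{present, 0, 8};
      Span<int32_t> past_span{past, 0, 8};
      for (int i = 0; i < 8; ++i) present_span.data()[i] = i;
      past_span.Zero();
      // Same shape as the PickPastState/RewindPastTensorsTo hunks below:
      // copy one beam's block from "present" into "past" with no device-specific code.
      past_span.subspan(0, 4).CopyFrom(present_span.subspan(4, 4));
      for (int i = 0; i < 4; ++i) std::cout << past_span.data()[i] << ' ';
      std::cout << '\n';  // prints: 4 5 6 7
    }

The real DeviceSpan keeps its buffer alive via shared_from_this and routes CopyFrom through the owning DeviceBuffer, which is what lets PickPastState and RewindPastTensorsTo shrink to a single loop in the hunks that follow.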
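Similarly, the CpuInterface::Cast and CudaInterfaceImpl::Cast overrides added earlier in this patch fold the old per-provider fp16/fp32/int64 conversion helpers into one DeviceInterface entry point, so callers such as input_ids.cpp just ask the device to cast. A hedged sketch of that dispatch shape only: plain pointers and an invented ElementType enum stand in for OrtValue/ONNXTensorElementDataType, and the float-to-double branch stands in for the fp16 pair (which the patch handles via FastFloat32ToFloat16/FastFloat16ToFloat32 on CPU and LaunchFp32ToFp16/LaunchFp16ToFp32 on CUDA):

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <vector>

    // Simplified element-type tag standing in for ONNXTensorElementDataType.
    enum class ElementType { f32, f64, i32, i64 };

    // Illustrative device-agnostic cast: each DeviceInterface override would
    // dispatch to its own kernels (the CUDA one launches conversion kernels on
    // its stream); this CPU-style sketch just loops over the elements.
    bool Cast(const void* in, ElementType in_type, void* out, ElementType out_type, size_t count) {
      if (in_type == out_type)
        throw std::runtime_error("Cast - input and output types are the same");
      if (in_type == ElementType::i32 && out_type == ElementType::i64) {
        auto* src = static_cast<const int32_t*>(in);
        auto* dst = static_cast<int64_t*>(out);
        for (size_t i = 0; i < count; ++i) dst[i] = src[i];
      } else if (in_type == ElementType::f32 && out_type == ElementType::f64) {
        auto* src = static_cast<const float*>(in);
        auto* dst = static_cast<double*>(out);
        for (size_t i = 0; i < count; ++i) dst[i] = src[i];
      } else {
        return false;  // unhandled combination, as in the CUDA override
      }
      return true;
    }

    int main() {
      std::vector<int32_t> tokens{1, 2, 3};
      std::vector<int64_t> widened(tokens.size());
      // Widen int32 token ids to the int64 layout the model input expects.
      Cast(tokens.data(), ElementType::i32, widened.data(), ElementType::i64, tokens.size());
      std::cout << widened[0] + widened[1] + widened[2] << '\n';  // prints: 6
    }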
diff --git a/src/models/debugging.cpp b/src/models/debugging.cpp index 67e0df3b1..1d127be6c 100644 --- a/src/models/debugging.cpp +++ b/src/models/debugging.cpp @@ -89,41 +89,9 @@ void DumpTensor(const Model& model, std::ostream& stream, OrtValue* value, bool auto device_span = model.p_device_->WrapMemory(tensor_span); DumpValues(stream, type, device_span.CopyDeviceToCpu().data(), element_count); break; -#if 0 -#if USE_CUDA - auto type = type_info->GetElementType(); - size_t element_size = SizeOf(type); - auto cpu_copy = std::make_unique(element_size * element_count); - CudaCheck() == cudaMemcpy(cpu_copy.get(), value->GetTensorRawData(), element_size * element_count, cudaMemcpyDeviceToHost); - DumpValues(stream, type, cpu_copy.get(), element_count); -#elif USE_DML - auto type = type_info->GetElementType(); - size_t element_size = SizeOf(type); - auto cpu_copy = std::make_unique(element_size * element_count); - - if (value->GetTensorMutableRawData()) { - ComPtr gpu_resource; - Ort::ThrowOnError(model.GetOrtDmlApi()->GetD3D12ResourceFromAllocation( - model.allocator_device_, - value->GetTensorMutableRawData(), - &gpu_resource)); - - model.GetDmlReadbackHeap()->ReadbackFromGpu( - std::span(cpu_copy.get(), element_size * element_count), - gpu_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - } - - DumpValues(stream, type, cpu_copy.get(), element_count); -#else - stream << "Unexpected, using GPU memory but not compiled with CUDA or DML?"; -#endif -#endif - break; } default: - stream << "Unhandled device type"; + stream << "Unhandled device type: " << static_cast(memory_info.GetDeviceType()) << "\r\n"; break; } } diff --git a/src/models/extra_inputs.cpp b/src/models/extra_inputs.cpp index a4bab8ce7..b08a8c8d2 100644 --- a/src/models/extra_inputs.cpp +++ b/src/models/extra_inputs.cpp @@ -1,7 +1,6 @@ #include "../generators.h" #include "model.h" #include "extra_inputs.h" -#include "kernels.h" namespace Generators { diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 47233f3d8..d0be2c67c 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -1,7 +1,6 @@ #include "../generators.h" #include "model.h" #include "input_ids.h" -#include "kernels.h" namespace Generators { @@ -49,7 +48,7 @@ void InputIDs::Add() { } } -void InputIDs::Update(DeviceSpan& new_tokens) { +void InputIDs::Update(DeviceSpan new_tokens) { auto new_tokens_cpu = new_tokens.CopyDeviceToCpu(); const auto get_unpadded_sequence_length = [](std::span input_ids, int32_t pad_token_id) { @@ -77,69 +76,32 @@ void InputIDs::Update(DeviceSpan& new_tokens) { if (static_cast(shape_[1]) != sequence_length) { shape_[1] = sequence_length; if (!sb_input_ids_) { - value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); + value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_); } else { - value_ = sb_input_ids_->CreateTensorOnStaticBuffer(shape_, type_); + value_ = sb_input_ids_->CreateTensorOnStaticBuffer(shape_, Ort::TypeToTensorType); } state_.inputs_[input_index_] = value_.get(); } - // Update input_ids with next tokens, converting from 32-bit to 64-bit - if (type_ == Ort::TypeToTensorType) { - switch (model_.device_type_) { - case DeviceType::CUDA: { -#if USE_CUDA - auto* data = value_->GetTensorMutableData(); - auto next_tokens = new_tokens.Span(); - // For beam search - if (is_prompt_ && state_.params_->search.num_beams > 1) - cuda::LaunchExpandAndInt32ToInt64(next_tokens.data(), data, state_.params_->search.num_beams, state_.params_->search.batch_size, 
static_cast(sequence_length), model_.cuda_stream_); - else - cuda::LaunchInt32ToInt64(next_tokens.data(), data, static_cast(next_tokens.size()), model_.cuda_stream_); -#endif - } break; - - default: { - // CPU, DML, WEBGPU - auto* data = value_->GetTensorMutableData(); - auto next_tokens = new_tokens.Span(); - for (int b = 0; b < shape_[0]; b++) { - for (int i = 0; i < shape_[1]; i++) { - // For beam search - int32_t next_token; - if (is_prompt_ && state_.params_->search.num_beams > 1) - next_token = next_tokens[(b / state_.params_->search.num_beams) * shape_[1] + i]; - else - next_token = next_tokens[b * shape_[1] + i]; - data[b * shape_[1] + i] = next_token; - } - } - } + // Update input_ids with next tokens + auto data_span = WrapTensor(*model_.p_device_, *value_); + + // For beam search + if (is_prompt_ && state_.params_->search.num_beams > 1) { + int row_size = static_cast(shape_[1]); + for (int b = 0; b < shape_[0]; b++) { + int in_offset = (b / state_.params_->search.num_beams) * row_size; + int out_offset = b * row_size; + data_span.subspan(out_offset, row_size).CopyFrom(new_tokens.subspan(in_offset, row_size)); } } else { - auto* data = value_->GetTensorMutableData(); -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - if (is_prompt_ && state_.params_->search.num_beams > 1) { - cuda::LaunchExpand(new_tokens.Span().data(), data, state_.params_->search.num_beams, state_.params_->search.batch_size, static_cast(sequence_length), model_.cuda_stream_); - } else { - cudaMemcpyAsync(data, new_tokens.Span().data(), shape_[0] * shape_[1] * sizeof(int32_t), cudaMemcpyDeviceToDevice, model_.cuda_stream_); - } - } else -#endif - { - // For beam search - if (is_prompt_ && state_.params_->search.num_beams > 1) { - for (int b = 0; b < shape_[0]; b++) { - int in_offset = (b / state_.params_->search.num_beams) * static_cast(shape_[1]); - int out_offset = b * static_cast(shape_[1]); - memcpy(data + out_offset, new_tokens.Span().data() + in_offset, shape_[1] * sizeof(int32_t)); - } - } else { - memcpy(data, new_tokens.Span().data(), shape_[0] * shape_[1] * sizeof(int32_t)); - } - } + data_span.CopyFrom(new_tokens); + } + + if (type_ == Ort::TypeToTensorType) { + Cast(*value_, cast_value_, *model_.p_device_, type_); + state_.inputs_[input_index_] = cast_value_.get(); } is_prompt_ = false; diff --git a/src/models/input_ids.h b/src/models/input_ids.h index 02af3a98a..6c3a5fc76 100644 --- a/src/models/input_ids.h +++ b/src/models/input_ids.h @@ -14,7 +14,7 @@ struct InputIDs { void Add(); // Resize input_ids based on size of next_tokens. // Update value with next_tokens. - void Update(DeviceSpan& next_tokens); + void Update(DeviceSpan next_tokens); auto& GetShape() const { return shape_; } const char* name_; @@ -31,6 +31,7 @@ struct InputIDs { std::array shape_{}; ONNXTensorElementDataType type_; std::unique_ptr value_; + std::unique_ptr cast_value_; // Used for decoding runs with cuda graphs. 
StaticBuffer* sb_input_ids_{}; diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 4fa3243e2..3f02d4db3 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -108,21 +108,13 @@ void KV_Cache_Combined::RewindPastTensorsTo(size_t index) { for (int i = 0; i < layer_count_; i++) { OrtValue& present = *presents_[i]; std::unique_ptr past = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + auto present_span = WrapTensor(*model_.p_device_, present); + auto past_span = WrapTensor(*model_.p_device_, *past); + for (int j = 0; j < 2 * batch_x_num_heads; j++) { - auto present_data = present.GetTensorData() + j * old_length_x_head_size; - auto past_data = past->GetTensorMutableData() + j * new_length_x_head_size; -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - cudaMemcpyAsync(past_data, present_data, new_length_x_head_size * sizeof(T), cudaMemcpyDeviceToDevice, model_.cuda_stream_); - } else -#elif USE_DML - if (model_.device_type_ == DeviceType::DML) { - // TODO: Implement DML version - } else -#endif - { - copy(std::span(present_data, new_length_x_head_size), std::span(past_data, new_length_x_head_size)); - } + auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size); + auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size); + past_data.CopyFrom(present_data); } pasts_[i] = std::move(past); state_.inputs_[input_index_ + i] = pasts_[i].get(); @@ -135,38 +127,22 @@ void KV_Cache_Combined::PickPastState(DeviceSpan beam_indices_device, i std::span beam_indices = beam_indices_device.CopyDeviceToCpu(); auto block_size_per_beam = shape_[2] * shape_[3] * shape_[4]; auto past_key_size = shape_[1] * block_size_per_beam; - auto element_count = shape_[0] * past_key_size; - const OrtValue& present = *presents_[index]; + OrtValue& present = *presents_[index]; std::unique_ptr past = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_); - auto past_span = std::span(past->GetTensorMutableData(), element_count); - auto present_span = std::span(present.GetTensorData(), element_count); - -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - for (size_t j = 0; j < beam_indices.size(); j++) { - int32_t beam_index = beam_indices[j]; - auto present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam); - auto present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam, block_size_per_beam); - - auto past_key = past_span.subspan(j * block_size_per_beam, block_size_per_beam); - auto past_value = past_span.subspan(past_key_size + j * block_size_per_beam, block_size_per_beam); - cudaMemcpyAsync(past_key.data(), present_key.data(), present_key.size_bytes(), cudaMemcpyDeviceToDevice, model_.cuda_stream_); - cudaMemcpyAsync(past_value.data(), present_value.data(), present_value.size_bytes(), cudaMemcpyDeviceToDevice, model_.cuda_stream_); - } - } else -#endif - { - for (size_t j = 0; j < beam_indices.size(); j++) { - int32_t const beam_index = beam_indices[j]; - auto present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam); - auto present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam, block_size_per_beam); - - auto past_key = past_span.subspan(j * block_size_per_beam, block_size_per_beam); - auto past_value = past_span.subspan(past_key_size + j * block_size_per_beam, block_size_per_beam); - copy(present_key, past_key); - copy(present_value, past_value); - } + + 
auto past_span = WrapTensor(*model_.p_device_, *past); + auto present_span = WrapTensor(*model_.p_device_, present); + + for (size_t j = 0; j < beam_indices.size(); j++) { + int32_t beam_index = beam_indices[j]; + auto present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam); + auto present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam, block_size_per_beam); + + auto past_key = past_span.subspan(j * block_size_per_beam, block_size_per_beam); + auto past_value = past_span.subspan(past_key_size + j * block_size_per_beam, block_size_per_beam); + past_key.CopyFrom(present_key); + past_value.CopyFrom(present_value); } pasts_[index] = std::move(past); @@ -325,21 +301,14 @@ void KV_Cache::RewindPastTensorsTo(size_t index) { for (int i = 0; i < layer_count_ * 2; i++) { OrtValue& present = *presents_[i]; std::unique_ptr past = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + + auto past_span = WrapTensor(*model_.p_device_, *past); + auto present_span = WrapTensor(*model_.p_device_, present); + for (int j = 0; j < batch_x_num_heads; j++) { - auto present_data = present.GetTensorData() + j * old_length_x_head_size; - auto past_data = past->GetTensorMutableData() + j * new_length_x_head_size; -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - cudaMemcpyAsync(past_data, present_data, new_length_x_head_size * sizeof(T), cudaMemcpyDeviceToDevice, model_.cuda_stream_); - } else -#elif USE_DML - if (model_.device_type_ == DeviceType::DML) { - // TODO: Implement DML copy - } else -#endif - { - copy(std::span(present_data, new_length_x_head_size), std::span(past_data, new_length_x_head_size)); - } + auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size); + auto past_data = past_span.subspan(j * new_length_x_head_size, new_length_x_head_size); + past_data.CopyFrom(present_data); } pasts_[i] = std::move(past); state_.inputs_[input_index_ + i] = pasts_[i].get(); @@ -351,32 +320,20 @@ template void KV_Cache::PickPastState(DeviceSpan beam_indices_device, int index) { std::span beam_indices = beam_indices_device.Span(); auto block_size_per_beam = shape_[1] * shape_[2] * shape_[3]; - auto element_count = shape_[0] * block_size_per_beam; - const OrtValue& present_value = *presents_[index]; + OrtValue& present_value = *presents_[index]; std::unique_ptr past_value = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_); - auto past_span = std::span(past_value->GetTensorMutableData(), element_count); - auto present_span = std::span(present_value.GetTensorData(), element_count); - -#if USE_CUDA - if (model_.device_type_ == DeviceType::CUDA) { - for (size_t j = 0; j < beam_indices.size(); j++) { - int32_t beam_index = beam_indices[j]; - auto present = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam); - auto past = past_span.subspan(j * block_size_per_beam, block_size_per_beam); - cudaMemcpyAsync(past.data(), present.data(), present.size_bytes(), cudaMemcpyDeviceToDevice, model_.cuda_stream_); - } - } else -#endif - { - for (size_t j = 0; j < beam_indices.size(); j++) { - int32_t const beam_index = beam_indices[j]; - auto present = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam); - auto past = past_span.subspan(j * block_size_per_beam, block_size_per_beam); - copy(present, past); - } - } + auto past_span = WrapTensor(*model_.p_device_, *past_value); + auto present_span = WrapTensor(*model_.p_device_, present_value); + + for 
(size_t j = 0; j < beam_indices.size(); j++) { + int32_t beam_index = beam_indices[j]; + auto present = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam); + auto past = past_span.subspan(j * block_size_per_beam, block_size_per_beam); + past.CopyFrom(present); + } + pasts_[index] = std::move(past_value); } diff --git a/src/models/logits.cpp b/src/models/logits.cpp index ae7d9bdcd..c270d6ccc 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -3,10 +3,6 @@ #include "../generators.h" #include "model.h" #include "logits.h" -#if USE_CUDA -#include "../cuda/cuda_common.h" -#include "kernels.h" -#endif namespace Generators { @@ -25,14 +21,12 @@ Logits::Logits(State& state) } } -#if USE_CUDA if (model_.device_type_ == DeviceType::CUDA && !model_.config_->model.eos_token_ids.empty()) { auto& cpu_ids = model_.config_->model.eos_token_ids; cuda_eos_token_ids_ = state_.params_->p_device->Allocate(cpu_ids.size()); copy(std::span{cpu_ids}, cuda_eos_token_ids_.CpuSpan()); cuda_eos_token_ids_.CopyCpuToDevice(); } -#endif input_sequence_lengths.resize(state_.params_->search.batch_size); } @@ -79,26 +73,23 @@ DeviceSpan Logits::Get() { // Convert from float16 to float32 if necessary if (type_ == Ort::TypeToTensorType) { - ConvertFp16ToFp32(*model_.allocator_device_, *logits_of_last_token, logits_of_last_token_fp32_, model_.device_type_, model_.cuda_stream_); - logits_of_last_token = logits_of_last_token_fp32_.get(); + Cast(*logits_of_last_token, logits_of_last_token_fp32_, *model_.p_device_, Ort::TypeToTensorType); + logits_of_last_token = logits_of_last_token_fp32_.get(); } if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) logits_ = WrapTensor(*state_.params_->p_device, *logits_of_last_token); -#if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) { if (!cuda_eos_token_ids_.empty()) - cuda::LaunchHandleEOSArray( + model_.p_device_->LaunchHandleEOSArray( logits_.Span().data(), static_cast(shape_[0]) /* batch_beam_size*/, static_cast(shape_[2]) /* vocab_size */, cuda_eos_token_ids_.Span().data(), - static_cast(cuda_eos_token_ids_.size()), - model_.cuda_stream_); + static_cast(cuda_eos_token_ids_.size())); return logits_; } -#endif HandleEOSArray(logits_.Span()); return logits_; diff --git a/src/models/logits.h b/src/models/logits.h index 187ecd8cb..3eae05875 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -43,9 +43,7 @@ struct Logits { StaticBuffer* sb_logits32_{}; StaticBuffer* sb_logits16_{}; -#if USE_CUDA DeviceSpan cuda_eos_token_ids_; // eos_token_ids from params, but in cuda accessible memory -#endif }; } // namespace Generators diff --git a/src/models/model.cpp b/src/models/model.cpp index 9270a2cc7..487874b25 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -11,7 +11,6 @@ #include "gpt.h" #include "decoder_only.h" #include "whisper.h" -#include "kernels.h" #include "multi_modal_vision_model.h" #include "decoder_only_pipeline.h" #if USE_DML @@ -228,7 +227,7 @@ Ort::Allocator* GetDeviceAllocator(OrtSession& session, DeviceType type) { static const char* device_type_names[static_cast(DeviceType::MAX)] = {"CPU", "Cuda", "DML", "WebGPU Buffer"}; auto memory_info = OrtMemoryInfo::Create(device_type_names[static_cast(type)], OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); device = Ort::Allocator::Create(session, *memory_info); - GetDeviceInterface(type)->InitAllocator(*device); + GetDeviceInterface(type)->InitOrt(*Ort::api, *device); } return device.get(); } @@ 
-284,9 +283,9 @@ Model::Model(std::unique_ptr config) : config_{std::move(config)} { Model::~Model() = default; -void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) { +void Model::InitDeviceAllocator(OrtSession& session) { allocator_device_ = &allocator_cpu_; - if (device_type_== DeviceType::CUDA) + if (device_type_ == DeviceType::CUDA) allocator_device_ = GetDeviceAllocator(session, device_type_); allocator_kvcache_ = allocator_device_; @@ -400,8 +399,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ p_device_ = GetDeviceInterface(device_type_); // Create and set our cudaStream_t - cuda_stream_ = p_device_->GetCudaStream(); - ort_provider_options->UpdateValue("user_compute_stream", cuda_stream_); + ort_provider_options->UpdateValue("user_compute_stream", p_device_->GetCudaStream()); } session_options.AppendExecutionProvider_CUDA_V2(*ort_provider_options); @@ -419,7 +417,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ session_options.AppendExecutionProvider_ROCM(ort_provider_options); #if USE_DML } else if (provider_options.name == "dml") { - if (!p_dml_api_) { + if (!GetDmlInterface()) { LUID device_luid{}; LUID* p_device_luid{}; for (const auto& [name, value] : provider_options.options) { @@ -537,80 +535,17 @@ std::shared_ptr CreateGeneratorParams(const Config& config) { return std::make_shared(config); } -void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream) { - auto shape_info = in.GetTensorTypeAndShapeInfo(); - auto shape = shape_info->GetShape(); - assert(shape_info->GetElementType() == Ort::TypeToTensorType); +void Cast(OrtValue& input, std::unique_ptr& output, DeviceInterface& device, ONNXTensorElementDataType output_type) { + auto input_info = input.GetTensorTypeAndShapeInfo(); + auto shape = input_info->GetShape(); - bool allocate_p_out = p_out == nullptr; - if (p_out) { - auto out_shape_info = p_out->GetTensorTypeAndShapeInfo(); - auto out_shape = out_shape_info->GetShape(); - allocate_p_out = shape != out_shape; - } - - if (allocate_p_out) - p_out = OrtValue::CreateTensor(allocator, shape); - - int count = static_cast(shape_info->GetElementCount()); - auto* fp16 = in.GetTensorData(); - auto* fp32 = p_out->GetTensorMutableData(); - - switch (device_type) { - case DeviceType::WEBGPU: - case DeviceType::DML: - // DML, WebGpu doesn't currently support on-device scoring, so we fall back to the CPU - case DeviceType::CPU: - for (int i = 0; i < count; i++) - fp32[i] = FastFloat16ToFloat32(fp16[i]); - break; + if (output && shape != output->GetTensorTypeAndShapeInfo()->GetShape()) + output = nullptr; + if (!output) + output = OrtValue::CreateTensor(device.GetAllocator(), shape, output_type); -#if USE_CUDA - case DeviceType::CUDA: - cuda::LaunchFp16ToFp32(fp16, fp32, count, stream); - break; -#endif - - default: - throw std::runtime_error("ConvertFp16ToFp32 - Unsupported device type"); - } -} - -void ConvertFp32ToFp16(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, - DeviceType device_type, cudaStream_t stream) { - auto shape_info = in.GetTensorTypeAndShapeInfo(); - auto shape = shape_info->GetShape(); - assert(shape_info->GetElementType() == Ort::TypeToTensorType); - - bool allocate_p_out = p_out == nullptr; - if (p_out) { - auto out_shape_info = p_out->GetTensorTypeAndShapeInfo(); - auto out_shape = out_shape_info->GetShape(); - allocate_p_out = shape != out_shape; - } - - if (allocate_p_out) - p_out = 
OrtValue::CreateTensor(allocator, shape); - - int count = static_cast(shape_info->GetElementCount()); - auto* fp32 = in.GetTensorData(); - auto* fp16 = p_out->GetTensorMutableData(); - - switch (device_type) { - case DeviceType::DML: - case DeviceType::CPU: - for (int i = 0; i < count; i++) - fp16[i] = FastFloat32ToFloat16(fp32[i]); - break; - -#if USE_CUDA - case DeviceType::CUDA: - cuda::LaunchFp32ToFp16(fp32, fp16, count, stream); -#endif - - default: - throw std::runtime_error("ConvertFp32ToFp16 - Unsupported device type"); - } + if(!device.Cast(input, *output)) + GetDeviceInterface(DeviceType::CPU)->Cast(input, *output); } std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, int num_beams) const { @@ -625,44 +560,21 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, auto input_type_info = input->GetTensorTypeAndShapeInfo(); auto element_type = input_type_info->GetElementType(); - auto element_size = SizeOf(element_type); auto input_shape = input_type_info->GetShape(); const int64_t batch_size = input_shape[0]; - const int64_t data_size_bytes = input_type_info->GetElementCount() * element_size / batch_size; + const int64_t data_size_bytes = input_type_info->GetElementCount() * SizeOf(element_type) / batch_size; input_shape[0] *= num_beams; auto& allocator = device_type_ == DeviceType::DML ? allocator_cpu_ : *allocator_device_; auto expanded = OrtValue::CreateTensor(allocator, input_shape, element_type); - const auto* input_data = reinterpret_cast(input->GetTensorRawData()); - auto* expanded_data = reinterpret_cast(expanded->GetTensorMutableRawData()); - auto* target = expanded_data; - - switch (device_type_) { - case DeviceType::WEBGPU: - case DeviceType::DML: - // DML and WebGpu doesn't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs - case DeviceType::CPU: - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < num_beams; j++) { - memcpy(target, input_data + i * data_size_bytes, data_size_bytes); - target += data_size_bytes; - } - } - break; - -#if USE_CUDA - case DeviceType::CUDA: - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < num_beams; j++) { - cudaMemcpyAsync(target, input_data + i * data_size_bytes, data_size_bytes, cudaMemcpyHostToDevice, cuda_stream_); - target += data_size_bytes; - } - } - break; -#endif - default: - throw std::runtime_error("ExpandInputs - Unsupported device type"); + auto input_span = ByteWrapTensor(*GetDeviceInterface(DeviceType::CPU), *input); + auto expanded_span = ByteWrapTensor(*p_device_, *expanded); + + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < num_beams; j++) { + expanded_span.subspan((i * num_beams + j) * data_size_bytes, data_size_bytes).CopyFrom(input_span.subspan(i * data_size_bytes, data_size_bytes)); + } } return expanded; } diff --git a/src/models/model.h b/src/models/model.h index 55ad960b3..63fd7749f 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -8,22 +8,11 @@ #include "audio_processor.h" #include "adapters.h" -#if USE_DML -#include "dml_provider_factory.h" -#include "../dml/dml_helpers.h" -#include "../dml/dml_execution_context.h" -#include "../dml/dml_pooled_upload_heap.h" -#include "../dml/dml_readback_heap.h" -#endif - namespace Generators { struct Tokenizer; -void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream); - -void ConvertFp32ToFp16(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream); - 
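// The single Cast() declared below replaces both ConvertFp16ToFp32 and ConvertFp32ToFp16.
// Callers name only the desired output element type; the helper reuses the output tensor when
// the shape still matches, otherwise reallocates it from the device allocator, then asks the
// model's DeviceInterface to perform the cast and falls back to the CPU implementation when
// DeviceInterface::Cast() returns false. A minimal usage sketch, mirroring the logits.cpp
// change in this series (the elided template arguments are assumed to be Ort::Float16_t and
// float):
//
//   if (type_ == Ort::TypeToTensorType<Ort::Float16_t>) {
//     Cast(*logits_of_last_token, logits_of_last_token_fp32_, *model_.p_device_,
//          Ort::TypeToTensorType<float>);
//     logits_of_last_token = logits_of_last_token_fp32_.get();
//   }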
+void Cast(OrtValue& input, std::unique_ptr& output, DeviceInterface& device, ONNXTensorElementDataType type); void CheckResult(extError_t error); struct State { @@ -145,7 +134,6 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr config_; std::unique_ptr session_options_; - cudaStream_t cuda_stream_{}; mutable DeviceInterface* p_device_{}; DeviceType device_type_{DeviceType::CPU}; Ort::Allocator& allocator_cpu_{Ort::Allocator::GetWithDefaultOptions()}; @@ -156,15 +144,6 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime -#if USE_DML - DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } - DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } - DmlPooledUploadHeap* GetDmlUploadHeap() const { return dml_pooled_upload_heap_.get(); } - const OrtDmlApi* GetOrtDmlApi() const { return p_dml_api_; } - IDMLDevice* GetDmlDevice() const { return dml_device_.Get(); } - ID3D12Device* GetD3D12Device() const { return dml_objects_.d3d12_device.Get(); } -#endif - protected: void InitDeviceAllocator(OrtSession& session); void CreateSessionOptions(); @@ -174,14 +153,6 @@ struct Model : std::enable_shared_from_this, LeakChecked { bool is_primary_session_options, bool disable_graph_capture); -#if USE_DML - mutable DmlObjects dml_objects_; - const OrtDmlApi* p_dml_api_{}; - std::unique_ptr dml_pooled_upload_heap_; - std::unique_ptr dml_execution_context_; - std::unique_ptr dml_readback_heap_; - ComPtr dml_device_; -#endif #if USE_WEBGPU std::unique_ptr webgpu_io_binding_; #endif diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 5d4c82684..4f57637a8 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -1,11 +1,6 @@ #include "../generators.h" #include "model.h" #include "position_inputs.h" -#include "kernels.h" - -#if USE_DML -#include "../dml/dml_update_mask_kernel.h" -#endif namespace Generators { @@ -49,12 +44,6 @@ PositionInputs::PositionInputs(const Model& model, State& state, DeviceSpansb_attention_mask_.get(); - -#if USE_DML - if (model_.device_type_ == DeviceType::DML) { - sb_attention_mask_next_ = state_.GetCapturedGraphInfo()->sb_attention_mask_next_.get(); - } -#endif } } } @@ -104,9 +93,7 @@ void PositionInputs::RewindTo(size_t index) { // Rewind the mask input to a previous state } else if (has_mask_input_) { if (attention_mask_shape_[0] == 1) { -#if USE_CUDA RewindMask(index); -#endif } else throw std::runtime_error("PositionInputs::RewindTo - Unsupported batch size"); } @@ -126,29 +113,6 @@ void PositionInputs::AddPositionIDs() { state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); } -#if USE_CUDA || USE_DML -void PositionInputs::CopyNextPositionIDsToCurrent() { -#if USE_CUDA - assert(model_.device_type_ == DeviceType::CUDA); - cudaMemcpyAsync(position_ids_->GetTensorMutableRawData(), - position_ids_next_->GetTensorMutableRawData(), - (type_ == Ort::TypeToTensorType ? 
sizeof(int32_t) : sizeof(int64_t)) * position_ids_shape_[0], - cudaMemcpyDeviceToDevice, - model_.cuda_stream_); -#elif USE_DML - assert(model_.device_type_ == DeviceType::DML); - ComPtr target_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, position_ids_->GetTensorMutableRawData(), &target_resource)); - auto source = std::span(position_ids_next_->GetTensorData(), (type_ == Ort::TypeToTensorType ? sizeof(int32_t) : sizeof(int64_t)) * position_ids_shape_[0]); - model_.GetDmlUploadHeap()->BeginUploadToGpu( - target_resource.Get(), - 0, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - source); -#endif -} -#endif - void PositionInputs::CreateNextPositionIDsTensor() { if (!sb_position_ids_) { if (position_ids_shape_[1] == 1 && position_ids_next_) { @@ -158,12 +122,12 @@ void PositionInputs::CreateNextPositionIDsTensor() { position_ids_ = OrtValue::CreateTensor(*model_.allocator_device_, position_ids_shape_, type_); } } else { -#if USE_CUDA || USE_DML position_ids_ = sb_position_ids_->CreateTensorOnStaticBuffer(position_ids_shape_, type_); if (position_ids_shape_[1] == 1) { - CopyNextPositionIDsToCurrent(); + auto position_ids_span = ByteWrapTensor(*model_.p_device_, *position_ids_); + auto position_ids_next_span = ByteWrapTensor(*model_.p_device_, *position_ids_next_); + position_ids_span.CopyFrom(position_ids_next_span); } -#endif } } @@ -182,25 +146,16 @@ void PositionInputs::UpdatePositionIDs(int total_length, int new_kv_length) { switch (model_.device_type_) { case DeviceType::WEBGPU: + case DeviceType::DML: case DeviceType::CPU: { type_ == Ort::TypeToTensorType ? UpdatePositionIDsImpl(total_length, new_kv_length) : UpdatePositionIDsImpl(total_length, new_kv_length); break; } -#if USE_CUDA case DeviceType::CUDA: { - if (type_ == Ort::TypeToTensorType) - cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, model_.cuda_stream_); - else - cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, model_.cuda_stream_); - break; - } -#elif USE_DML - case DeviceType::DML: { - UpdatePositionIDsImplDML(); + model_.p_device_->UpdatePositionIds(position_ids_->GetTensorMutableRawData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, type_); break; } -#endif default: throw std::runtime_error("PositionIDs::Update - Unsupported device type"); } @@ -210,22 +165,12 @@ void PositionInputs::CreateNextAttentionMaskTensor(int total_length) { if (!sb_attention_mask_) { attention_mask_shape_[1] = total_length; attention_mask_next_ = OrtValue::CreateTensor(*model_.allocator_device_, attention_mask_shape_, type_); -#if USE_DML - if (model_.device_type_ == DeviceType::DML) - attention_mask_ = OrtValue::CreateTensor(*model_.allocator_device_, attention_mask_shape_, type_); -#endif } else { -#if USE_CUDA attention_mask_shape_[1] = state_.params_->search.max_length; attention_mask_next_ = sb_attention_mask_->CreateTensorOnStaticBuffer(attention_mask_shape_, type_); if (is_first_mask_update_) { ByteWrapTensor(*model_.p_device_, *attention_mask_next_).Zero(); } -#elif USE_DML - attention_mask_shape_[1] = state_.params_->search.max_length; - attention_mask_ = sb_attention_mask_->CreateTensorOnStaticBuffer(attention_mask_shape_, type_); - attention_mask_next_ = sb_attention_mask_next_->CreateTensorOnStaticBuffer(attention_mask_shape_, type_); -#endif } } @@ -240,52 +185,31 @@ void 
PositionInputs::UpdateAttentionMask(int total_length, int new_kv_length) { switch (model_.device_type_) { case DeviceType::WEBGPU: + case DeviceType::DML: case DeviceType::CPU: { type_ == Ort::TypeToTensorType ? UpdateAttentionMaskImpl(total_length) : UpdateAttentionMaskImpl(total_length); break; } -#if USE_CUDA case DeviceType::CUDA: { - int max_length = sb_attention_mask_ ? state_.params_->search.max_length : total_length; + int max_length = sb_attention_mask_ ? state_.params_->search.max_length : total_length; bool update_only = sb_attention_mask_ && !is_first_mask_update_; - if (type_ == Ort::TypeToTensorType) { - cuda::Launch_UpdateAttentionMask(attention_mask_next_->GetTensorMutableData(), - attention_mask_->GetTensorData(), - static_cast(attention_mask_shape_[0]), - new_kv_length, - total_length, - max_length, - update_only, - model_.cuda_stream_); - } else { - cuda::Launch_UpdateAttentionMask(attention_mask_next_->GetTensorMutableData(), - attention_mask_->GetTensorData(), - static_cast(attention_mask_shape_[0]), - new_kv_length, - total_length, - max_length, - update_only, - model_.cuda_stream_); - } + model_.p_device_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), + attention_mask_->GetTensorRawData(), + static_cast(attention_mask_shape_[0]), + new_kv_length, + total_length, + max_length, + update_only, + type_); break; } -#elif USE_DML - case DeviceType::DML: { - UpdateAttentionMaskImplDML(total_length); - break; - } -#endif + default: throw std::runtime_error("PositionInputs::Update - Unsupported device type"); } -#if USE_DML - if (model_.device_type_ != DeviceType::DML) { - attention_mask_ = std::move(attention_mask_next_); - } -#else + attention_mask_ = std::move(attention_mask_next_); -#endif state_.inputs_[mask_input_index_] = attention_mask_.get(); is_first_mask_update_ = false; } @@ -365,25 +289,6 @@ void PositionInputs::UpdatePositionIDsImpl(int total_length, int new_kv_length) } } -#if USE_DML -void PositionInputs::UpdatePositionIDsImplDML() { - ComPtr target_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, position_ids_->GetTensorMutableRawData(), &target_resource)); - - dml_update_position_ids_kernel_ = DmlIncrementValuesKernel( - model_.GetD3D12Device(), - model_.GetDmlExecutionContext(), - static_cast(position_ids_shape_[0]), - type_, - target_resource.Get()); - - // Execute the cached command list - ComPtr fence; - uint64_t completion_value; - model_.GetDmlExecutionContext()->ExecuteCommandList(dml_update_position_ids_kernel_->GetCommandList(), &fence, &completion_value); -} -#endif - template void PositionInputs::UpdateAttentionMaskImpl(int total_length) { auto* data = attention_mask_next_->GetTensorMutableData(); @@ -403,44 +308,10 @@ void PositionInputs::UpdateAttentionMaskImpl(int total_length) { } } -#if USE_DML -void PositionInputs::UpdateAttentionMaskImplDML(int total_length) { - ComPtr attention_mask_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, attention_mask_->GetTensorMutableRawData(), &attention_mask_resource)); - ComPtr attention_mask_next_resource; - Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, attention_mask_next_->GetTensorMutableRawData(), &attention_mask_next_resource)); - if (is_first_mask_update_) { - dml_update_mask_kernel_ = DmlUpdateMaskKernel( - model_.GetD3D12Device(), - model_.GetDmlExecutionContext(), - static_cast(attention_mask_shape_[0]), 
- static_cast(attention_mask_shape_[1]), - type_, - total_length, - attention_mask_resource.Get(), - attention_mask_next_resource.Get()); - is_second_mask_update_ = true; - } else if (is_second_mask_update_) { - dml_update_mask_kernel_ = DmlUpdateMaskKernel( - model_.GetD3D12Device(), - model_.GetDmlExecutionContext(), - static_cast(attention_mask_shape_[0]), - static_cast(attention_mask_shape_[1]), - type_, - 1, - attention_mask_resource.Get(), - attention_mask_next_resource.Get()); - is_second_mask_update_ = false; - } - ComPtr fence; - uint64_t completion_value; - model_.GetDmlExecutionContext()->ExecuteCommandList(dml_update_mask_kernel_->GetCommandList(), &fence, &completion_value); -} -#endif - -#if USE_CUDA void PositionInputs::RewindMask(size_t index) { if (sb_attention_mask_ && !is_first_mask_update_) { + throw std::runtime_error("PositionInputs::RewindMask - Static buffer is not supported for continuous decoding."); +#if 0 // TODO: Fix implementation, cudaMemsetAsync of 1 is setting bytes of 1 vs int32's of 1 int past_length = static_cast(index); int max_length = static_cast(state_.params_->search.max_length); cudaMemsetAsync(attention_mask_->GetTensorMutableRawData(), @@ -451,8 +322,8 @@ void PositionInputs::RewindMask(size_t index) { 1, (type_ == Ort::TypeToTensorType ? sizeof(int32_t) : sizeof(int64_t)) * past_length, model_.cuda_stream_); +#endif } } -#endif } // namespace Generators diff --git a/src/models/position_inputs.h b/src/models/position_inputs.h index 64a38779c..6c297cc85 100644 --- a/src/models/position_inputs.h +++ b/src/models/position_inputs.h @@ -1,12 +1,6 @@ #pragma once - #include "static_buffer.h" -#if USE_DML -#include "../dml/dml_update_mask_kernel.h" -#include "../dml/dml_increment_values_kernel.h" -#endif - namespace Generators { struct PositionInputs { @@ -39,18 +33,7 @@ struct PositionInputs { template void UpdateAttentionMaskImpl(int total_length); -#if USE_CUDA || USE_DML - void CopyNextPositionIDsToCurrent(); -#endif - -#if USE_DML - void UpdatePositionIDsImplDML(); - void UpdateAttentionMaskImplDML(int total_length); -#endif - -#if USE_CUDA void RewindMask(size_t index); -#endif const Model& model_; State& state_; @@ -77,13 +60,6 @@ struct PositionInputs { bool is_first_mask_update_{true}; bool is_first_update_{true}; - -#if USE_DML - std::optional dml_update_mask_kernel_; - StaticBuffer* sb_attention_mask_next_{}; - std::optional dml_update_position_ids_kernel_; - bool is_second_mask_update_{}; -#endif }; } // namespace Generators diff --git a/src/models/prompt_image_processor.cpp b/src/models/prompt_image_processor.cpp index 249711a0b..33ed2cfa3 100644 --- a/src/models/prompt_image_processor.cpp +++ b/src/models/prompt_image_processor.cpp @@ -89,7 +89,7 @@ std::unique_ptr ProcessPixelValues(ortc::Tensor* pixel_values, allocator.GetInfo(), std::span(const_cast(pixel_values->Data()), pixel_values->NumberOfElement()), pixel_values->Shape()); - ConvertFp32ToFp16(allocator, *pixel_values_fp32, pixel_values_value, DeviceType::CPU, nullptr); + Cast(*pixel_values_fp32, pixel_values_value, *GetDeviceInterface(DeviceType::CPU), Ort::TypeToTensorType); } return pixel_values_value; diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index 723b7aeda..ec6cf42bf 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -3,10 +3,6 @@ #include "../generators.h" #include "whisper.h" #include -#include "kernels.h" -#if USE_CUDA -#include "../cuda/cuda_common.h" -#endif namespace Generators { @@ -42,7 +38,7 @@ Whisper_State::Whisper_State(const 
Whisper_Model& model, DeviceSpan seq } if (inputs.alignment_heads != nullptr) { -#if USE_CUDA +#if 0 // USE_CUDA auto alignment_heads_type_and_shape_info = inputs.alignment_heads->ort_tensor_->GetTensorTypeAndShapeInfo(); auto alignment_heads_type = alignment_heads_type_and_shape_info->GetElementType(); // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 auto alignment_heads_shape = alignment_heads_type_and_shape_info->GetShape(); @@ -101,7 +97,7 @@ Whisper_State::Whisper_State(const Whisper_Model& model, DeviceSpan seq } } -#if USE_CUDA +#if 0 // USE_CUDA template void TransposeKCacheForDMMHA(T* dest_data, T* temp_buffer, @@ -147,7 +143,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne const auto copy_data_size_all = src_shape_info->GetElementCount() * SizeOf(src_shape_info->GetElementType()); -#if USE_CUDA +#if 0 // USE_CUDA const auto src_dims = src_shape_info->GetShape(); const auto src_element_type = src_shape_info->GetElementType(); const auto src_element_size = SizeOf(src_element_type); @@ -187,7 +183,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne auto dest_data = presents_[i]->GetTensorMutableRawData(); switch (model_.device_type_) { -#if USE_CUDA +#if 0 // USE_CUDA case DeviceType::CUDA: if (self_attn_kv_cache_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { // CUDA EP + FP16 precision == `DecoderMaskedMultiHeadAttention` op is used @@ -228,7 +224,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne } } -#if USE_CUDA +#if 0 // USE_CUDA if (self_attn_kv_cache_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 && model_.device_type_ == DeviceType::CUDA) { // Transpose cross attention K caches for `DecoderMaskedMultiHeadAttention` @@ -331,7 +327,7 @@ void Whisper_State::UpdateInputsOutputs(DeviceSpan& next_tokens, Device } if (cache_indirection_) { -#if USE_CUDA +#if 0 // USE_CUDA auto beam_indices_gpu = gpu_span{beam_indices.Span()}; if (beam_indices_gpu.empty()) { auto beam_indices_cpu = beam_indices.CpuSpan(); @@ -359,7 +355,7 @@ void Whisper_State::UpdateInputsOutputs(DeviceSpan& next_tokens, Device } if (output_cross_qk_.size() && alignment_heads_) { -#if USE_CUDA +#if 0 // USE_CUDA // Collect a GPU array of float* pointers from the vector of OrtValues to pass to the kernel auto output_cross_qk_ptrs = cross_qk_ptrs_gpu_.CpuSpan(); assert(output_cross_qk_ptrs.size() == output_cross_qk_.size()); @@ -390,7 +386,7 @@ void Whisper_State::Initialize(DeviceSpan& next_tokens, int total_lengt void Whisper_State::Finalize() { if (output_cross_qk_.size() && alignment_heads_) { -#if USE_CUDA +#if 0 // USE_CUDA int decoded_length = *(past_sequence_length_->GetTensorMutableData()) + 1; auto output_cross_qk_dims = output_cross_qk_[0]->GetTensorTypeAndShapeInfo()->GetShape(); diff --git a/src/smartptrs.h b/src/smartptrs.h index fbaf12ed2..a25c8c133 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -85,7 +85,8 @@ struct DeviceSpan { struct DeviceInterface { virtual ~DeviceInterface() {} - virtual void InitAllocator(Ort::Allocator& allocator) = 0; + virtual void InitOrt(const OrtApi& api, Ort::Allocator& allocator) = 0; + virtual Ort::Allocator& GetAllocator() = 0; template DeviceSpan Allocate(size_t count) { return DeviceSpan(AllocateBase(sizeof(T) * count)); } @@ -101,7 +102,18 @@ struct DeviceInterface { virtual void Synchronize() = 0; // Synchronize the device, typically used for timing or debugging - virtual cudaStream_t GetCudaStream() { + virtual bool Cast(OrtValue& /*input*/, OrtValue& /*output*/) { return false; } + + virtual 
void UpdatePositionIds(void* /*position_ids*/, int /*batch_beam_size*/, int /*total_length*/, int /*new_kv_length*/, ONNXTensorElementDataType /*type*/) { assert(false); } + virtual void UpdateAttentionMask(void* /*mask_data*/, const void* /*old_data*/, int /*batch_beam_size*/, int /*new_kv_length*/, int /*total_length*/, int /*max_length*/, bool /*update_only*/, ONNXTensorElementDataType /*type*/) { assert(false); } + + virtual void LaunchHandleEOSArray(float* /*batch_logits*/, int /*batch_beam_size*/, int /*vocab_size*/, const int32_t* /*eos_token_ids*/, int /*eos_token_ids_count*/) { assert(false); } + virtual void UpdateCacheIndirectionKernelLauncher(int32_t* /*tgt_indir_cache*/, const int32_t* /*src_indir_cache*/, const int32_t* /*beam_ids*/, int /*batch_size*/, int /*beam_width*/, int /*input_seq_length*/, int /*max_seq_length*/, int /*current_length*/) { assert(false); } + virtual void ReorderPastStatesKernelLauncher(void* /*out_buffer*/, const void* /*in_buffer*/, int /*batch_size*/, int /*num_heads*/, int /*max_length*/, int /*head_size*/, int /*chunk_size*/) { assert(false); } + virtual void LaunchCopyCrossQKSingleDecodeStep(float* /*cross_qk_buffer_data*/, float** /*qk_layer_pointers*/, int /*token_index*/, int /*batch_beam_size*/, int /*num_layers*/, int /*num_heads*/, int /*num_alignment_heads*/, const int* /*alignment_heads*/, int /*frames*/, int /*max_length*/) { assert(false); } + virtual void LaunchFinalizeCrossQK(int /*iteration_number*/, int /*context_decoding_len*/, int /*batch_size*/, int /*num_beams*/, int /*max_length*/, int /*num_alignment_heads*/, int /*frames_of_k*/, const float* /*cross_qk_buffer_data*/, float* /*cross_qk_output*/, int /*num_return_sequences*/, const int* /*cache_indir_data*/) { assert(false); } + + virtual void* GetCudaStream() { assert(false); return nullptr; } // Temporary until we fully factor out providers diff --git a/test/model_tests.cpp b/test/model_tests.cpp index 321d1ac46..9482a0998 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -18,12 +18,6 @@ #define PHI2_PATH MODEL_PATH "phi-2/int4/cpu" #endif #endif -#if USE_DML -#include -#include -#include -#include -#endif // To generate this file: // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 @@ -35,7 +29,7 @@ static const std::pair c_tiny_gpt2_model_paths[] = { #if USE_DML TEST(ModelTests, DMLAdapterSelection) { -#if TEST_PHI2 +#if 0 // TEST_PHI2 TODO: Remove this? Can't access the device directly anymore. 
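// (The test body below stays under #if 0 because Model no longer exposes the D3D12 device
// directly; device-specific access now goes through DeviceInterface.)
//
// A minimal sketch of what a GPU backend supplies for the hooks added to src/smartptrs.h
// above, whose base-class defaults assert(false). The struct name is made up, the bodies are
// placeholders, and the remaining pure-virtual allocation/synchronization members are omitted
// for brevity:
struct SketchDeviceInterface : Generators::DeviceInterface {
  // Return true after converting on-device; returning false makes Cast() in model.cpp fall
  // back to the CPU implementation.
  bool Cast(OrtValue& input, OrtValue& output) override { return false; }

  // Dispatch to an int32_t or int64_t kernel depending on 'type'.
  void UpdatePositionIds(void* position_ids, int batch_beam_size, int total_length,
                         int new_kv_length, ONNXTensorElementDataType type) override {}

  // Adjust logits for the configured eos token ids directly in device memory.
  void LaunchHandleEOSArray(float* batch_logits, int batch_beam_size, int vocab_size,
                            const int32_t* eos_token_ids, int eos_token_ids_count) override {}
};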
auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH); auto d3d12Device = model->GetD3D12Device(); From bdbb09c80a9aa4a3e37461d4b1b05819a89072da Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 15 Jan 2025 16:04:50 -0800 Subject: [PATCH 04/31] Fix merge build issues --- src/models/input_ids.cpp | 4 +-- src/models/input_ids.h | 4 +-- src/models/model.h | 4 --- src/models/position_inputs.cpp | 62 ++++++++++++---------------------- src/ort_genai_c.cpp | 7 ++++ 5 files changed, 32 insertions(+), 49 deletions(-) diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 253fbc2bf..133be44f5 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -44,7 +44,7 @@ void DefaultInputIDs::Add() { } } -void InputIDs::Update(DeviceSpan new_tokens) { +void DefaultInputIDs::Update(DeviceSpan new_tokens) { auto new_tokens_cpu = new_tokens.CopyDeviceToCpu(); const auto get_unpadded_sequence_length = [](std::span input_ids, int32_t pad_token_id) { @@ -130,7 +130,7 @@ void WindowedInputIDs::Add() { state_.input_names_.push_back(name_); } -void WindowedInputIDs::Update(DeviceSpan& new_tokens) { +void WindowedInputIDs::Update(DeviceSpan new_tokens) { if (window_index_ == 0) { num_windows_ = (new_tokens.size() + window_size_ - 1) / window_size_; diff --git a/src/models/input_ids.h b/src/models/input_ids.h index b81718022..cc9ab5640 100644 --- a/src/models/input_ids.h +++ b/src/models/input_ids.h @@ -8,7 +8,7 @@ struct InputIDs { virtual ~InputIDs() = default; virtual void Add() = 0; virtual std::array GetShape() const = 0; - virtual void Update(DeviceSpan& next_tokens) = 0; + virtual void Update(DeviceSpan next_tokens) = 0; }; struct DefaultInputIDs : InputIDs { @@ -60,7 +60,7 @@ struct WindowedInputIDs : public InputIDs { WindowedInputIDs& operator=(const WindowedInputIDs&) = delete; void Add() override; - void Update(DeviceSpan& next_tokens) override; + void Update(DeviceSpan next_tokens) override; std::array GetShape() const override { return shape_; } private: diff --git a/src/models/model.h b/src/models/model.h index 034c0defb..9635e79d2 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -155,10 +155,6 @@ struct Model : std::enable_shared_from_this, LeakChecked { bool is_primary_session_options, bool disable_graph_capture); -#endif -#if USE_DML || USE_WEBGPU - std::unique_ptr memory_info_device_; -#endif std::shared_ptr captured_graph_pool_; std::map> pipeline_session_options_; }; diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 6e02d92d7..76372bd54 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -113,7 +113,7 @@ void DefaultPositionInputs::AddPositionIDs() { state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); } -void PositionInputs::CreateNextPositionIDsTensor() { +void DefaultPositionInputs::CreateNextPositionIDsTensor() { if (!sb_position_ids_) { if (position_ids_shape_[1] == 1 && position_ids_next_) { position_ids_ = std::move(position_ids_next_); @@ -142,20 +142,11 @@ void DefaultPositionInputs::UpdatePositionIDs(int total_length, int new_kv_lengt state_.inputs_[posid_input_index_] = position_ids_.get(); } - switch (model_.device_type_) { - case DeviceType::WEBGPU: - case DeviceType::DML: - case DeviceType::CPU: { - type_ == Ort::TypeToTensorType ? 
UpdatePositionIDsImpl(total_length, new_kv_length) - : UpdatePositionIDsImpl(total_length, new_kv_length); - break; - } - case DeviceType::CUDA: { - model_.p_device_->UpdatePositionIds(position_ids_->GetTensorMutableRawData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, type_); - break; - } - default: - throw std::runtime_error("PositionIDs::Update - Unsupported device type"); + if (model_.device_type_ == DeviceType::CUDA) + model_.p_device_->UpdatePositionIds(position_ids_->GetTensorMutableRawData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, type_); + else { + type_ == Ort::TypeToTensorType ? UpdatePositionIDsImpl(total_length, new_kv_length) + : UpdatePositionIDsImpl(total_length, new_kv_length); } } @@ -181,31 +172,20 @@ void DefaultPositionInputs::UpdateAttentionMask(int total_length, int new_kv_len CreateNextAttentionMaskTensor(total_length); state_.inputs_[mask_input_index_] = attention_mask_.get(); - switch (model_.device_type_) { - case DeviceType::WEBGPU: - case DeviceType::DML: - case DeviceType::QNN: - case DeviceType::CPU: { - type_ == Ort::TypeToTensorType ? UpdateAttentionMaskImpl(total_length) - : UpdateAttentionMaskImpl(total_length); - break; - } - case DeviceType::CUDA: { - int max_length = sb_attention_mask_ ? state_.params_->search.max_length : total_length; - bool update_only = sb_attention_mask_ && !is_first_mask_update_; - model_.p_device_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), - attention_mask_->GetTensorRawData(), - static_cast(attention_mask_shape_[0]), - new_kv_length, - total_length, - max_length, - update_only, - type_); - break; - } - - default: - throw std::runtime_error("DefaultPositionInputs::Update - Unsupported device type"); + if (model_.device_type_ == DeviceType::CUDA) { + int max_length = sb_attention_mask_ ? state_.params_->search.max_length : total_length; + bool update_only = sb_attention_mask_ && !is_first_mask_update_; + model_.p_device_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), + attention_mask_->GetTensorRawData(), + static_cast(attention_mask_shape_[0]), + new_kv_length, + total_length, + max_length, + update_only, + type_); + } else { + type_ == Ort::TypeToTensorType ? 
UpdateAttentionMaskImpl(total_length) + : UpdateAttentionMaskImpl(total_length); } attention_mask_ = std::move(attention_mask_next_); @@ -307,7 +287,7 @@ void DefaultPositionInputs::UpdateAttentionMaskImpl(int total_length) { } } -void PositionInputs::RewindMask(size_t index) { +void DefaultPositionInputs::RewindMask(size_t index) { if (sb_attention_mask_ && !is_first_mask_update_) { throw std::runtime_error("PositionInputs::RewindMask - Static buffer is not supported for continuous decoding."); #if 0 // TODO: Fix implementation, cudaMemsetAsync of 1 is setting bytes of 1 vs int32's of 1 diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 0da495420..82fac1c7d 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -339,6 +339,13 @@ OgaResult* OGA_API_CALL OgaGenerator_GetOutput(const OgaGenerator* oga_generator auto copy_span = Generators::ByteWrapTensor(*Generators::GetDeviceInterface(Generators::DeviceType::CPU), *ortvalue_clone); copy_span.CopyFrom(output_span); + auto tensor = std::make_shared(std::move(ortvalue_clone)); + tensor->external_owner_ = tensor; + *out = reinterpret_cast(tensor.get()); + return nullptr; + OGA_CATCH +} + OgaResult* OGA_API_CALL OgaGenerator_GetLogits(OgaGenerator* oga_generator, OgaTensor** out) { OGA_TRY auto generator = reinterpret_cast(oga_generator); From 66321dd8fa84457e7839934156fabbbf13455552 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 15 Jan 2025 23:08:47 -0800 Subject: [PATCH 05/31] Formatting --- src/dml/interface.cpp | 3 +-- src/dml/interface.h | 2 +- src/generators.h | 5 +++-- src/models/input_ids.cpp | 2 +- src/models/kv_cache.cpp | 2 +- src/models/logits.cpp | 2 -- src/models/model.cpp | 2 +- src/models/whisper.cpp | 16 ++++++++-------- src/smartptrs.h | 2 +- 9 files changed, 17 insertions(+), 19 deletions(-) diff --git a/src/dml/interface.cpp b/src/dml/interface.cpp index 459a71054..58a3457e2 100644 --- a/src/dml/interface.cpp +++ b/src/dml/interface.cpp @@ -17,7 +17,7 @@ std::string CurrentModulePath(); namespace Generators { -namespace Dml { // If this was in a shared library it wouldn't need to be in its own namespace +namespace Dml { // If this was in a shared library it wouldn't need to be in its own namespace Ort::Allocator* ort_allocator_{}; const char* label_dml = "dml"; @@ -95,7 +95,6 @@ struct GpuMemory final : DeviceBuffer { }; struct DmlInterfaceImpl : DeviceInterface { - DmlInterfaceImpl(LUID* p_device_luid) { Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&dml_api_))); if (!dml_api_) { diff --git a/src/dml/interface.h b/src/dml/interface.h index c70faf721..6f745ce13 100644 --- a/src/dml/interface.h +++ b/src/dml/interface.h @@ -8,4 +8,4 @@ void SetDmlProvider(OrtSessionOptions& options); DeviceInterface* GetDmlInterface(); -} \ No newline at end of file +} // namespace Generators \ No newline at end of file diff --git a/src/generators.h b/src/generators.h index c7b2875d0..9efe85d2d 100644 --- a/src/generators.h +++ b/src/generators.h @@ -50,10 +50,10 @@ DeviceSpan WrapTensor(DeviceInterface& device, OrtValue& value) { DeviceSpan ByteWrapTensor(DeviceInterface& device, OrtValue& value); -template +template struct OrtTensor { OrtTensor(std::unique_ptr ort_value, DeviceInterface& device) - : ort_value_{std::move(ort_value)}, device_span_{WrapTensor(device, *ort_value_)} {} + : ort_value_{std::move(ort_value)}, device_span_{WrapTensor(device, *ort_value_)} {} operator OrtValue*() { return ort_value_.get(); } @@ -151,6 +151,7 @@ struct OrtGlobals { std::unique_ptr 
env_; std::unique_ptr allocator_device_[static_cast(DeviceType::MAX)]; + private: OrtGlobals(const OrtGlobals&) = delete; void operator=(const OrtGlobals&) = delete; diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 133be44f5..38bdba445 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -81,7 +81,7 @@ void DefaultInputIDs::Update(DeviceSpan new_tokens) { } // Update input_ids with next tokens - auto data_span = WrapTensor(*model_.p_device_, *value_); + auto data_span = WrapTensor(*model_.p_device_, *value_); // For beam search if (is_prompt_ && state_.params_->search.num_beams > 1) { diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index a8eff8005..51210ae4e 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -341,7 +341,7 @@ void DefaultKeyValueCache::PickPastState(DeviceSpan beam_indices_device auto past = past_span.subspan(j * block_size_per_beam, block_size_per_beam); past.CopyFrom(present); } - + pasts_[index] = std::move(past_value); } diff --git a/src/models/logits.cpp b/src/models/logits.cpp index cd28c5b31..4a7ac7f6a 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -12,7 +12,6 @@ Logits::Logits(State& state) type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} { output_raw_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); - if (model_.device_type_ == DeviceType::CUDA && !model_.config_->model.eos_token_ids.empty()) { auto& cpu_ids = model_.config_->model.eos_token_ids; cuda_eos_token_ids_ = state_.params_->p_device->Allocate(cpu_ids.size()); @@ -52,7 +51,6 @@ DeviceSpan Logits::Get() { // Find the first non pad token from the end size_t token_index = input_sequence_lengths[batch_index] - 1; for (int beam_index = 0; beam_index < num_beams; beam_index++) { - auto target = logits_last_tokens.subspan(vocab_index * element_size, vocab_size * element_size); auto source = logits_raw.subspan((vocab_index * seq_length + token_index * vocab_size) * element_size, vocab_size * element_size); target.CopyFrom(source); diff --git a/src/models/model.cpp b/src/models/model.cpp index 29869f7e7..fd552b7c8 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -564,7 +564,7 @@ void Cast(OrtValue& input, std::unique_ptr& output, DeviceInterface& d if (!output) output = OrtValue::CreateTensor(device.GetAllocator(), shape, output_type); - if(!device.Cast(input, *output)) + if (!device.Cast(input, *output)) GetDeviceInterface(DeviceType::CPU)->Cast(input, *output); } diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index ec6cf42bf..6c46f218d 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -38,7 +38,7 @@ Whisper_State::Whisper_State(const Whisper_Model& model, DeviceSpan seq } if (inputs.alignment_heads != nullptr) { -#if 0 // USE_CUDA +#if 0 // USE_CUDA auto alignment_heads_type_and_shape_info = inputs.alignment_heads->ort_tensor_->GetTensorTypeAndShapeInfo(); auto alignment_heads_type = alignment_heads_type_and_shape_info->GetElementType(); // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 auto alignment_heads_shape = alignment_heads_type_and_shape_info->GetShape(); @@ -97,7 +97,7 @@ Whisper_State::Whisper_State(const Whisper_Model& model, DeviceSpan seq } } -#if 0 // USE_CUDA +#if 0 // USE_CUDA template void TransposeKCacheForDMMHA(T* dest_data, T* temp_buffer, @@ -143,7 +143,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne const auto copy_data_size_all = src_shape_info->GetElementCount() * 
SizeOf(src_shape_info->GetElementType()); -#if 0 // USE_CUDA +#if 0 // USE_CUDA const auto src_dims = src_shape_info->GetShape(); const auto src_element_type = src_shape_info->GetElementType(); const auto src_element_size = SizeOf(src_element_type); @@ -183,7 +183,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne auto dest_data = presents_[i]->GetTensorMutableRawData(); switch (model_.device_type_) { -#if 0 // USE_CUDA +#if 0 // USE_CUDA case DeviceType::CUDA: if (self_attn_kv_cache_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { // CUDA EP + FP16 precision == `DecoderMaskedMultiHeadAttention` op is used @@ -224,7 +224,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne } } -#if 0 // USE_CUDA +#if 0 // USE_CUDA if (self_attn_kv_cache_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 && model_.device_type_ == DeviceType::CUDA) { // Transpose cross attention K caches for `DecoderMaskedMultiHeadAttention` @@ -327,7 +327,7 @@ void Whisper_State::UpdateInputsOutputs(DeviceSpan& next_tokens, Device } if (cache_indirection_) { -#if 0 // USE_CUDA +#if 0 // USE_CUDA auto beam_indices_gpu = gpu_span{beam_indices.Span()}; if (beam_indices_gpu.empty()) { auto beam_indices_cpu = beam_indices.CpuSpan(); @@ -355,7 +355,7 @@ void Whisper_State::UpdateInputsOutputs(DeviceSpan& next_tokens, Device } if (output_cross_qk_.size() && alignment_heads_) { -#if 0 // USE_CUDA +#if 0 // USE_CUDA // Collect a GPU array of float* pointers from the vector of OrtValues to pass to the kernel auto output_cross_qk_ptrs = cross_qk_ptrs_gpu_.CpuSpan(); assert(output_cross_qk_ptrs.size() == output_cross_qk_.size()); @@ -386,7 +386,7 @@ void Whisper_State::Initialize(DeviceSpan& next_tokens, int total_lengt void Whisper_State::Finalize() { if (output_cross_qk_.size() && alignment_heads_) { -#if 0 // USE_CUDA +#if 0 // USE_CUDA int decoded_length = *(past_sequence_length_->GetTensorMutableData()) + 1; auto output_cross_qk_dims = output_cross_qk_[0]->GetTensorTypeAndShapeInfo()->GetShape(); diff --git a/src/smartptrs.h b/src/smartptrs.h index a25c8c133..1e2a79f47 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -110,7 +110,7 @@ struct DeviceInterface { virtual void LaunchHandleEOSArray(float* /*batch_logits*/, int /*batch_beam_size*/, int /*vocab_size*/, const int32_t* /*eos_token_ids*/, int /*eos_token_ids_count*/) { assert(false); } virtual void UpdateCacheIndirectionKernelLauncher(int32_t* /*tgt_indir_cache*/, const int32_t* /*src_indir_cache*/, const int32_t* /*beam_ids*/, int /*batch_size*/, int /*beam_width*/, int /*input_seq_length*/, int /*max_seq_length*/, int /*current_length*/) { assert(false); } virtual void ReorderPastStatesKernelLauncher(void* /*out_buffer*/, const void* /*in_buffer*/, int /*batch_size*/, int /*num_heads*/, int /*max_length*/, int /*head_size*/, int /*chunk_size*/) { assert(false); } - virtual void LaunchCopyCrossQKSingleDecodeStep(float* /*cross_qk_buffer_data*/, float** /*qk_layer_pointers*/, int /*token_index*/, int /*batch_beam_size*/, int /*num_layers*/, int /*num_heads*/, int /*num_alignment_heads*/, const int* /*alignment_heads*/, int /*frames*/, int /*max_length*/) { assert(false); } + virtual void LaunchCopyCrossQKSingleDecodeStep(float* /*cross_qk_buffer_data*/, float** /*qk_layer_pointers*/, int /*token_index*/, int /*batch_beam_size*/, int /*num_layers*/, int /*num_heads*/, int /*num_alignment_heads*/, const int* /*alignment_heads*/, int /*frames*/, int /*max_length*/) { assert(false); } virtual void LaunchFinalizeCrossQK(int 
/*iteration_number*/, int /*context_decoding_len*/, int /*batch_size*/, int /*num_beams*/, int /*max_length*/, int /*num_alignment_heads*/, int /*frames_of_k*/, const float* /*cross_qk_buffer_data*/, float* /*cross_qk_output*/, int /*num_return_sequences*/, const int* /*cache_indir_data*/) { assert(false); } virtual void* GetCudaStream() { From 0bc39a5b7e4fb9df026b184647e20d8248810c9f Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 16 Jan 2025 20:15:19 -0800 Subject: [PATCH 06/31] Build fixes --- src/dml/interface.h | 4 ++++ src/generators.cpp | 3 ++- src/models/input_ids.h | 2 +- src/models/model.cpp | 5 +---- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/dml/interface.h b/src/dml/interface.h index 6f745ce13..5c43aa11b 100644 --- a/src/dml/interface.h +++ b/src/dml/interface.h @@ -1,6 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#ifndef _WIN32 +using LUID = void; +#endif + namespace Generators { void InitDmlInterface(LUID* p_device_luid); diff --git a/src/generators.cpp b/src/generators.cpp index c5ad02e45..e7a781642 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -164,8 +164,9 @@ std::string to_string(DeviceType device_type) { return "WebGpu"; case DeviceType::QNN: return "QnnWithSharedMemory"; + default: + throw std::runtime_error("Unknown device type"); } - throw std::runtime_error("Unknown device type"); } DeviceInterface* GetDeviceInterface(DeviceType type) { diff --git a/src/models/input_ids.h b/src/models/input_ids.h index cc9ab5640..3388f0cd8 100644 --- a/src/models/input_ids.h +++ b/src/models/input_ids.h @@ -21,7 +21,7 @@ struct DefaultInputIDs : InputIDs { void Add() override; // Resize input_ids based on size of next_tokens. // Update value with next_tokens. - void Update(DeviceSpan next_tokens); + void Update(DeviceSpan next_tokens) override; std::array GetShape() const override { return shape_; } const char* name_; diff --git a/src/models/model.cpp b/src/models/model.cpp index fd552b7c8..4009481cc 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -15,9 +15,7 @@ #include "whisper.h" #include "multi_modal_vision_model.h" #include "decoder_only_pipeline.h" -#if USE_DML #include "../dml/interface.h" -#endif namespace Generators { @@ -425,7 +423,6 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size())); session_options.AppendExecutionProvider_ROCM(ort_provider_options); -#if USE_DML } else if (provider_options.name == "dml") { if (!GetDmlInterface()) { LUID device_luid{}; @@ -452,7 +449,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ if (is_primary_session_options) device_type_ = DeviceType::DML; // We use a DML allocator for input/output caches, but other tensors will use CPU tensors -#endif + } else if (provider_options.name == "qnn") { session_options.AddConfigEntry("ep.share_ep_contexts", "1"); std::unordered_map opts; From 5244049dc15dbbec7b65a85ca7abcd59501d26ca Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 16 Jan 2025 21:21:24 -0800 Subject: [PATCH 07/31] Build fix --- src/dml/interface.h | 7 +++++-- src/models/model.cpp | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/dml/interface.h b/src/dml/interface.h index 5c43aa11b..e3f2afdae 100644 --- a/src/dml/interface.h +++ b/src/dml/interface.h @@ -2,7 +2,10 @@ // Licensed under the MIT License. 
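// LUID is a Windows SDK type, so non-Windows builds need a stand-in for the
// InitDmlInterface(LUID*) declaration below to compile. The hunk below replaces the earlier
// `using LUID = void;` alias with a layout-matching struct, most likely because model.cpp now
// declares a LUID by value (`LUID device_luid{};`), which a void alias cannot satisfy.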
#ifndef _WIN32 -using LUID = void; +typedef struct _LUID { + DWORD LowPart; + LONG HighPart; +} LUID, *PLUID; #endif namespace Generators { @@ -12,4 +15,4 @@ void SetDmlProvider(OrtSessionOptions& options); DeviceInterface* GetDmlInterface(); -} // namespace Generators \ No newline at end of file +} // namespace Generators diff --git a/src/models/model.cpp b/src/models/model.cpp index 4009481cc..71c876fba 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -423,6 +423,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size())); session_options.AppendExecutionProvider_ROCM(ort_provider_options); +#if USE_DML } else if (provider_options.name == "dml") { if (!GetDmlInterface()) { LUID device_luid{}; @@ -449,7 +450,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ if (is_primary_session_options) device_type_ = DeviceType::DML; // We use a DML allocator for input/output caches, but other tensors will use CPU tensors - +#endif } else if (provider_options.name == "qnn") { session_options.AddConfigEntry("ep.share_ep_contexts", "1"); std::unordered_map opts; From 49b51ef1431232cb0f4b0ac5f0f05f48a32026cf Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 16 Jan 2025 21:33:46 -0800 Subject: [PATCH 08/31] Build fix --- src/dml/interface.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dml/interface.h b/src/dml/interface.h index e3f2afdae..9ef2c5785 100644 --- a/src/dml/interface.h +++ b/src/dml/interface.h @@ -3,8 +3,8 @@ #ifndef _WIN32 typedef struct _LUID { - DWORD LowPart; - LONG HighPart; + uint32_t LowPart; + int32_t HighPart; } LUID, *PLUID; #endif From d3db2f6673fc2283d383e452410a98eb756d9465 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 21 Jan 2025 14:06:51 -0800 Subject: [PATCH 09/31] Fix input_ids issue from merge --- src/models/input_ids.cpp | 9 ++------- src/models/input_ids.h | 3 --- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 38bdba445..a3a988608 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -7,7 +7,7 @@ namespace Generators { DefaultInputIDs::DefaultInputIDs(State& state) : state_{state} { name_ = model_.config_->model.decoder.inputs.input_ids.c_str(); - shape_ = {state_.params_->search.batch_size, 0}; + shape_ = {state_.params_->BatchBeamSize(), 0}; type_ = model_.session_info_->GetInputDataType(name_); if (model_.session_info_->HasInput(model_.config_->model.decoder.inputs.current_sequence_length) && @@ -71,12 +71,7 @@ void DefaultInputIDs::Update(DeviceSpan new_tokens) { if (static_cast(shape_[1]) != sequence_length) { shape_[1] = sequence_length; - if (!sb_input_ids_) { - value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_); - } else { - value_ = sb_input_ids_->CreateTensorOnStaticBuffer(shape_, Ort::TypeToTensorType); - } - + value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_); state_.inputs_[input_index_] = value_.get(); } diff --git a/src/models/input_ids.h b/src/models/input_ids.h index 3388f0cd8..fd4b159e6 100644 --- a/src/models/input_ids.h +++ b/src/models/input_ids.h @@ -40,9 +40,6 @@ struct DefaultInputIDs : InputIDs { std::unique_ptr value_; std::unique_ptr cast_value_; - // Used for decoding runs with cuda graphs. 
- StaticBuffer* sb_input_ids_{}; - std::unique_ptr current_sequence_length_; std::unique_ptr past_sequence_length_; }; From 133d5a09698a7b2c665035abfc2f227aadde3c5d Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 21 Jan 2025 16:56:59 -0800 Subject: [PATCH 10/31] Fix C# unit tests --- test/csharp/TestOnnxRuntimeGenAIAPI.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index ddced9e42..1a3256625 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -8,10 +8,11 @@ using System.Runtime.CompilerServices; using Xunit; using Xunit.Abstractions; +using Microsoft.ML.OnnxRuntimeGenAI; namespace Microsoft.ML.OnnxRuntimeGenAI.Tests { - public class OnnxRuntimeGenAITests + public class OnnxRuntimeGenAITests : IDisposable { private readonly ITestOutputHelper output; @@ -86,12 +87,17 @@ private static string GetDirectoryInTreeThatContains(string currentDirectory, st }); private static string _adaptersPath => _lazyAdaptersPath.Value; - + private OgaHandle ogaHandle; public OnnxRuntimeGenAITests(ITestOutputHelper o) { + this.ogaHandle = new OgaHandle(); this.output = o; } + public void Dispose() + { + ogaHandle?.Dispose(); + } private class IgnoreOnModelAbsenceFact : FactAttribute { From b079b7407b64557b3c613eb0cd61ccd9dfc7e21e Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 21 Jan 2025 20:51:21 -0800 Subject: [PATCH 11/31] Try again to fix C# test --- test/csharp/TestOnnxRuntimeGenAIAPI.cs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 1a3256625..34baf64e7 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -12,7 +12,7 @@ namespace Microsoft.ML.OnnxRuntimeGenAI.Tests { - public class OnnxRuntimeGenAITests : IDisposable + public class OnnxRuntimeGenAITests { private readonly ITestOutputHelper output; @@ -87,17 +87,14 @@ private static string GetDirectoryInTreeThatContains(string currentDirectory, st }); private static string _adaptersPath => _lazyAdaptersPath.Value; - private OgaHandle ogaHandle; + private static OgaHandle ogaHandle; public OnnxRuntimeGenAITests(ITestOutputHelper o) { - this.ogaHandle = new OgaHandle(); + ogaHandle = new OgaHandle(); + AppDomain.CurrentDomain.ProcessExit += (sender, e) => ogaHandle.Dispose(); this.output = o; } - public void Dispose() - { - ogaHandle?.Dispose(); - } private class IgnoreOnModelAbsenceFact : FactAttribute { From afecf1dd48b3d17f02910da0c8b7d5c735574a6c Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 22 Jan 2025 16:19:11 -0800 Subject: [PATCH 12/31] Test theory --- src/generators.cpp | 2 ++ src/models/model.cpp | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/generators.cpp b/src/generators.cpp index 6fe0a8ae4..57fa43f67 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -50,6 +50,8 @@ OrtGlobals::OrtGlobals() auto arena_config = OrtArenaCfg::Create(0, -1, -1, -1); Ort::Allocator& allocator_cpu{Ort::Allocator::GetWithDefaultOptions()}; env_->CreateAndRegisterAllocator(allocator_cpu.GetInfo(), *arena_config); + + std::cerr << "OrtGlobals::OrtGlobals completed" << std::endl; } // Ensure Shutdown() has been called before process exit diff --git a/src/models/model.cpp b/src/models/model.cpp index 221786307..ea2626b5a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -225,9 +225,12 @@ 
Ort::Allocator* GetDeviceAllocator(OrtSession& session, DeviceType type) { auto& device = GetOrtGlobals()->allocator_device_[static_cast(type)]; if (!device) { static const char* device_type_names[static_cast(DeviceType::MAX)] = {"CPU", "Cuda", "DML", "WebGPU Buffer"}; + std::cerr << "GetDeviceAllocator: Creating device allocator for " << device_type_names[static_cast(type)] << std::endl; + auto memory_info = OrtMemoryInfo::Create(device_type_names[static_cast(type)], OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); device = Ort::Allocator::Create(session, *memory_info); - GetDeviceInterface(type)->InitOrt(*Ort::api, *device); + GetDeviceInterface(type)->InitOrt(*Ort::api, *device); // Necessary for any shared library providers so they can access Ort::api + std::cerr << "GetDeviceAllocator: Device created" << std::endl; } return device.get(); } From 1734f5c18b2cda02411b9e392431340cfe46d299 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 03:36:49 -0800 Subject: [PATCH 13/31] Test instrumenting --- test/csharp/TestOnnxRuntimeGenAIAPI.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index e9f825ddf..58ad56fcf 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -91,6 +91,10 @@ private static string GetDirectoryInTreeThatContains(string currentDirectory, st public OnnxRuntimeGenAITests(ITestOutputHelper o) { + Console.WriteLine("**** Running OnnxRuntimeGenAITests constructor"); + // Initialize GenAI and register a handler to dispose it on process exit + ogaHandle = new OgaHandle(); + AppDomain.CurrentDomain.ProcessExit += (sender, e) => ogaHandle.Dispose(); this.output = o; } @@ -575,6 +579,8 @@ public IgnoreOnAdaptersAbsentFact() [IgnoreOnAdaptersAbsentFact(DisplayName = "TestAdapters")] public void TestAdapters() { + Console.WriteLine("**** Running TestAdapters"); + string modelPath = _adaptersPath; string adapterPath = Path.Combine(modelPath, "adapters.onnx_adapter"); From 2bc83ebb7c3f75ca1ea725678f6e69dafd84f406 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 13:46:49 -0800 Subject: [PATCH 14/31] Crash investigation --- test/csharp/TestOnnxRuntimeGenAIAPI.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 58ad56fcf..747675a1a 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -8,7 +8,6 @@ using System.Runtime.CompilerServices; using Xunit; using Xunit.Abstractions; -using Microsoft.ML.OnnxRuntimeGenAI; namespace Microsoft.ML.OnnxRuntimeGenAI.Tests { @@ -96,6 +95,7 @@ public OnnxRuntimeGenAITests(ITestOutputHelper o) ogaHandle = new OgaHandle(); AppDomain.CurrentDomain.ProcessExit += (sender, e) => ogaHandle.Dispose(); this.output = o; + Console.WriteLine("**** OnnxRuntimeGenAI constructor completed"); } private class IgnoreOnModelAbsenceFact : FactAttribute From fd788d76aff72754c8632538592b1257d5cc0074 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 13:48:33 -0800 Subject: [PATCH 15/31] Extra debug logging --- src/generators.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/generators.cpp b/src/generators.cpp index 57fa43f67..c789e5bcc 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -37,6 +37,11 @@ void ThrowErrorIfSessionTerminated(bool is_session_terminated) { namespace Generators { +static bool _1 = []() 
{ + std::cerr << "OrtGlobals::OrtGlobals started" << std::endl; + return false; +}(); + static bool _ = (Ort::InitApi(), false); static OrtLoggingLevel GetDefaultOrtLoggingLevel() { @@ -47,6 +52,9 @@ static OrtLoggingLevel GetDefaultOrtLoggingLevel() { OrtGlobals::OrtGlobals() : env_{OrtEnv::Create(GetDefaultOrtLoggingLevel())} { + std::cerr << "OrtGlobals::OrtGlobals started" << std::endl; + + auto arena_config = OrtArenaCfg::Create(0, -1, -1, -1); Ort::Allocator& allocator_cpu{Ort::Allocator::GetWithDefaultOptions()}; env_->CreateAndRegisterAllocator(allocator_cpu.GetInfo(), *arena_config); From 0303592f461721963c98c11395e624c6a5e8e07f Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 18:17:04 -0800 Subject: [PATCH 16/31] Undefined behavior fix in startup --- src/cpu/interface.cpp | 8 +++++--- src/generators.cpp | 11 ++--------- src/models/model.cpp | 10 ++++++---- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/cpu/interface.cpp b/src/cpu/interface.cpp index c6705fa3b..0b34b80c4 100644 --- a/src/cpu/interface.cpp +++ b/src/cpu/interface.cpp @@ -44,7 +44,6 @@ struct CpuMemory final : DeviceBuffer { struct CpuInterface : DeviceInterface { CpuInterface() { - InitOrt(*Ort::api, Ort::Allocator::GetWithDefaultOptions()); } void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { @@ -101,8 +100,11 @@ struct CpuInterface : DeviceInterface { std::unique_ptr CreateBeam(const GeneratorParams& params) override { return std::make_unique(params); } void Synchronize() override {} // Nothing to do as CPU is always in sync with itself -} g_cpu; +}; -DeviceInterface* GetCpuInterface() { return &g_cpu; } +DeviceInterface* GetCpuInterface() { + static std::unique_ptr g_cpu = std::make_unique(); + return g_cpu.get(); +} } // namespace Generators diff --git a/src/generators.cpp b/src/generators.cpp index c789e5bcc..b003ac331 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -37,11 +37,6 @@ void ThrowErrorIfSessionTerminated(bool is_session_terminated) { namespace Generators { -static bool _1 = []() { - std::cerr << "OrtGlobals::OrtGlobals started" << std::endl; - return false; -}(); - static bool _ = (Ort::InitApi(), false); static OrtLoggingLevel GetDefaultOrtLoggingLevel() { @@ -52,14 +47,12 @@ static OrtLoggingLevel GetDefaultOrtLoggingLevel() { OrtGlobals::OrtGlobals() : env_{OrtEnv::Create(GetDefaultOrtLoggingLevel())} { - std::cerr << "OrtGlobals::OrtGlobals started" << std::endl; - - auto arena_config = OrtArenaCfg::Create(0, -1, -1, -1); Ort::Allocator& allocator_cpu{Ort::Allocator::GetWithDefaultOptions()}; env_->CreateAndRegisterAllocator(allocator_cpu.GetInfo(), *arena_config); - std::cerr << "OrtGlobals::OrtGlobals completed" << std::endl; + // Init the CPU device (special case because it always exists, and its allocator is special + GetDeviceInterface(DeviceType::CPU)->InitOrt(*Ort::api, allocator_cpu); } // Ensure Shutdown() has been called before process exit diff --git a/src/models/model.cpp b/src/models/model.cpp index ea2626b5a..b328bed11 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -222,15 +222,17 @@ int32_t Tokenizer::TokenToTokenId(const char* token) const { // has been destroyed. Without this, we will crash in the Onnxruntime BFCArena code when deleting tensors due to the // arena already being destroyed. 
Ort::Allocator* GetDeviceAllocator(OrtSession& session, DeviceType type) { + // CPU Allocator is a special case, so don't try to create it + if (type == DeviceType::CPU) + return &GetDeviceInterface(DeviceType::CPU)->GetAllocator(); + auto& device = GetOrtGlobals()->allocator_device_[static_cast(type)]; if (!device) { - static const char* device_type_names[static_cast(DeviceType::MAX)] = {"CPU", "Cuda", "DML", "WebGPU Buffer"}; - std::cerr << "GetDeviceAllocator: Creating device allocator for " << device_type_names[static_cast(type)] << std::endl; + static const char* device_type_names[static_cast(DeviceType::MAX)] = {"CPU - SEE ABOVE", "Cuda", "DML", "WebGPU Buffer"}; auto memory_info = OrtMemoryInfo::Create(device_type_names[static_cast(type)], OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); device = Ort::Allocator::Create(session, *memory_info); - GetDeviceInterface(type)->InitOrt(*Ort::api, *device); // Necessary for any shared library providers so they can access Ort::api - std::cerr << "GetDeviceAllocator: Device created" << std::endl; + GetDeviceInterface(type)->InitOrt(*Ort::api, *device); // Necessary for any shared library providers so they can access Ort::api } return device.get(); } From d87807c9629b35d4fb0314fd8b9953c3b05b37a4 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 20:40:43 -0800 Subject: [PATCH 17/31] Don't load cuda library outside of linux & windows --- src/generators.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index b003ac331..af2285336 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -11,7 +11,7 @@ #include "cuda/interface.h" #include "dml/interface.h" -#if _WIN32 +#if defined(_WIN32) EXTERN_C IMAGE_DOS_HEADER __ImageBase; std::string CurrentModulePath() { @@ -130,12 +130,14 @@ DeviceInterface* GetCudaInterface() { // Load the shared library onnxruntime-genai-cuda.dll // This is a workaround to avoid linking the CUDA library to the generator library // The CUDA library is only needed for the CUDA allocator -#ifdef _WIN32 +#if defined(_WIN32) static std::unique_ptr cuda_library{LoadLibrary((CurrentModulePath() + "onnxruntime-genai-cuda.dll").c_str()), [](void* h) { FreeLibrary(reinterpret_cast(h)); }}; -#else +#elif defined(__linux__) static std::unique_ptr cuda_library{dlopen((Ort::GetCurrentModuleDir() + "/libonnxruntime-genai-cuda.so").c_str(), RTLD_NOW | RTLD_DEEPBIND), [](void* h) { dlclose(h); }}; +#else + static std::unique_ptr cuda_library{nullptr, [](void* h) {}}; #endif if (!cuda_library) { From 2df5fe17390bcf0501badac172598a2fdd9fe72d Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 20:44:57 -0800 Subject: [PATCH 18/31] Fix iOS break --- src/generators.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index af2285336..984cd788f 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -146,10 +146,12 @@ DeviceInterface* GetCudaInterface() { Generators::DeviceInterface* GetInterface(GenaiInterface * p_genai); static DeviceInterface* cuda_interface{[] { -#ifdef _WIN32 +#if defined(_WIN32) auto get_cuda_fn = reinterpret_cast(GetProcAddress(reinterpret_cast(cuda_library.get()), "GetInterface")); -#else +#elif defined(__linux__) auto get_cuda_fn = reinterpret_cast(dlsym(cuda_library.get(), "GetInterface")); +#else + auto get_cuda_fn = [](GenaiInterface*) { return nullptr; }; #endif return get_cuda_fn(&g_genai); }()}; From 
67365174296873ed6603e30d9ab79f69fb9baf62 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 24 Jan 2025 20:59:24 -0800 Subject: [PATCH 19/31] Android tweak --- src/generators.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index 984cd788f..9aa561e9e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -133,7 +133,7 @@ DeviceInterface* GetCudaInterface() { #if defined(_WIN32) static std::unique_ptr cuda_library{LoadLibrary((CurrentModulePath() + "onnxruntime-genai-cuda.dll").c_str()), [](void* h) { FreeLibrary(reinterpret_cast(h)); }}; -#elif defined(__linux__) +#elif defined(__linux__) && !defined(__ANDROID__) static std::unique_ptr cuda_library{dlopen((Ort::GetCurrentModuleDir() + "/libonnxruntime-genai-cuda.so").c_str(), RTLD_NOW | RTLD_DEEPBIND), [](void* h) { dlclose(h); }}; #else @@ -148,7 +148,7 @@ DeviceInterface* GetCudaInterface() { static DeviceInterface* cuda_interface{[] { #if defined(_WIN32) auto get_cuda_fn = reinterpret_cast(GetProcAddress(reinterpret_cast(cuda_library.get()), "GetInterface")); -#elif defined(__linux__) +#elif defined(__linux__) && !defined(__ANDROID__) auto get_cuda_fn = reinterpret_cast(dlsym(cuda_library.get(), "GetInterface")); #else auto get_cuda_fn = [](GenaiInterface*) { return nullptr; }; From a011fe0dc6f5fc0078343c75237dca27e500bda3 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Sun, 26 Jan 2025 18:06:18 -0800 Subject: [PATCH 20/31] Leftover #ifdef fix --- test/c_api_tests.cpp | 4 ---- test/sampling_benchmark.cpp | 3 --- test/sampling_tests.cpp | 3 --- test/tests_helper.cu | 4 ++-- test/tests_helper.cuh | 4 ++-- 5 files changed, 4 insertions(+), 14 deletions(-) diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index c91c69733..8d9d07522 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -31,10 +31,6 @@ TEST(CAPITests, Config) { config->AppendProvider("cuda"); #endif } - -#undef USE_CUDA -#define USE_CUDA 0 - TEST(CAPITests, TokenizerCAPI) { #if TEST_PHI2 auto config = OgaConfig::Create(PHI2_PATH); diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index c15478009..eb7f04cd9 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -14,9 +14,6 @@ #define MODEL_PATH "../../test/test_models/" #endif -#undef USE_CUDA -#define USE_CUDA 0 - // Defined in sampling_tests.cpp void CreateRandomLogits(float* logits, int num_large, int vocab_size, int batch_size, std::mt19937& engine); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 188367088..6ca0080eb 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -13,9 +13,6 @@ #define MODEL_PATH "../../test/test_models/" #endif -#undef USE_CUDA -#define USE_CUDA 0 - template auto AllocateFromCpuMem(Generators::DeviceInterface& device, std::span cpu_memory) { auto memory = device.Allocate(cpu_memory.size()); diff --git a/test/tests_helper.cu b/test/tests_helper.cu index c15539d02..b6a3c9d5d 100644 --- a/test/tests_helper.cu +++ b/test/tests_helper.cu @@ -21,10 +21,10 @@ __global__ void GeometricDecayKernel(float* logits, int vocab_size, int num_larg } } -void LaunchGeometricDecayKernel(float* logits, int vocab_size, int batch_size, int num_large, float large_val, cudaStream_t stream) { +void LaunchGeometricDecayKernel(float* logits, int vocab_size, int batch_size, int num_large, float large_val, void* stream) { int num_threads = 256; int num_blocks = batch_size; - GeometricDecayKernel<<>>(logits, vocab_size, num_large, large_val); + 
GeometricDecayKernel<<(stream)>>>(logits, vocab_size, num_large, large_val); } __global__ void FisherYatesKernel(float* logits, int* indices, int vocab_size, curandState* states) { diff --git a/test/tests_helper.cuh b/test/tests_helper.cuh index ff0f3f319..d4dd48a92 100644 --- a/test/tests_helper.cuh +++ b/test/tests_helper.cuh @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -void LaunchGeometricDecayKernel(float* logits, int vocab_size, int batch_size, int num_large, float large_val, cudaStream_t stream); -void LaunchFisherYatesKernel(float* logits, int* indices, int vocab_size, int batch_size, cudaStream_t stream); \ No newline at end of file +void LaunchGeometricDecayKernel(float* logits, int vocab_size, int batch_size, int num_large, float large_val, void* stream); +void LaunchFisherYatesKernel(float* logits, int* indices, int vocab_size, int batch_size, void* stream); \ No newline at end of file From c11704f35cbd6232e9662a124e58d0587e5dd3da Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Sun, 26 Jan 2025 18:13:35 -0800 Subject: [PATCH 21/31] Type tweak --- test/sampling_tests.cpp | 12 ++++++------ test/tests_helper.cu | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 6ca0080eb..a42609306 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -417,8 +417,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); auto generator = Generators::CreateGenerator(*model, *params); - LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); - LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->cuda_stream); + LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->p_device->GetCudaStream()); + LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->p_device->GetCudaStream()); generator->SetLogits(logits_gpu); generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().CopyDeviceToCpu(); @@ -522,8 +522,8 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); auto generator = Generators::CreateGenerator(*model, *params); - LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); - LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->cuda_stream); + LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->p_device->GetCudaStream()); + LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->p_device->GetCudaStream()); generator->SetLogits(logits_gpu); generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().CopyDeviceToCpu(); @@ -558,8 +558,8 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); - 
LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->cuda_stream); + LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->p_device->GetCudaStream()); + LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->p_device->GetCudaStream()); auto generator = Generators::CreateGenerator(*model, *params); generator->SetLogits(logits_gpu); generator->GenerateNextToken(); diff --git a/test/tests_helper.cu b/test/tests_helper.cu index b6a3c9d5d..28f97b0e3 100644 --- a/test/tests_helper.cu +++ b/test/tests_helper.cu @@ -74,13 +74,13 @@ void LaunchPopulateIndices(int* indices, int size, int batch_size, cudaStream_t PopulateIndices<<>>(indices, size, batch_size); } -void LaunchFisherYatesKernel(float* logits, int* indices_buffer, int vocab_size, int batch_size, cudaStream_t stream) { +void LaunchFisherYatesKernel(float* logits, int* indices_buffer, int vocab_size, int batch_size, void* stream) { int num_threads = 256; int num_blocks = batch_size; curandState *random_states; cudaMalloc((void **)&random_states, num_threads * sizeof(curandState)); std::span logits_span{logits, static_cast(vocab_size * batch_size)}; std::span indices{indices_buffer, static_cast(vocab_size * batch_size)}; - LaunchPopulateIndices(indices.data(), vocab_size, batch_size, stream); - FisherYatesKernel<<>>(logits_span.data(), indices.data(), vocab_size, random_states); + LaunchPopulateIndices(indices.data(), vocab_size, batch_size, static_cast(stream)); + FisherYatesKernel<<(stream)>>>(logits_span.data(), indices.data(), vocab_size, random_states); } From 45dad2b97ba68867c64dc2ce5c382e1aa655d15a Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 28 Jan 2025 11:51:15 -0800 Subject: [PATCH 22/31] Review feedback --- src/cuda/beam_search_scorer_cuda.cpp | 3 +++ src/cuda/beam_search_scorer_cuda.cu | 3 +++ src/cuda/beam_search_scorer_cuda.cuh | 3 +++ src/cuda/beam_search_scorer_cuda.h | 3 +++ src/cuda/beam_search_topk.cu | 3 +++ src/cuda/cuda_sampling.cuh | 1 + src/cuda/kernels.h | 1 + src/cuda/model_kernels.cu | 1 + src/cuda/search_cuda.cpp | 3 +++ src/cuda/search_cuda.cu | 3 +++ src/cuda/search_cuda.cuh | 3 +++ src/cuda/search_cuda.h | 3 +++ src/generators.h | 1 + test/c_api_tests.cpp | 4 ++++ 14 files changed, 35 insertions(+) diff --git a/src/cuda/beam_search_scorer_cuda.cpp b/src/cuda/beam_search_scorer_cuda.cpp index e2295e00d..a59bc80bc 100644 --- a/src/cuda/beam_search_scorer_cuda.cpp +++ b/src/cuda/beam_search_scorer_cuda.cpp @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + #include "generators.h" #include "search.h" #include "search_cuda.h" diff --git a/src/cuda/beam_search_scorer_cuda.cu b/src/cuda/beam_search_scorer_cuda.cu index fa9148dcc..3add21bc1 100644 --- a/src/cuda/beam_search_scorer_cuda.cu +++ b/src/cuda/beam_search_scorer_cuda.cu @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + #include #include #include diff --git a/src/cuda/beam_search_scorer_cuda.cuh b/src/cuda/beam_search_scorer_cuda.cuh index 9e441d4bd..7a8834b69 100644 --- a/src/cuda/beam_search_scorer_cuda.cuh +++ b/src/cuda/beam_search_scorer_cuda.cuh @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ #include "models/onnxruntime_api.h" #include "smartptrs.h" diff --git a/src/cuda/beam_search_scorer_cuda.h b/src/cuda/beam_search_scorer_cuda.h index 8b23a4225..7ec208485 100644 --- a/src/cuda/beam_search_scorer_cuda.h +++ b/src/cuda/beam_search_scorer_cuda.h @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + namespace Generators { struct BeamSearchScorer_Cuda { diff --git a/src/cuda/beam_search_topk.cu b/src/cuda/beam_search_topk.cu index 222561ce8..32da76fa2 100644 --- a/src/cuda/beam_search_topk.cu +++ b/src/cuda/beam_search_topk.cu @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + #include #include #include diff --git a/src/cuda/cuda_sampling.cuh b/src/cuda/cuda_sampling.cuh index 390ff92bc..529d2e65a 100644 --- a/src/cuda/cuda_sampling.cuh +++ b/src/cuda/cuda_sampling.cuh @@ -1,5 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. + #include #include "cuda_common.h" #include diff --git a/src/cuda/kernels.h b/src/cuda/kernels.h index 99be3a416..860af48c8 100644 --- a/src/cuda/kernels.h +++ b/src/cuda/kernels.h @@ -1,5 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. + #pragma once namespace Generators { diff --git a/src/cuda/model_kernels.cu b/src/cuda/model_kernels.cu index 59b1f5431..23d9037e0 100644 --- a/src/cuda/model_kernels.cu +++ b/src/cuda/model_kernels.cu @@ -1,5 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. + #include #include #include diff --git a/src/cuda/search_cuda.cpp b/src/cuda/search_cuda.cpp index a53ff37dd..5bfbc6ac8 100644 --- a/src/cuda/search_cuda.cpp +++ b/src/cuda/search_cuda.cpp @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + #include "generators.h" #include "interface.h" #include "search.h" diff --git a/src/cuda/search_cuda.cu b/src/cuda/search_cuda.cu index f8c9ed3bf..fcb21200d 100644 --- a/src/cuda/search_cuda.cu +++ b/src/cuda/search_cuda.cu @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + #include #include #include diff --git a/src/cuda/search_cuda.cuh b/src/cuda/search_cuda.cuh index f21237c41..a0a07fb9f 100644 --- a/src/cuda/search_cuda.cuh +++ b/src/cuda/search_cuda.cuh @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + namespace Generators { namespace cuda { diff --git a/src/cuda/search_cuda.h b/src/cuda/search_cuda.h index bb31b4423..acdd0525f 100644 --- a/src/cuda/search_cuda.h +++ b/src/cuda/search_cuda.h @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ #pragma once #include #include "search_cuda.cuh" diff --git a/src/generators.h b/src/generators.h index 13b3fe617..6d2865a37 100644 --- a/src/generators.h +++ b/src/generators.h @@ -167,6 +167,7 @@ std::shared_ptr CreateGeneratorParams(const Model& model); std::shared_ptr CreateGeneratorParams(const Config& config); // For benchmarking purposes only std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params); +// Fallback to copy between two separate device buffers by going through CPU memory (slow unless we're the CPU device) void CopyThroughCpu(DeviceBuffer& dest, size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes); float Float16ToFloat32(uint16_t v); // v is a IEEE 752-2008 binary16 format, 1 sign bit, 5 bit exponent, 10 bit fraction diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 8d9d07522..56e40a2c1 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -1,3 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + #include #include #include @@ -31,6 +34,7 @@ TEST(CAPITests, Config) { config->AppendProvider("cuda"); #endif } + TEST(CAPITests, TokenizerCAPI) { #if TEST_PHI2 auto config = OgaConfig::Create(PHI2_PATH); From 53c666c1013223b731c18d32e7bc0f7562ca1ab7 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 29 Jan 2025 12:51:08 -0800 Subject: [PATCH 23/31] Edward gave me ideas. --- src/models/model.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/models/model.cpp b/src/models/model.cpp index b328bed11..e3387f9cb 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -228,10 +228,14 @@ Ort::Allocator* GetDeviceAllocator(OrtSession& session, DeviceType type) { auto& device = GetOrtGlobals()->allocator_device_[static_cast(type)]; if (!device) { - static const char* device_type_names[static_cast(DeviceType::MAX)] = {"CPU - SEE ABOVE", "Cuda", "DML", "WebGPU Buffer"}; + static const char* device_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QNN (Not used, uses CPU memory)"}; + static_assert(std::size(device_type_names) == static_cast(DeviceType::MAX)); - auto memory_info = OrtMemoryInfo::Create(device_type_names[static_cast(type)], OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + auto name = device_type_names[static_cast(type)]; + auto memory_info = OrtMemoryInfo::Create(name, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); device = Ort::Allocator::Create(session, *memory_info); + if (!device) + throw std::runtime_error("Unexpected failure to create device memory allocator for " + std::string(name)); GetDeviceInterface(type)->InitOrt(*Ort::api, *device); // Necessary for any shared library providers so they can access Ort::api } return device.get(); From e8046976c71bbd19e27b5b53d8b231963e345e63 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 29 Jan 2025 18:14:28 -0800 Subject: [PATCH 24/31] Clean up allocators, now everything is through p_device_* interfaces. 
--- cmake/global_variables.cmake | 4 ++ src/csharp/NativeMethods.cs | 3 +- src/cuda/interface.cpp | 6 +-- src/dml/interface.cpp | 14 ++--- src/generators.cpp | 6 +++ src/models/adapters.cpp | 6 +-- src/models/decoder_only_pipeline.cpp | 4 +- src/models/embeddings.cpp | 4 +- src/models/image_features.cpp | 4 +- src/models/input_ids.cpp | 6 +-- src/models/kv_cache.cpp | 58 ++++++++++---------- src/models/kv_cache.h | 13 +++++ src/models/logits.cpp | 18 +++---- src/models/model.cpp | 55 +++++++++---------- src/models/model.h | 9 ++-- src/models/position_inputs.cpp | 14 ++--- src/models/whisper.cpp | 8 +-- src/qnn/interface.cpp | 79 ++++++++++++++++++++++++++++ src/qnn/interface.h | 8 +++ src/webgpu/interface.cpp | 79 ++++++++++++++++++++++++++++ src/webgpu/interface.h | 8 +++ 21 files changed, 300 insertions(+), 106 deletions(-) create mode 100644 src/qnn/interface.cpp create mode 100644 src/qnn/interface.h create mode 100644 src/webgpu/interface.cpp create mode 100644 src/webgpu/interface.h diff --git a/cmake/global_variables.cmake b/cmake/global_variables.cmake index 9145bdb3c..b05731ee5 100644 --- a/cmake/global_variables.cmake +++ b/cmake/global_variables.cmake @@ -66,6 +66,10 @@ file(GLOB generator_srcs CONFIGURE_DEPENDS "${GENERATORS_ROOT}/*.cpp" "${GENERATORS_ROOT}/cpu/*.h" "${GENERATORS_ROOT}/cpu/*.cpp" + "${GENERATORS_ROOT}/qnn/*.h" + "${GENERATORS_ROOT}/qnn/*.cpp" + "${GENERATORS_ROOT}/webgpu/*.h" + "${GENERATORS_ROOT}/webgpu/*.cpp" "${MODELS_ROOT}/*.h" "${MODELS_ROOT}/*.cpp" ) diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 4dd11169e..e53da7421 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -17,7 +17,8 @@ internal class NativeLib // define the library name required for iOS internal const string DllName = "__Internal"; #else - internal const string DllName = "onnxruntime-genai"; +// internal const string DllName = "C:\\code\\onnxruntime-genai2\\build\\Windows\\Debug\\Debug\\onnxruntime-genai"; + internal const string DllName = "C:\\code\\onnxruntime-genai2\\build\\Windows\\RelWithDebInfo\\RelWithDebInfo\\onnxruntime-genai"; #endif } diff --git a/src/cuda/interface.cpp b/src/cuda/interface.cpp index 4c85379b0..2638d0e4b 100644 --- a/src/cuda/interface.cpp +++ b/src/cuda/interface.cpp @@ -13,7 +13,7 @@ namespace Generators { GenaiInterface* gp_genai{}; Ort::Allocator* ort_allocator_{}; -const char* label_cuda = "cuda"; +const char* device_label = "cuda"; cuda_stream_holder g_stream; cudaStream_t GetStream() { return g_stream.get(); } @@ -36,7 +36,7 @@ struct GpuMemory final : DeviceBuffer { ::cudaFreeHost(p_cpu_); } - const char* GetType() const override { return label_cuda; } + const char* GetType() const override { return device_label; } void AllocateCpu() override { if (!p_cpu_) @@ -55,7 +55,7 @@ struct GpuMemory final : DeviceBuffer { } void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { - if (source.GetType() == label_cuda) + if (source.GetType() == device_label) ::cudaMemcpyAsync(p_device_ + begin_dest, source.p_device_ + begin_source, size_in_bytes, ::cudaMemcpyDeviceToDevice, GetStream()); else gp_genai->CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes); diff --git a/src/dml/interface.cpp b/src/dml/interface.cpp index 58a3457e2..a16d770d5 100644 --- a/src/dml/interface.cpp +++ b/src/dml/interface.cpp @@ -20,7 +20,7 @@ namespace Generators { namespace Dml { // If this was in a shared library it wouldn't need to be in its own namespace 
Ort::Allocator* ort_allocator_{}; -const char* label_dml = "dml"; +const char* device_label = "dml"; wil::unique_hmodule smart_directml_dll_; DmlObjects dml_objects_; @@ -50,7 +50,7 @@ struct GpuMemory final : DeviceBuffer { free(p_cpu_); } - const char* GetType() const override { return label_dml; } + const char* GetType() const override { return device_label; } void AllocateCpu() override { if (!p_cpu_) @@ -69,7 +69,7 @@ struct GpuMemory final : DeviceBuffer { } void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { - if (source.GetType() == label_dml) { + if (source.GetType() == device_label) { auto& source_gpu = dynamic_cast(source); dml_execution_context_->CopyBufferRegion( gpu_resource_.Get(), @@ -94,8 +94,8 @@ struct GpuMemory final : DeviceBuffer { bool owned_; // If we own the memory, we delete it on destruction }; -struct DmlInterfaceImpl : DeviceInterface { - DmlInterfaceImpl(LUID* p_device_luid) { +struct InterfaceImpl : DeviceInterface { + InterfaceImpl(LUID* p_device_luid) { Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&dml_api_))); if (!dml_api_) { throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); @@ -209,11 +209,11 @@ struct DmlInterfaceImpl : DeviceInterface { } // namespace Dml -std::unique_ptr g_dml_device; +std::unique_ptr g_dml_device; void InitDmlInterface(LUID* p_device_luid) { if (!g_dml_device) - g_dml_device = std::make_unique(p_device_luid); + g_dml_device = std::make_unique(p_device_luid); } void SetDmlProvider(OrtSessionOptions& session_options) { diff --git a/src/generators.cpp b/src/generators.cpp index 9aa561e9e..66f49ee64 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -10,6 +10,8 @@ #include "cpu/interface.h" #include "cuda/interface.h" #include "dml/interface.h" +#include "qnn/interface.h" +#include "webgpu/interface.h" #if defined(_WIN32) EXTERN_C IMAGE_DOS_HEADER __ImageBase; @@ -187,6 +189,10 @@ DeviceInterface* GetDeviceInterface(DeviceType type) { case DeviceType::DML: return GetDmlInterface(); #endif + case DeviceType::WEBGPU: + return GetWebGPUInterface(); + case DeviceType::QNN: + return GetQNNInterface(); } } diff --git a/src/models/adapters.cpp b/src/models/adapters.cpp index 3840c9b88..5a95ebdd9 100644 --- a/src/models/adapters.cpp +++ b/src/models/adapters.cpp @@ -34,9 +34,9 @@ void Adapters::LoadAdapter(const char* adapter_file_path, const std::string& ada } adapters_.emplace(adapter_name, std::make_unique(adapter_file_path, - model_->allocator_device_ == &model_->allocator_cpu_ - ? nullptr - : model_->allocator_device_)); + model_->device_type_ == DeviceType::CUDA + ? 
&model_->p_device_->GetAllocator() + : nullptr)); } void Adapters::UnloadAdapter(const std::string& adapter_name) { diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp index b18fea179..c8c43c2b1 100644 --- a/src/models/decoder_only_pipeline.cpp +++ b/src/models/decoder_only_pipeline.cpp @@ -12,7 +12,7 @@ DecoderOnlyPipelineModel::DecoderOnlyPipelineModel(std::unique_ptr confi sessions_.emplace_back(OrtSession::Create(ort_env, (config_->config_path / fs::path(model.filename)).c_str(), GetSessionOptions(model.model_id))); - if (!allocator_device_ && model.session_options.has_value()) { + if (!p_device_inputs_ && model.session_options.has_value()) { const auto& provider_options = (*model.session_options).provider_options; if (std::any_of(provider_options.begin(), provider_options.end(), [](const auto& elem) { return !elem.name.empty(); })) { @@ -21,7 +21,7 @@ DecoderOnlyPipelineModel::DecoderOnlyPipelineModel(std::unique_ptr confi } } - if (!allocator_device_) { + if (!p_device_inputs_) { // If the device allocator has not been created, it implies all // sessions are configured to run on CPU. // Pick any session to create the device allocator. diff --git a/src/models/embeddings.cpp b/src/models/embeddings.cpp index 01b494078..4b38db207 100644 --- a/src/models/embeddings.cpp +++ b/src/models/embeddings.cpp @@ -25,7 +25,7 @@ Embeddings::Embeddings(State& state, Embeddings::Mode mode, const std::string& n sb_embeddings_ = state_.GetCapturedGraphInfo()->sb_embeddings_.get(); } - embeddings_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); + embeddings_ = OrtValue::CreateTensor(model_.p_device_->GetAllocator(), shape_, type_); } } @@ -54,7 +54,7 @@ void Embeddings::UpdateSequenceLength(size_t new_length) { if (mode_ == Embeddings::Mode::Input) { if (!sb_embeddings_) { - embeddings_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); + embeddings_ = OrtValue::CreateTensor(model_.p_device_->GetAllocator(), shape_, type_); } else { embeddings_ = sb_embeddings_->CreateTensorOnStaticBuffer(shape_, type_); } diff --git a/src/models/image_features.cpp b/src/models/image_features.cpp index 71dee99fa..6f51abd0f 100644 --- a/src/models/image_features.cpp +++ b/src/models/image_features.cpp @@ -26,7 +26,7 @@ ImageFeatures::ImageFeatures(State& state, ImageFeatures::Mode mode, const std:: // 4) Created as an input for embedding model (num_image_tokens = 0) // The tensor does not need to be pre-allocated because it will be created during (2). 
if (mode == ImageFeatures::Mode::Output) { - image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); + image_features_ = OrtValue::CreateTensor(model_.p_device_->GetAllocator(), shape_, type_); } } @@ -50,7 +50,7 @@ void ImageFeatures::Update(bool is_prompt) { // num_image_tokens will be 0 when no image is provided if (!is_prompt && shape_[0] > 0) { // if num_image_tokens > 0 shape_[0] = 0; - image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); + image_features_ = OrtValue::CreateTensor(model_.p_device_->GetAllocator(), shape_, type_); state_.inputs_[index_] = image_features_.get(); } } diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index a3a988608..6a32a6233 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -71,12 +71,12 @@ void DefaultInputIDs::Update(DeviceSpan new_tokens) { if (static_cast(shape_[1]) != sequence_length) { shape_[1] = sequence_length; - value_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_); + value_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_); state_.inputs_[input_index_] = value_.get(); } // Update input_ids with next tokens - auto data_span = WrapTensor(*model_.p_device_, *value_); + auto data_span = WrapTensor(*model_.p_device_inputs_, *value_); // For beam search if (is_prompt_ && state_.params_->search.num_beams > 1) { @@ -91,7 +91,7 @@ void DefaultInputIDs::Update(DeviceSpan new_tokens) { } if (type_ == Ort::TypeToTensorType) { - Cast(*value_, cast_value_, *model_.p_device_, type_); + Cast(*value_, cast_value_, *model_.p_device_inputs_, type_); state_.inputs_[input_index_] = cast_value_.get(); } diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 51210ae4e..994e883ce 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -46,11 +46,11 @@ CombinedKeyValueCache::CombinedKeyValueCache(State& state) // Derive the KV data type from the KV input 0 type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]); - empty_past_ = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + empty_past_ = OrtValue::CreateTensor(Allocator(), shape_, type_); shape_[3] = 0; for (int i = 0; i < layer_count_; ++i) { - presents_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_)); + presents_.push_back(OrtValue::CreateTensor(Allocator(), shape_, type_)); } } @@ -82,7 +82,7 @@ void CombinedKeyValueCache::Update(DeviceSpan beam_indices, int total_l shape_[3] = total_length; for (int i = 0; i < layer_count_; i++) { - presents_[i] = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); state_.outputs_[output_index_ + i] = presents_[i].get(); } @@ -119,9 +119,9 @@ void CombinedKeyValueCache::RewindPastTensorsTo(size_t index) { for (int i = 0; i < layer_count_; i++) { OrtValue& present = *presents_[i]; - std::unique_ptr past = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); - auto present_span = WrapTensor(*model_.p_device_, present); - auto past_span = WrapTensor(*model_.p_device_, *past); + std::unique_ptr past = OrtValue::CreateTensor(Allocator(), shape_, type_); + auto present_span = WrapTensor(Device(), present); + auto past_span = WrapTensor(Device(), *past); for (int j = 0; j < 2 * batch_x_num_heads; j++) { auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size); @@ -141,10 +141,10 @@ void 
CombinedKeyValueCache::PickPastState(DeviceSpan beam_indices_devic auto past_key_size = shape_[1] * block_size_per_beam; OrtValue& present = *presents_[index]; - std::unique_ptr past = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_); + std::unique_ptr past = OrtValue::CreateTensor(Allocator(), shape_); - auto past_span = WrapTensor(*model_.p_device_, *past); - auto present_span = WrapTensor(*model_.p_device_, present); + auto past_span = WrapTensor(Device(), *past); + auto present_span = WrapTensor(Device(), present); for (size_t j = 0; j < beam_indices.size(); j++) { int32_t beam_index = beam_indices[j]; @@ -190,7 +190,7 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state) // Derive the KV data type from the KV input 0 type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]); - empty_past_ = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + empty_past_ = OrtValue::CreateTensor(Allocator(), shape_, type_); // Set the size after empty_past_ has been created with 0 for this field if (past_present_share_buffer_) { @@ -207,10 +207,10 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state) try { for (int i = 0; i < layer_count_ * 2; ++i) { presents_.push_back( - sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_) + sb_kv_caches_.empty() ? OrtValue::CreateTensor(Allocator(), shape_, type_) : sb_kv_caches_[i]->CreateTensorOnStaticBuffer(shape_, type_)); // Zero the memory so we don't leak any data from the previous run - ByteWrapTensor(*model_.p_device_, *presents_.back()).Zero(); + ByteWrapTensor(Device(), *presents_.back()).Zero(); } } catch (const Ort::Exception&) { std::ostringstream oss; @@ -269,7 +269,7 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le shape_[2] = total_length; for (int i = 0; i < layer_count_ * 2; i++) { - presents_[i] = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); state_.outputs_[output_index_ + i] = presents_[i].get(); } @@ -308,10 +308,10 @@ void DefaultKeyValueCache::RewindPastTensorsTo(size_t index) { for (int i = 0; i < layer_count_ * 2; i++) { OrtValue& present = *presents_[i]; - std::unique_ptr past = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_); + std::unique_ptr past = OrtValue::CreateTensor(Allocator(), shape_, type_); - auto past_span = WrapTensor(*model_.p_device_, *past); - auto present_span = WrapTensor(*model_.p_device_, present); + auto past_span = WrapTensor(Device(), *past); + auto present_span = WrapTensor(Device(), present); for (int j = 0; j < batch_x_num_heads; j++) { auto present_data = present_span.subspan(j * old_length_x_head_size, new_length_x_head_size); @@ -330,10 +330,10 @@ void DefaultKeyValueCache::PickPastState(DeviceSpan beam_indices_device auto block_size_per_beam = shape_[1] * shape_[2] * shape_[3]; OrtValue& present_value = *presents_[index]; - std::unique_ptr past_value = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_); + std::unique_ptr past_value = OrtValue::CreateTensor(Allocator(), shape_); - auto past_span = WrapTensor(*model_.p_device_, *past_value); - auto present_span = WrapTensor(*model_.p_device_, present_value); + auto past_span = WrapTensor(Device(), *past_value); + auto present_span = WrapTensor(Device(), present_value); for (size_t j = 0; j < beam_indices.size(); j++) { int32_t beam_index = beam_indices[j]; @@ -371,8 +371,8 @@ CrossCache::CrossCache(State& state) type_ = 
model_.session_info_->GetInputDataType(input_name_strings_[0]); for (int i = 0; i < layer_count_; ++i) { - values_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_)); - values_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_)); + values_.push_back(OrtValue::CreateTensor(Allocator(), shape_, type_)); + values_.push_back(OrtValue::CreateTensor(Allocator(), shape_, type_)); } } @@ -422,21 +422,21 @@ WindowedKeyValueCache::WindowedKeyValueCache(State& state) for (int i = 0; i < layer_count_; ++i) { key_caches_in_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, key_cache_shape_in_, type_)); + OrtValue::CreateTensor(Allocator(), key_cache_shape_in_, type_)); std::fill_n(key_caches_in_[i]->GetTensorMutableData(), ElementCountFromShape(key_cache_shape_in_), static_cast(model_.config_->model.decoder.sliding_window->pad_value)); value_caches_in_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, value_cache_shape_in_, type_)); + OrtValue::CreateTensor(Allocator(), value_cache_shape_in_, type_)); std::fill_n(value_caches_in_[i]->GetTensorMutableData(), ElementCountFromShape(value_cache_shape_in_), static_cast(model_.config_->model.decoder.sliding_window->pad_value)); key_caches_out_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, key_cache_shape_out_, type_)); + OrtValue::CreateTensor(Allocator(), key_cache_shape_out_, type_)); value_caches_out_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, value_cache_shape_out_, type_)); + OrtValue::CreateTensor(Allocator(), value_cache_shape_out_, type_)); } } @@ -548,7 +548,7 @@ void WindowedKeyValueCache::Update(DeviceSpan beam_indices, int current ThreadPool thread_pool{static_cast(layer_count_)}; thread_pool.Compute([&](size_t layer_idx) { - std::unique_ptr key_cache = OrtValue::CreateTensor(*model_.allocator_device_, updated_key_cache_shape_in, type_); + std::unique_ptr key_cache = OrtValue::CreateTensor(Allocator(), updated_key_cache_shape_in, type_); uint8_t* key_cache_data = key_cache->GetTensorMutableData(); uint8_t* key_cache_in_data = key_caches_in_[layer_idx]->GetTensorMutableData(); @@ -574,9 +574,9 @@ void WindowedKeyValueCache::Update(DeviceSpan beam_indices, int current } key_caches_in_[layer_idx] = std::move(key_cache); - key_caches_out_[layer_idx] = OrtValue::CreateTensor(*model_.allocator_device_, updated_key_cache_shape_out, type_); + key_caches_out_[layer_idx] = OrtValue::CreateTensor(Allocator(), updated_key_cache_shape_out, type_); - std::unique_ptr value_cache = OrtValue::CreateTensor(*model_.allocator_device_, updated_value_cache_shape_in, type_); + std::unique_ptr value_cache = OrtValue::CreateTensor(Allocator(), updated_value_cache_shape_in, type_); uint8_t* value_cache_data = value_cache->GetTensorMutableData(); uint8_t* value_cache_in_data = value_caches_in_[layer_idx]->GetTensorMutableData(); @@ -602,7 +602,7 @@ void WindowedKeyValueCache::Update(DeviceSpan beam_indices, int current } value_caches_in_[layer_idx] = std::move(value_cache); - value_caches_out_[layer_idx] = OrtValue::CreateTensor(*model_.allocator_device_, updated_value_cache_shape_out, type_); + value_caches_out_[layer_idx] = OrtValue::CreateTensor(Allocator(), updated_value_cache_shape_out, type_); }); window_size_ = 1; diff --git a/src/models/kv_cache.h b/src/models/kv_cache.h index 0e871d938..588e52045 100644 --- a/src/models/kv_cache.h +++ b/src/models/kv_cache.h @@ -30,6 +30,9 @@ struct CombinedKeyValueCache : KeyValueCache { template void 
RewindPastTensorsTo(size_t index); + DeviceInterface& Device() { return *model_.p_device_kvcache_; } + Ort::Allocator& Allocator() { return model_.p_device_kvcache_->GetAllocator(); } + State& state_; const Model& model_{state_.model_}; int layer_count_; @@ -64,6 +67,9 @@ struct DefaultKeyValueCache : KeyValueCache { template void RewindPastTensorsTo(size_t index); + DeviceInterface& Device() { return *model_.p_device_kvcache_; } + Ort::Allocator& Allocator() { return model_.p_device_kvcache_->GetAllocator(); } + State& state_; const Model& model_{state_.model_}; int layer_count_; @@ -89,6 +95,9 @@ struct CrossCache { void AddInputs(); private: + DeviceInterface& Device() { return *model_.p_device_kvcache_; } + Ort::Allocator& Allocator() { return model_.p_device_kvcache_->GetAllocator(); } + State& state_; const Model& model_{state_.model_}; int layer_count_; @@ -113,6 +122,10 @@ struct WindowedKeyValueCache : KeyValueCache { } private: + + DeviceInterface& Device() { return *model_.p_device_kvcache_; } + Ort::Allocator& Allocator() { return model_.p_device_kvcache_->GetAllocator(); } + void Slide(); State& state_; diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 4a7ac7f6a..48bdc4f32 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -10,11 +10,11 @@ Logits::Logits(State& state) : state_{state}, shape_{static_cast(state_.params_->BatchBeamSize()), 0, model_.config_->model.vocab_size}, type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} { - output_raw_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); + output_raw_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_, type_); if (model_.device_type_ == DeviceType::CUDA && !model_.config_->model.eos_token_ids.empty()) { auto& cpu_ids = model_.config_->model.eos_token_ids; - cuda_eos_token_ids_ = state_.params_->p_device->Allocate(cpu_ids.size()); + cuda_eos_token_ids_ = model_.p_device_->Allocate(cpu_ids.size()); copy(std::span{cpu_ids}, cuda_eos_token_ids_.CpuSpan()); cuda_eos_token_ids_.CopyCpuToDevice(); } @@ -34,18 +34,18 @@ DeviceSpan Logits::Get() { const size_t num_beams = state_.params_->search.num_beams; // create new OrtValue for logits_of_last_token and use output_last_tokens_ to hold it - output_last_tokens_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_last, type_); + output_last_tokens_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_last, type_); if (type_ == Ort::TypeToTensorType) - logits_of_last_token_fp32_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_); + logits_of_last_token_fp32_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_); logits_of_last_token = output_last_tokens_.get(); size_t element_size = SizeOf(type_); size_t vocab_index = 0; // Simpler math to have this index go up by vocab_size for every logit chunk we process - auto logits_raw = ByteWrapTensor(*state_.params_->p_device, *output_raw_); - auto logits_last_tokens = ByteWrapTensor(*state_.params_->p_device, *logits_of_last_token); + auto logits_raw = ByteWrapTensor(*model_.p_device_inputs_, *output_raw_); + auto logits_last_tokens = ByteWrapTensor(*model_.p_device_inputs_, *logits_of_last_token); for (int batch_index = 0; batch_index < state_.params_->search.batch_size; batch_index++) { // Find the first non pad token from the end @@ -63,12 +63,12 @@ DeviceSpan Logits::Get() { // Convert from float16 to float32 if necessary if (type_ == Ort::TypeToTensorType) { - 
Cast(*logits_of_last_token, logits_of_last_token_fp32_, *model_.p_device_, Ort::TypeToTensorType); + Cast(*logits_of_last_token, logits_of_last_token_fp32_, *model_.p_device_inputs_, Ort::TypeToTensorType); logits_of_last_token = logits_of_last_token_fp32_.get(); } if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) - logits_ = WrapTensor(*state_.params_->p_device, *logits_of_last_token); + logits_ = WrapTensor(*model_.p_device_inputs_, *logits_of_last_token); if (model_.device_type_ == DeviceType::CUDA) { if (!cuda_eos_token_ids_.empty()) @@ -108,7 +108,7 @@ void Logits::Update(const DeviceSpan& next_tokens, size_t new_kv_length shape_[1] = new_kv_length; StaticBuffer* sb_logits = type_ == Ort::TypeToTensorType ? sb_logits16_ : sb_logits32_; - output_raw_ = !sb_logits ? OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_) + output_raw_ = !sb_logits ? OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_, type_) : sb_logits->CreateTensorOnStaticBuffer(shape_, type_); if (state_.GetCapturedGraphInfo()) { diff --git a/src/models/model.cpp b/src/models/model.cpp index e3387f9cb..ad454f6c4 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -221,24 +221,24 @@ int32_t Tokenizer::TokenToTokenId(const char* token) const { // the allocator used is not destroyed until last. This keeps the allocator around until exit, after all other memory // has been destroyed. Without this, we will crash in the Onnxruntime BFCArena code when deleting tensors due to the // arena already being destroyed. -Ort::Allocator* GetDeviceAllocator(OrtSession& session, DeviceType type) { - // CPU Allocator is a special case, so don't try to create it +void EnsureDeviceOrtInit(OrtSession& session, DeviceType type) { + // CPU Allocator is a special case, it's not in the owned 'allocator_device_' table below so we handle it separately if (type == DeviceType::CPU) - return &GetDeviceInterface(DeviceType::CPU)->GetAllocator(); + return; auto& device = GetOrtGlobals()->allocator_device_[static_cast(type)]; - if (!device) { - static const char* device_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QNN (Not used, uses CPU memory)"}; - static_assert(std::size(device_type_names) == static_cast(DeviceType::MAX)); + if (device) + return; - auto name = device_type_names[static_cast(type)]; - auto memory_info = OrtMemoryInfo::Create(name, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); - device = Ort::Allocator::Create(session, *memory_info); - if (!device) - throw std::runtime_error("Unexpected failure to create device memory allocator for " + std::string(name)); - GetDeviceInterface(type)->InitOrt(*Ort::api, *device); // Necessary for any shared library providers so they can access Ort::api - } - return device.get(); + static const char* device_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared"}; + static_assert(std::size(device_type_names) == static_cast(DeviceType::MAX)); + + auto name = device_type_names[static_cast(type)]; + auto memory_info = OrtMemoryInfo::Create(name, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + device = Ort::Allocator::Create(session, *memory_info); + if (!device) + throw std::runtime_error("Unexpected failure to create device memory allocator for " + std::string(name)); + GetDeviceInterface(type)->InitOrt(*Ort::api, *device); // Necessary for any shared library providers so they can access Ort::api } 
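The EnsureDeviceOrtInit change above keeps one lazily created allocator per device type, with the CPU special-cased and a name table guarded by a static_assert. A minimal stand-alone sketch of that create-once pattern, using a stand-in FakeAllocator instead of the real OrtSession/Ort::Allocator types (illustration only, not part of the patch):

// Sketch of the lazy per-device-type allocator table used by EnsureDeviceOrtInit.
#include <array>
#include <iterator>
#include <memory>
#include <stdexcept>
#include <string>

enum struct DeviceType { CPU, CUDA, DML, WEBGPU, QNN, MAX };

struct FakeAllocator {  // stand-in for Ort::Allocator
  explicit FakeAllocator(std::string name) : name_{std::move(name)} {}
  std::string name_;
};

// One slot per device type; a slot stays empty until a session first needs that device.
static std::array<std::unique_ptr<FakeAllocator>, static_cast<size_t>(DeviceType::MAX)> g_allocators;

void EnsureDeviceInit(DeviceType type) {
  if (type == DeviceType::CPU)
    return;  // CPU is special-cased and never goes through the table

  auto& slot = g_allocators[static_cast<size_t>(type)];
  if (slot)
    return;  // already initialized for this process

  static const char* device_type_names[] = {"CPU (Not used)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared"};
  static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));

  slot = std::make_unique<FakeAllocator>(device_type_names[static_cast<size_t>(type)]);
  if (!slot)
    throw std::runtime_error("failed to create allocator");
}

int main() {
  EnsureDeviceInit(DeviceType::CUDA);
  EnsureDeviceInit(DeviceType::CUDA);  // second call is a no-op
}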
SessionInfo::SessionInfo(OrtSession& session) { @@ -301,18 +301,19 @@ Model::Model(std::unique_ptr config) : config_{std::move(config)} { Model::~Model() = default; void Model::InitDeviceAllocator(OrtSession& session) { - allocator_device_ = &allocator_cpu_; + EnsureDeviceOrtInit(session, device_type_); + + // Only CUDA does every input on the device if (device_type_ == DeviceType::CUDA) - allocator_device_ = GetDeviceAllocator(session, device_type_); + p_device_inputs_ = p_device_; + else + p_device_inputs_ = GetDeviceInterface(DeviceType::CPU); - allocator_kvcache_ = allocator_device_; - if (device_type_ == DeviceType::WEBGPU || device_type_ == DeviceType::DML) { - // for dml and webgpu we only use device memory for kv_cache - allocator_kvcache_ = GetDeviceAllocator(session, device_type_); - } + // The kvcache is always allocated in device memory + p_device_kvcache_ = p_device_; session_info_ = std::make_unique(session); - captured_graph_pool_ = std::make_shared(config_.get(), session_info_.get(), allocator_device_); + captured_graph_pool_ = std::make_shared(config_.get(), session_info_.get(), &p_device_->GetAllocator()); } void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_session_options, @@ -487,7 +488,6 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ throw std::runtime_error("Unknown provider type: " + provider_options.name); } - // If no device is set, create it, default to CPU if (!p_device_) { p_device_ = GetDeviceInterface(device_type_); } @@ -495,10 +495,6 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ void Model::CreateSessionOptions() { session_options_ = OrtSessionOptions::Create(); -#if 0 - ClearProviders(*config_); - SetProviderOption(*config_, "dml", {}, {}); -#endif CreateSessionOptionsFromConfig(config_->model.decoder.session_options, *session_options_, true, false); @@ -593,10 +589,9 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, input_shape[0] *= num_beams; - auto& allocator = device_type_ == DeviceType::DML ? 
allocator_cpu_ : *allocator_device_; - auto expanded = OrtValue::CreateTensor(allocator, input_shape, element_type); auto input_span = ByteWrapTensor(*GetDeviceInterface(DeviceType::CPU), *input); - auto expanded_span = ByteWrapTensor(*p_device_, *expanded); + auto expanded = OrtValue::CreateTensor(p_device_inputs_->GetAllocator(), input_shape, element_type); + auto expanded_span = ByteWrapTensor(*p_device_inputs_, *expanded); for (int i = 0; i < batch_size; i++) { for (int j = 0; j < num_beams; j++) { diff --git a/src/models/model.h b/src/models/model.h index 9635e79d2..ead0648a7 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -136,11 +136,12 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr config_; std::unique_ptr session_options_; - mutable DeviceInterface* p_device_{}; DeviceType device_type_{DeviceType::CPU}; - Ort::Allocator& allocator_cpu_{Ort::Allocator::GetWithDefaultOptions()}; - Ort::Allocator* allocator_device_{}; // Can be CUDA or CPU based on the DeviceType in the model - Ort::Allocator* allocator_kvcache_{}; // keep allocator for kv_cache seperate to allow that only kv_cache is on device + mutable DeviceInterface* p_device_{}; // The device we're running on (matches device_type_) used for things that work the same on all devices + mutable DeviceInterface* p_device_inputs_{}; // For some model inputs, the device might be the CPU device (all but KV cache currently) + mutable DeviceInterface* p_device_kvcache_{}; // The kvcache is always allocated in device memory (TODO: Remove in favor of just p_device_?) + + Ort::Allocator& allocator_cpu_{GetDeviceInterface(DeviceType::CPU)->GetAllocator()}; std::unique_ptr session_info_; diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 76372bd54..8546cfb8f 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -119,13 +119,13 @@ void DefaultPositionInputs::CreateNextPositionIDsTensor() { position_ids_ = std::move(position_ids_next_); position_ids_next_ = nullptr; } else { - position_ids_ = OrtValue::CreateTensor(*model_.allocator_device_, position_ids_shape_, type_); + position_ids_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), position_ids_shape_, type_); } } else { position_ids_ = sb_position_ids_->CreateTensorOnStaticBuffer(position_ids_shape_, type_); if (position_ids_shape_[1] == 1) { - auto position_ids_span = ByteWrapTensor(*model_.p_device_, *position_ids_); - auto position_ids_next_span = ByteWrapTensor(*model_.p_device_, *position_ids_next_); + auto position_ids_span = ByteWrapTensor(*model_.p_device_inputs_, *position_ids_); + auto position_ids_next_span = ByteWrapTensor(*model_.p_device_inputs_, *position_ids_next_); position_ids_span.CopyFrom(position_ids_next_span); } } @@ -143,7 +143,7 @@ void DefaultPositionInputs::UpdatePositionIDs(int total_length, int new_kv_lengt } if (model_.device_type_ == DeviceType::CUDA) - model_.p_device_->UpdatePositionIds(position_ids_->GetTensorMutableRawData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, type_); + model_.p_device_inputs_->UpdatePositionIds(position_ids_->GetTensorMutableRawData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, type_); else { type_ == Ort::TypeToTensorType ? 
UpdatePositionIDsImpl(total_length, new_kv_length) : UpdatePositionIDsImpl(total_length, new_kv_length); @@ -153,12 +153,12 @@ void DefaultPositionInputs::UpdatePositionIDs(int total_length, int new_kv_lengt void DefaultPositionInputs::CreateNextAttentionMaskTensor(int total_length) { if (!sb_attention_mask_) { attention_mask_shape_[1] = total_length; - attention_mask_next_ = OrtValue::CreateTensor(*model_.allocator_device_, attention_mask_shape_, type_); + attention_mask_next_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), attention_mask_shape_, type_); } else { attention_mask_shape_[1] = state_.params_->search.max_length; attention_mask_next_ = sb_attention_mask_->CreateTensorOnStaticBuffer(attention_mask_shape_, type_); if (is_first_mask_update_) { - ByteWrapTensor(*model_.p_device_, *attention_mask_next_).Zero(); + ByteWrapTensor(*model_.p_device_inputs_, *attention_mask_next_).Zero(); } } } @@ -175,7 +175,7 @@ void DefaultPositionInputs::UpdateAttentionMask(int total_length, int new_kv_len if (model_.device_type_ == DeviceType::CUDA) { int max_length = sb_attention_mask_ ? state_.params_->search.max_length : total_length; bool update_only = sb_attention_mask_ && !is_first_mask_update_; - model_.p_device_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), + model_.p_device_inputs_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), attention_mask_->GetTensorRawData(), static_cast(attention_mask_shape_[0]), new_kv_length, diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index 6c46f218d..6bb64df65 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -63,7 +63,7 @@ Whisper_State::Whisper_State(const Whisper_Model& model, DeviceSpan seq auto hidden_states_type = model_.session_info_->GetOutputDataType("encoder_hidden_states"); auto encoder_hidden_states_shape = std::array{decoder_input_ids_.GetShape()[0], 1500, static_cast(model_.config_->model.decoder.num_attention_heads) * model_.config_->model.decoder.head_size}; - encoder_hidden_states_ = OrtValue::CreateTensor(*model_.allocator_device_, encoder_hidden_states_shape, hidden_states_type); + encoder_hidden_states_ = OrtValue::CreateTensor(model_.p_device_->GetAllocator(), encoder_hidden_states_shape, hidden_states_type); auto sequence_lengths = sequence_lengths_unk.CpuSpan(); for (int i = 0; i < decoder_input_ids_.GetShape()[0]; i++) { @@ -90,7 +90,7 @@ Whisper_State::Whisper_State(const Whisper_Model& model, DeviceSpan seq auto type = model_.session_info_->GetOutputDataType(output_names_[kv_cache_indices]); for (int i = 0; i < layer_count * 2; i++) { - init_presents_.emplace_back(OrtValue::CreateTensor(*model_.allocator_device_, shape, type)); + init_presents_.emplace_back(OrtValue::CreateTensor(model_.p_device_->GetAllocator(), shape, type)); presents_.emplace_back(outputs_[kv_cache_indices + i]); outputs_[kv_cache_indices + i] = init_presents_.back().get(); } @@ -268,7 +268,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne } if (model_.session_info_->HasInput("cache_indirection")) { - cache_indirection_ = OrtValue::CreateTensor(*model_.allocator_device_, std::array{params_->search.batch_size, params_->search.num_beams, params_->search.max_length}); + cache_indirection_ = OrtValue::CreateTensor(model_.p_device_->GetAllocator(), std::array{params_->search.batch_size, params_->search.num_beams, params_->search.max_length}); cache_indirection_index_ = inputs_.size(); input_names_.push_back("cache_indirection"); 
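For the ExpandInputs hunks earlier in this series: when num_beams > 1 every batch row is repeated num_beams times so each beam starts from the same input. A stand-alone sketch of that expansion on plain CPU memory (std::vector stands in for the OrtValue/DeviceSpan copies the real code performs):

// Sketch of the beam expansion: [batch, seq] -> [batch * num_beams, seq].
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

std::vector<int> ExpandForBeams(const std::vector<int>& input, size_t batch_size, size_t seq_len, size_t num_beams) {
  std::vector<int> expanded(batch_size * num_beams * seq_len);
  for (size_t b = 0; b < batch_size; ++b) {
    for (size_t beam = 0; beam < num_beams; ++beam) {
      // Row b of the input becomes rows b*num_beams .. b*num_beams + num_beams - 1 of the output.
      std::copy(input.begin() + b * seq_len, input.begin() + (b + 1) * seq_len,
                expanded.begin() + (b * num_beams + beam) * seq_len);
    }
  }
  return expanded;
}

int main() {
  auto out = ExpandForBeams({1, 2, 3, 4}, /*batch_size*/ 2, /*seq_len*/ 2, /*num_beams*/ 3);
  std::printf("%zu rows\n", out.size() / 2);  // 6 rows: each batch row repeated 3 times
}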
inputs_.push_back(cache_indirection_.get()); @@ -284,7 +284,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne char string[64]; snprintf(string, std::size(string), "output_cross_qk_%d", i); output_cross_qk_names_.emplace_back(string); - output_cross_qk_.emplace_back(OrtValue::CreateTensor(*model_.allocator_device_, shape, type)); + output_cross_qk_.emplace_back(OrtValue::CreateTensor(model_.p_device_->GetAllocator(), shape, type)); output_names_.emplace_back(output_cross_qk_names_.back().c_str()); outputs_.emplace_back(output_cross_qk_.back().get()); diff --git a/src/qnn/interface.cpp b/src/qnn/interface.cpp new file mode 100644 index 000000000..3acc3b58d --- /dev/null +++ b/src/qnn/interface.cpp @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "../generators.h" +#include "../search.h" +#include "interface.h" + +namespace Generators { +namespace QNN { + +static Ort::Allocator* ort_allocator_{}; +const char* device_label = "qnn"; + +struct QnnMemory final : DeviceBuffer { + QnnMemory(size_t size) : owned_{true} { + size_in_bytes_ = size; + p_cpu_ = p_device_ = static_cast(ort_allocator_->Alloc(size_in_bytes_)); + } + + QnnMemory(void* p, size_t size) : owned_{false} { + size_in_bytes_ = size; + p_cpu_ = p_device_ = static_cast(p); + } + + ~QnnMemory() override { + if (owned_) + ort_allocator_->Free(p_device_); + } + + const char* GetType() const override { return device_label; } + void AllocateCpu() override {} // Nothing to do, device memory is CPU accessible + void CopyDeviceToCpu() override {} // Nothing to do, device memory is CPU accessible + void CopyCpuToDevice() override {} // Nothing to do, device memory is CPU accessible + void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { + CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes); + } + + void Zero() override { + memset(p_device_, 0, size_in_bytes_); + } + + bool owned_; +}; + +struct InterfaceImpl : DeviceInterface { + InterfaceImpl() { + } + + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { + assert(!ort_allocator_); + ort_allocator_ = &allocator; + } + + Ort::Allocator& GetAllocator() override { + return *ort_allocator_; + } + + std::shared_ptr AllocateBase(size_t size) override { + return std::make_shared(size); + } + + std::shared_ptr WrapMemoryBase(void* p, size_t size) override { + return std::make_shared(p, size); + } + + std::unique_ptr CreateGreedy(const GeneratorParams& params) override { return std::make_unique(params); } + std::unique_ptr CreateBeam(const GeneratorParams& params) override { return std::make_unique(params); } + + void Synchronize() override {} // Nothing to do +}; + +} // namespace QNN + +DeviceInterface* GetQNNInterface() { + static std::unique_ptr g_device = std::make_unique(); + return g_device.get(); +} + +} // namespace Generators diff --git a/src/qnn/interface.h b/src/qnn/interface.h new file mode 100644 index 000000000..fcbfe1f64 --- /dev/null +++ b/src/qnn/interface.h @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +namespace Generators { + +DeviceInterface* GetQNNInterface(); + +} // namespace Generators \ No newline at end of file diff --git a/src/webgpu/interface.cpp b/src/webgpu/interface.cpp new file mode 100644 index 000000000..246cfed50 --- /dev/null +++ b/src/webgpu/interface.cpp @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "../generators.h" +#include "../search.h" +#include "interface.h" + +namespace Generators { +namespace WebGPU { + +static Ort::Allocator* ort_allocator_{}; +const char* device_label = "WebGPU"; + +struct WebGPUMemory final : DeviceBuffer { + WebGPUMemory(size_t size) : owned_{true} { + size_in_bytes_ = size; + p_cpu_ = p_device_ = static_cast(ort_allocator_->Alloc(size_in_bytes_)); + } + + WebGPUMemory(void* p, size_t size) : owned_{false} { + size_in_bytes_ = size; + p_cpu_ = p_device_ = static_cast(p); + } + + ~WebGPUMemory() override { + if (owned_) + ort_allocator_->Free(p_device_); + } + + const char* GetType() const override { return device_label; } + void AllocateCpu() override { throw std::runtime_error("CPU can't access WebGPU memory"); } + void CopyDeviceToCpu() override { throw std::runtime_error("CPU can't access WebGPU memory"); } + void CopyCpuToDevice() override { throw std::runtime_error("CPU can't access WebGPU memory"); } + void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override { + throw std::runtime_error("CPU can't access WebGPU memory"); + } + + void Zero() override { + throw std::runtime_error("Zeroing not implemented for WebGPU memory"); + } + + bool owned_; +}; + +struct InterfaceImpl : DeviceInterface { + InterfaceImpl() { + } + + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { + assert(!ort_allocator_); + ort_allocator_ = &allocator; + } + + Ort::Allocator& GetAllocator() override { + return *ort_allocator_; + } + + std::shared_ptr AllocateBase(size_t size) override { + return std::make_shared(size); + } + + std::shared_ptr WrapMemoryBase(void* p, size_t size) override { + return std::make_shared(p, size); + } + + std::unique_ptr CreateGreedy(const GeneratorParams& params) override { return std::make_unique(params); } + std::unique_ptr CreateBeam(const GeneratorParams& params) override { return std::make_unique(params); } + + void Synchronize() override {} // Nothing to do? +}; + +} // namespace WebGPU + +DeviceInterface* GetWebGPUInterface() { + static std::unique_ptr g_device = std::make_unique(); + return g_device.get(); +} + +} // namespace Generators diff --git a/src/webgpu/interface.h b/src/webgpu/interface.h new file mode 100644 index 000000000..204b4dfed --- /dev/null +++ b/src/webgpu/interface.h @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
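The new QNN and WebGPU DeviceBuffer implementations above follow the same owned-versus-wrapped layout as CpuMemory: an owning constructor that allocates through the provider allocator and a wrapping constructor that only borrows caller memory. A minimal sketch of that layout with malloc/free standing in for the Ort::Allocator calls (illustration only):

// Sketch of the owned-vs-wrapped buffer pattern used by the DeviceBuffer implementations.
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct Buffer {
  // Owning constructor: allocates and later frees the memory (analogous to AllocateBase).
  explicit Buffer(size_t size) : owned_{true}, size_{size}, p_{static_cast<uint8_t*>(std::malloc(size))} {}
  // Wrapping constructor: borrows caller-owned memory and never frees it (analogous to WrapMemoryBase).
  Buffer(void* p, size_t size) : owned_{false}, size_{size}, p_{static_cast<uint8_t*>(p)} {}
  ~Buffer() {
    if (owned_) std::free(p_);
  }
  void Zero() { std::memset(p_, 0, size_); }

  bool owned_;
  size_t size_;
  uint8_t* p_;
};

int main() {
  Buffer owned{64};
  uint8_t external[32]{};
  Buffer wrapped{external, sizeof(external)};
  owned.Zero();
  wrapped.Zero();
}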
+ +namespace Generators { + +DeviceInterface* GetWebGPUInterface(); + +} // namespace Generators \ No newline at end of file From f8ed9ce52a9ac409b689eee69d42f851edccca26 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 29 Jan 2025 18:16:08 -0800 Subject: [PATCH 25/31] Previous change also added device interfaces for webgpu & qnn Lint --- src/models/kv_cache.h | 1 - src/models/model.h | 2 +- src/models/position_inputs.cpp | 14 +++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/models/kv_cache.h b/src/models/kv_cache.h index 588e52045..9b8fe8e83 100644 --- a/src/models/kv_cache.h +++ b/src/models/kv_cache.h @@ -122,7 +122,6 @@ struct WindowedKeyValueCache : KeyValueCache { } private: - DeviceInterface& Device() { return *model_.p_device_kvcache_; } Ort::Allocator& Allocator() { return model_.p_device_kvcache_->GetAllocator(); } diff --git a/src/models/model.h b/src/models/model.h index ead0648a7..91b202c42 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -137,7 +137,7 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr session_options_; DeviceType device_type_{DeviceType::CPU}; - mutable DeviceInterface* p_device_{}; // The device we're running on (matches device_type_) used for things that work the same on all devices + mutable DeviceInterface* p_device_{}; // The device we're running on (matches device_type_) used for things that work the same on all devices mutable DeviceInterface* p_device_inputs_{}; // For some model inputs, the device might be the CPU device (all but KV cache currently) mutable DeviceInterface* p_device_kvcache_{}; // The kvcache is always allocated in device memory (TODO: Remove in favor of just p_device_?) diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 8546cfb8f..13ab0f0f0 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -176,13 +176,13 @@ void DefaultPositionInputs::UpdateAttentionMask(int total_length, int new_kv_len int max_length = sb_attention_mask_ ? state_.params_->search.max_length : total_length; bool update_only = sb_attention_mask_ && !is_first_mask_update_; model_.p_device_inputs_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), - attention_mask_->GetTensorRawData(), - static_cast(attention_mask_shape_[0]), - new_kv_length, - total_length, - max_length, - update_only, - type_); + attention_mask_->GetTensorRawData(), + static_cast(attention_mask_shape_[0]), + new_kv_length, + total_length, + max_length, + update_only, + type_); } else { type_ == Ort::TypeToTensorType ? 
    UpdateAttentionMaskImpl(total_length) : UpdateAttentionMaskImpl(total_length);

From 198e8f8258f8083b9b8e443db5de9ec57f12549a Mon Sep 17 00:00:00 2001
From: Ryan Hill
Date: Wed, 29 Jan 2025 22:40:37 -0800
Subject: [PATCH 26/31] Remove accidental change

---
 src/csharp/NativeMethods.cs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs
index e53da7421..4dd11169e 100644
--- a/src/csharp/NativeMethods.cs
+++ b/src/csharp/NativeMethods.cs
@@ -17,8 +17,7 @@ internal class NativeLib
 // define the library name required for iOS
         internal const string DllName = "__Internal";
 #else
-// internal const string DllName = "C:\\code\\onnxruntime-genai2\\build\\Windows\\Debug\\Debug\\onnxruntime-genai";
-        internal const string DllName = "C:\\code\\onnxruntime-genai2\\build\\Windows\\RelWithDebInfo\\RelWithDebInfo\\onnxruntime-genai";
+        internal const string DllName = "onnxruntime-genai";
 #endif
     }

From e6b77f22967cb28848e5f9816b9d6d8e29391151 Mon Sep 17 00:00:00 2001
From: Ryan Hill
Date: Thu, 30 Jan 2025 01:24:37 -0800
Subject: [PATCH 27/31] Device check simplifications

---
 src/models/model.cpp           | 7 +++----
 src/models/position_inputs.cpp | 2 --
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/models/model.cpp b/src/models/model.cpp
index ad454f6c4..e245c9786 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -575,11 +575,10 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input,
   // Input shape (batch_size, sequence_length). The input is required with data type T.
   // Output shape (batch_size * num_beams, sequence_length)

-  // If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later.
-  // DML doesn't currently support on-device scoring, so we go the same route as the CPU
-  if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML || device_type_ == DeviceType::WEBGPU)) {
+  // When num_beams == 1, we don't need to expand the input, but the expand has a side effect of copying from
+  // CPU memory to device memory, so we can skip if the p_device_inputs_ is the CPU device
+  if (num_beams == 1 && p_device_inputs_ == GetDeviceInterface(DeviceType::CPU))
     return std::move(input);
-  }

   auto input_type_info = input->GetTensorTypeAndShapeInfo();
   auto element_type = input_type_info->GetElementType();
diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index 13ab0f0f0..06dca1d46 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -166,8 +166,6 @@ void DefaultPositionInputs::UpdateAttentionMask(int total_length, int new_kv_len
   if (position_ids_shape_[0] != 1 && !(total_length == 0 || new_kv_length == 1))
     throw std::runtime_error("DefaultPositionInputs::UpdatePositionIDs - batch_size must be 1 for continuous decoding.");
-  if (DeviceType::DML == model_.device_type_ && !(total_length == 0 || new_kv_length == 1))
-    throw std::runtime_error("DefaultPositionInputs::UpdatePositionIDs - DML does not support continuous decoding.");

   CreateNextAttentionMaskTensor(total_length);
   state_.inputs_[mask_input_index_] = attention_mask_.get();

From 4f2f0844fce0def8ead7bd876463f39c252e03d6 Mon Sep 17 00:00:00 2001
From: Ryan Hill
Date: Thu, 30 Jan 2025 16:02:48 -0800
Subject: [PATCH 28/31] Refactor device_type

---
 src/cpu/interface.cpp  | 2 ++
 src/cuda/interface.cpp | 2 ++
 src/dml/interface.cpp  | 2 ++
src/generators.cpp | 11 +++++------ src/generators.h | 12 +----------- src/models/adapters.cpp | 2 +- src/models/captured_graph_pool.cpp | 2 +- src/models/decoder_only_pipeline.cpp | 10 +++++----- src/models/logits.cpp | 22 ++++------------------ src/models/logits.h | 4 ---- src/models/model.cpp | 24 +++++++++--------------- src/models/model.h | 1 - src/models/position_inputs.cpp | 4 ++-- src/models/whisper.cpp | 2 +- src/python/python.cpp | 2 +- src/qnn/interface.cpp | 2 ++ src/smartptrs.h | 11 +++++++++++ src/webgpu/interface.cpp | 2 ++ test/sampling_benchmark.cpp | 1 - test/sampling_tests.cpp | 13 ------------- 20 files changed, 51 insertions(+), 80 deletions(-) diff --git a/src/cpu/interface.cpp b/src/cpu/interface.cpp index 0b34b80c4..93bbcc6f1 100644 --- a/src/cpu/interface.cpp +++ b/src/cpu/interface.cpp @@ -46,6 +46,8 @@ struct CpuInterface : DeviceInterface { CpuInterface() { } + DeviceType GetType() const override { return DeviceType::CPU; } + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { assert(!ort_allocator_); ort_allocator_ = &allocator; diff --git a/src/cuda/interface.cpp b/src/cuda/interface.cpp index 2638d0e4b..bf21d48c3 100644 --- a/src/cuda/interface.cpp +++ b/src/cuda/interface.cpp @@ -75,6 +75,8 @@ struct CudaInterfaceImpl final : DeviceInterface { ~CudaInterfaceImpl() { } + DeviceType GetType() const override { return DeviceType::CUDA; } + void InitOrt(const OrtApi& api, Ort::Allocator& allocator) override { Ort::api = &api; assert(!ort_allocator_); diff --git a/src/dml/interface.cpp b/src/dml/interface.cpp index a16d770d5..5f6c40c6a 100644 --- a/src/dml/interface.cpp +++ b/src/dml/interface.cpp @@ -115,6 +115,8 @@ struct InterfaceImpl : DeviceInterface { Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&dml_api_))); } + DeviceType GetType() const override { return DeviceType::DML; } + void InitOrt(const OrtApi& api, Ort::Allocator& allocator) override { Ort::api = &api; assert(!ort_allocator_); diff --git a/src/generators.cpp b/src/generators.cpp index 66f49ee64..fd550421b 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -203,8 +203,7 @@ GeneratorParams::GeneratorParams(const Config& config) GeneratorParams::GeneratorParams(const Model& model) : config{*model.config_.get()}, - p_device{model.p_device_}, - device_type{model.device_type_}, + p_device{model.p_device_inputs_}, is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { use_cuda_graph = is_cuda_graph_enabled_; if (use_cuda_graph) { @@ -213,12 +212,12 @@ GeneratorParams::GeneratorParams(const Model& model) } void GeneratorParams::TryGraphCapture(int max_bs) { - if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + if (!is_cuda_graph_enabled_ || p_device->GetType() == DeviceType::CPU) { // no-op return; } - if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (DeviceType::CUDA == p_device->GetType() || DeviceType::DML == p_device->GetType()) { if (max_bs == 0) { throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); } @@ -323,8 +322,8 @@ void Generator::AppendTokens(cpu_span input_ids) { constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU}; if (search_->GetSequenceLength() != 0 && std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(), - [this](DeviceType device_type) { return device_type == 
state_->params_->device_type; })) - throw std::runtime_error("Continuous decoding is not supported on the selected device type (" + to_string(state_->params_->device_type) + + [this](DeviceType device_type) { return device_type == state_->params_->p_device->GetType(); })) + throw std::runtime_error("Continuous decoding is not supported on the selected device type (" + to_string(state_->params_->p_device->GetType()) + "). Please recreate the generator instance to avoid using continuous decoding."); if (last_action_ == Action::generated) { diff --git a/src/generators.h b/src/generators.h index 6d2865a37..50962b744 100644 --- a/src/generators.h +++ b/src/generators.h @@ -64,15 +64,6 @@ struct OrtTensor { // OgaSequences are a vector of int32 vectors using TokenSequences = std::vector>; -enum struct DeviceType { - CPU, - CUDA, - DML, - WEBGPU, - QNN, - MAX -}; - std::string to_string(DeviceType device_type); DeviceInterface* GetDeviceInterface(DeviceType type); @@ -87,8 +78,7 @@ struct GeneratorParams : std::enable_shared_from_this, LeakChec bool use_cuda_graph{}; int BatchBeamSize() const { return search.num_beams * search.batch_size; } - DeviceInterface* p_device{}; - DeviceType device_type{DeviceType::CPU}; + DeviceInterface* p_device{}; // Scoring device (usually CPU, but can be CUDA) cpu_span aux_input_ids{}; // Intermediate solution to be used with SetInputs function for multimodal and whisper models diff --git a/src/models/adapters.cpp b/src/models/adapters.cpp index 5a95ebdd9..13791386c 100644 --- a/src/models/adapters.cpp +++ b/src/models/adapters.cpp @@ -34,7 +34,7 @@ void Adapters::LoadAdapter(const char* adapter_file_path, const std::string& ada } adapters_.emplace(adapter_name, std::make_unique(adapter_file_path, - model_->device_type_ == DeviceType::CUDA + model_->p_device_->GetType() == DeviceType::CUDA ? &model_->p_device_->GetAllocator() : nullptr)); } diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index baaa9243e..a5cea3701 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -19,7 +19,7 @@ void CapturedGraphInfoRecycler::operator()(CapturedGraphInfo* captured_graph_inf } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA)) { + if (!params.use_cuda_graph || (model.p_device_->GetType() != DeviceType::CUDA)) { return nullptr; } diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp index c8c43c2b1..8ae9246f9 100644 --- a/src/models/decoder_only_pipeline.cpp +++ b/src/models/decoder_only_pipeline.cpp @@ -58,9 +58,9 @@ bool IntermediatePipelineState::HasOutput(std::string_view name) const { } bool IntermediatePipelineState::SupportsPrimaryDevice() const { - if (model_.device_type_ == DeviceType::CPU || model_.device_type_ == DeviceType::QNN) { + if (model_.p_device_->GetType() == DeviceType::CPU || model_.p_device_->GetType() == DeviceType::QNN) { return true; - } else if (model_.device_type_ == DeviceType::CUDA) { + } else if (model_.p_device_->GetType() == DeviceType::CUDA) { if (!model_.config_->model.decoder.pipeline[id_].session_options.has_value()) { // No session options, so this session uses the default session options. // Default session options supports the cuda device type. 
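The error messages in the hunks below format the device through to_string(DeviceType), which generators.h declares but this patch does not show. A plausible sketch of such a helper, assuming a simple switch; the actual strings in the repository may differ:

// Sketch only: an assumed to_string(DeviceType) of the kind the error messages rely on.
#include <cstdio>
#include <string>

enum struct DeviceType { CPU, CUDA, DML, WEBGPU, QNN, MAX };

std::string to_string(DeviceType device_type) {
  switch (device_type) {
    case DeviceType::CPU:    return "CPU";
    case DeviceType::CUDA:   return "CUDA";
    case DeviceType::DML:    return "DirectML";
    case DeviceType::WEBGPU: return "WebGPU";
    case DeviceType::QNN:    return "QNN";
    default:                 return "Unknown";
  }
}

int main() {
  std::printf("%s\n", to_string(DeviceType::CUDA).c_str());
}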
@@ -134,7 +134,7 @@ void DecoderOnlyPipelineState::RunPipeline(int total_length, DeviceSpan if (!pipeline_state->SupportsPrimaryDevice()) { std::ostringstream oss; oss << "Managed input " << input_name << " resides on the primary device type (" - << to_string(model_.device_type_) << "). " + << to_string(model_.p_device_->GetType()) << "). " << "But the pipeline model " << model_.config_->model.decoder.pipeline[pipeline_state->id_].model_id << " is expecting it to reside elsewhere."; @@ -159,7 +159,7 @@ void DecoderOnlyPipelineState::RunPipeline(int total_length, DeviceSpan if (!pipeline_state->SupportsPrimaryDevice()) { std::ostringstream oss; oss << "Managed output " << output_name << " resides on the primary device type (" - << to_string(model_.device_type_) << "). " + << to_string(model_.p_device_->GetType()) << "). " << "But the pipeline model " << model_.config_->model.decoder.pipeline[pipeline_state->id_].model_id << " is expecting it to reside elsewhere."; @@ -178,7 +178,7 @@ void DecoderOnlyPipelineState::RunPipeline(int total_length, DeviceSpan if (!pipeline_state->SupportsPrimaryDevice()) { std::ostringstream oss; oss << "Managed input " << input_name << " resides on the primary device type (" - << to_string(model_.device_type_) << "). " + << to_string(model_.p_device_->GetType()) << "). " << "But the pipeline model " << model_.config_->model.decoder.pipeline[pipeline_state->id_].model_id << " is expecting it to reside elsewhere."; diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 48bdc4f32..369bd8e39 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -12,7 +12,7 @@ Logits::Logits(State& state) type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} { output_raw_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_, type_); - if (model_.device_type_ == DeviceType::CUDA && !model_.config_->model.eos_token_ids.empty()) { + if (model_.p_device_inputs_->GetType() == DeviceType::CUDA && !model_.config_->model.eos_token_ids.empty()) { auto& cpu_ids = model_.config_->model.eos_token_ids; cuda_eos_token_ids_ = model_.p_device_->Allocate(cpu_ids.size()); copy(std::span{cpu_ids}, cuda_eos_token_ids_.CpuSpan()); @@ -70,9 +70,9 @@ DeviceSpan Logits::Get() { if (logits_.empty() || logits_of_last_token->GetTensorMutableRawData() != logits_.Span().data()) logits_ = WrapTensor(*model_.p_device_inputs_, *logits_of_last_token); - if (model_.device_type_ == DeviceType::CUDA) { + if (model_.p_device_inputs_->GetType() == DeviceType::CUDA) { if (!cuda_eos_token_ids_.empty()) - model_.p_device_->LaunchHandleEOSArray( + model_.p_device_inputs_->LaunchHandleEOSArray( logits_.Span().data(), static_cast(shape_[0]) /* batch_beam_size*/, static_cast(shape_[2]) /* vocab_size */, @@ -107,21 +107,7 @@ void Logits::Update(const DeviceSpan& next_tokens, size_t new_kv_length } shape_[1] = new_kv_length; - StaticBuffer* sb_logits = type_ == Ort::TypeToTensorType ? sb_logits16_ : sb_logits32_; - output_raw_ = !sb_logits ? 
OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_, type_) - : sb_logits->CreateTensorOnStaticBuffer(shape_, type_); - - if (state_.GetCapturedGraphInfo()) { - if (!sb_logits16_ && !sb_logits32_) { - if (type_ == Ort::TypeToTensorType) { - sb_logits32_ = state_.GetCapturedGraphInfo()->sb_logits32_.get(); - } - if (type_ == Ort::TypeToTensorType) { - sb_logits16_ = state_.GetCapturedGraphInfo()->sb_logits16_.get(); - } - } - } - + output_raw_ = OrtValue::CreateTensor(model_.p_device_inputs_->GetAllocator(), shape_, type_); state_.outputs_[output_index_] = output_raw_.get(); } diff --git a/src/models/logits.h b/src/models/logits.h index 3eae05875..a9c658835 100644 --- a/src/models/logits.h +++ b/src/models/logits.h @@ -39,10 +39,6 @@ struct Logits { // OrtValue wrapped in a DeviceMemory object to make it universal DeviceSpan logits_; - // Used for decoding runs with cuda graphs. - StaticBuffer* sb_logits32_{}; - StaticBuffer* sb_logits16_{}; - DeviceSpan cuda_eos_token_ids_; // eos_token_ids from params, but in cuda accessible memory }; diff --git a/src/models/model.cpp b/src/models/model.cpp index e245c9786..266569419 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -301,10 +301,10 @@ Model::Model(std::unique_ptr config) : config_{std::move(config)} { Model::~Model() = default; void Model::InitDeviceAllocator(OrtSession& session) { - EnsureDeviceOrtInit(session, device_type_); + EnsureDeviceOrtInit(session, p_device_->GetType()); // Only CUDA does every input on the device - if (device_type_ == DeviceType::CUDA) + if (p_device_->GetType() == DeviceType::CUDA) p_device_inputs_ = p_device_; else p_device_inputs_ = GetDeviceInterface(DeviceType::CPU); @@ -413,8 +413,7 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ // Device type determines the scoring device. // Only use the primary session options to determine the device type if (is_primary_session_options) { - device_type_ = DeviceType::CUDA; // Scoring will use CUDA - p_device_ = GetDeviceInterface(device_type_); + p_device_ = GetDeviceInterface(DeviceType::CUDA); // Create and set our cudaStream_t ort_provider_options->UpdateValue("user_compute_stream", p_device_->GetCudaStream()); @@ -451,15 +450,10 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ InitDmlInterface(p_device_luid); } - if (!disable_graph_capture) { - session_options.AddConfigEntry("ep.dml.enable_graph_capture", "1"); - session_options.AddConfigEntry("ep.dml.disable_memory_arena", "1"); - } - SetDmlProvider(session_options); if (is_primary_session_options) - device_type_ = DeviceType::DML; // We use a DML allocator for input/output caches, but other tensors will use CPU tensors + p_device_ = GetDeviceInterface(DeviceType::DML); // We use a DML allocator for input/output caches, but other tensors will use CPU tensors #endif } else if (provider_options.name == "qnn") { session_options.AddConfigEntry("ep.share_ep_contexts", "1"); @@ -473,12 +467,12 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ // on the other hand, not sure if is_primary_session_options is the right thing to check here. 
if (const auto opt_it = opts.find("enable_htp_shared_memory_allocator"); opt_it != opts.end() && opt_it->second == "1") { - device_type_ = DeviceType::QNN; + p_device_ = GetDeviceInterface(DeviceType::QNN); } session_options.AppendExecutionProvider("QNN", opts); } else if (provider_options.name == "webgpu") { - device_type_ = DeviceType::WEBGPU; + p_device_ = GetDeviceInterface(DeviceType::WEBGPU); std::unordered_map opts; for (auto& option : provider_options.options) { opts.emplace(option.first, option.second); @@ -488,9 +482,9 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_ throw std::runtime_error("Unknown provider type: " + provider_options.name); } - if (!p_device_) { - p_device_ = GetDeviceInterface(device_type_); - } + // Fallback to CPU if no provider specific interface was set + if (!p_device_) + p_device_ = GetDeviceInterface(DeviceType::CPU); } void Model::CreateSessionOptions() { diff --git a/src/models/model.h b/src/models/model.h index 91b202c42..c17736b73 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -136,7 +136,6 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr config_; std::unique_ptr session_options_; - DeviceType device_type_{DeviceType::CPU}; mutable DeviceInterface* p_device_{}; // The device we're running on (matches device_type_) used for things that work the same on all devices mutable DeviceInterface* p_device_inputs_{}; // For some model inputs, the device might be the CPU device (all but KV cache currently) mutable DeviceInterface* p_device_kvcache_{}; // The kvcache is always allocated in device memory (TODO: Remove in favor of just p_device_?) diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 06dca1d46..ce81bae66 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -142,7 +142,7 @@ void DefaultPositionInputs::UpdatePositionIDs(int total_length, int new_kv_lengt state_.inputs_[posid_input_index_] = position_ids_.get(); } - if (model_.device_type_ == DeviceType::CUDA) + if (model_.p_device_inputs_->GetType() == DeviceType::CUDA) model_.p_device_inputs_->UpdatePositionIds(position_ids_->GetTensorMutableRawData(), static_cast(position_ids_shape_[0]), total_length, new_kv_length, type_); else { type_ == Ort::TypeToTensorType ? UpdatePositionIDsImpl(total_length, new_kv_length) @@ -170,7 +170,7 @@ void DefaultPositionInputs::UpdateAttentionMask(int total_length, int new_kv_len CreateNextAttentionMaskTensor(total_length); state_.inputs_[mask_input_index_] = attention_mask_.get(); - if (model_.device_type_ == DeviceType::CUDA) { + if (model_.p_device_inputs_->GetType() == DeviceType::CUDA) { int max_length = sb_attention_mask_ ? 
state_.params_->search.max_length : total_length; bool update_only = sb_attention_mask_ && !is_first_mask_update_; model_.p_device_inputs_->UpdateAttentionMask(attention_mask_next_->GetTensorMutableRawData(), diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index 6bb64df65..05fc20171 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -182,7 +182,7 @@ DeviceSpan Whisper_State::Run(int current_length, DeviceSpan& ne auto src_data = init_presents_[i]->GetTensorRawData(); auto dest_data = presents_[i]->GetTensorMutableRawData(); - switch (model_.device_type_) { + switch (model_.p_device_inputs_->GetType()) { #if 0 // USE_CUDA case DeviceType::CUDA: if (self_attn_kv_cache_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { diff --git a/src/python/python.cpp b/src/python/python.cpp index deae726da..0ae6e6236 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -412,7 +412,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { })) .def_property_readonly("type", [](const Model& model) { return model.config_->model.type; }) .def_property_readonly( - "device_type", [](const Model& model) { return to_string(model.device_type_); }, "The device type the model is running on") + "device_type", [](const Model& model) { return to_string(model.p_device_->GetType()); }, "The device type the model is running on") .def("create_multimodal_processor", [](const Model& model) { return model.CreateMultiModalProcessor(); }); pybind11::class_(m, "Generator") diff --git a/src/qnn/interface.cpp b/src/qnn/interface.cpp index 3acc3b58d..0fc746437 100644 --- a/src/qnn/interface.cpp +++ b/src/qnn/interface.cpp @@ -46,6 +46,8 @@ struct InterfaceImpl : DeviceInterface { InterfaceImpl() { } + DeviceType GetType() const override { return DeviceType::QNN; } + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { assert(!ort_allocator_); ort_allocator_ = &allocator; diff --git a/src/smartptrs.h b/src/smartptrs.h index 1e2a79f47..642196ca3 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -83,8 +83,19 @@ struct DeviceSpan { friend struct DeviceSpan; // All DeviceSpans are friends }; +enum struct DeviceType { + CPU, + CUDA, + DML, + WEBGPU, + QNN, + MAX +}; + struct DeviceInterface { virtual ~DeviceInterface() {} + + virtual DeviceType GetType() const = 0; virtual void InitOrt(const OrtApi& api, Ort::Allocator& allocator) = 0; virtual Ort::Allocator& GetAllocator() = 0; diff --git a/src/webgpu/interface.cpp b/src/webgpu/interface.cpp index 246cfed50..3b1fd9c25 100644 --- a/src/webgpu/interface.cpp +++ b/src/webgpu/interface.cpp @@ -46,6 +46,8 @@ struct InterfaceImpl : DeviceInterface { InterfaceImpl() { } + DeviceType GetType() const override { return DeviceType::WEBGPU; } + void InitOrt(const OrtApi& /*api*/, Ort::Allocator& allocator) override { assert(!ort_allocator_); ort_allocator_ = &allocator; diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index eb7f04cd9..28137fdc9 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -32,7 +32,6 @@ struct SamplingBenchmark { params->search.max_length = 10; params->search.batch_size = batch_size_; params->p_device = Generators::GetDeviceInterface(device_type_); - params->device_type = device_type_; std::random_device rd; std::mt19937 engine(rd()); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index a42609306..6fa206873 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -39,7 +39,6 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { 
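With DeviceType and GetType() now living on DeviceInterface (smartptrs.h above), callers ask the device object for its type instead of carrying a separate device_type_ field. The lookup behind GetDeviceInterface(DeviceType) is not shown in this patch; the sketch below only illustrates the shape of such a per-device singleton registry, using the hypothetical name GetDevice:

// Sketch of a per-device-type singleton lookup; not the real GetDeviceInterface implementation.
#include <stdexcept>

enum struct DeviceType { CPU, CUDA, DML, WEBGPU, QNN, MAX };

struct DeviceInterface {
  virtual ~DeviceInterface() = default;
  virtual DeviceType GetType() const = 0;
};

struct CpuDevice final : DeviceInterface {
  DeviceType GetType() const override { return DeviceType::CPU; }
};

DeviceInterface* GetDevice(DeviceType type) {
  static CpuDevice cpu;
  switch (type) {
    case DeviceType::CPU:
      return &cpu;
    default:
      // Other device types would return their own singletons (CUDA, DML, WebGPU, QNN).
      throw std::runtime_error("device not built into this sketch");
  }
}

int main() {
  return GetDevice(DeviceType::CPU)->GetType() == DeviceType::CPU ? 0 : 1;
}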
params->search.top_p = 0.25f; params->search.batch_size = 4; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU); - params->device_type = Generators::DeviceType::CPU; auto generator = Generators::CreateGenerator(*model, *params); auto logits = params->p_device->WrapMemory(logits_cpu); generator->SetLogits(logits); @@ -66,7 +65,6 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { params->search.top_k = 2; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU); - params->device_type = Generators::DeviceType::CPU; auto generator = Generators::CreateGenerator(*model, *params); auto logits_copy = logits_cpu; auto logits = params->p_device->WrapMemory(logits_copy); @@ -101,7 +99,6 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { params->search.top_p = 0.25f; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU); - params->device_type = Generators::DeviceType::CPU; auto generator = Generators::CreateGenerator(*model, *params); auto logits_copy = logits_cpu; auto logits = params->p_device->WrapMemory(logits_copy); @@ -152,7 +149,6 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { params->search.top_p = 0.95f; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU); - params->device_type = Generators::DeviceType::CPU; std::vector logits_cpu(config.model.vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); @@ -205,7 +201,6 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { params->search.top_k = k; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU); - params->device_type = Generators::DeviceType::CPU; // Create data structures for testing std::random_device rd; @@ -270,7 +265,6 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { params->search.top_p = p; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU); - params->device_type = Generators::DeviceType::CPU; std::vector logits_cpu(config.model.vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); @@ -317,7 +311,6 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { params->search.top_p = 0.25f; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; auto logits = AllocateFromCpuMem(*params->p_device, logits_cpu); auto generator = Generators::CreateGenerator(*model, *params); generator->SetLogits(logits); @@ -345,7 +338,6 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { params->search.top_k = 2; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; auto logits = AllocateFromCpuMem(*params->p_device, logits_cpu); auto generator = Generators::CreateGenerator(*model, *params); generator->SetLogits(logits); @@ -378,7 +370,6 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { params->search.top_p = 0.25f; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; auto logits = AllocateFromCpuMem(*params->p_device, logits_cpu); auto generator = Generators::CreateGenerator(*model, *params); 
generator->SetLogits(logits); @@ -406,7 +397,6 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { params->search.top_p = 0.95f; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = params->p_device->Allocate(config.model.vocab_size * batch_size); auto indices_buffer = params->p_device->Allocate(config.model.vocab_size * batch_size); @@ -448,7 +438,6 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { params->search.top_k = k; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; // Create data structures for testing std::random_device rd; @@ -512,7 +501,6 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { params->search.top_p = p; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = params->p_device->Allocate(config.model.vocab_size * batch_size); auto indices_buffer = params->p_device->Allocate(config.model.vocab_size * batch_size); std::random_device rd; @@ -549,7 +537,6 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { params->search.max_length = 10; params->search.batch_size = batch_size; params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA); - params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = params->p_device->Allocate(config.model.vocab_size * batch_size); auto indices_buffer = params->p_device->Allocate(config.model.vocab_size * batch_size); std::random_device rd; From acba52cb41d50465def66dd2d42291b1d6c2d588 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:24:33 -0800 Subject: [PATCH 29/31] Update src/models/model.h Co-authored-by: aciddelgado <139922440+aciddelgado@users.noreply.github.com> --- src/models/model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/model.h b/src/models/model.h index c17736b73..9b04e589d 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -137,7 +137,7 @@ struct Model : std::enable_shared_from_this, LeakChecked { std::unique_ptr session_options_; mutable DeviceInterface* p_device_{}; // The device we're running on (matches device_type_) used for things that work the same on all devices - mutable DeviceInterface* p_device_inputs_{}; // For some model inputs, the device might be the CPU device (all but KV cache currently) + mutable DeviceInterface* p_device_inputs_{}; // For some model inputs, the device might be the CPU device (all but KV cache currently for WebGPU and DML) mutable DeviceInterface* p_device_kvcache_{}; // The kvcache is always allocated in device memory (TODO: Remove in favor of just p_device_?) 
Ort::Allocator& allocator_cpu_{GetDeviceInterface(DeviceType::CPU)->GetAllocator()}; From 68a6ea7d8c4162bb42d6a8507e334a298add223e Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 12 Feb 2025 18:26:47 -0800 Subject: [PATCH 30/31] Fix merge conflicts --- src/beam_search_scorer.cpp | 2 +- src/generators.h | 1 - src/models/windowed_kv_cache.cpp | 16 ++++++++-------- src/models/windowed_kv_cache.h | 3 +++ 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/beam_search_scorer.cpp b/src/beam_search_scorer.cpp index b9cbceffe..22765b6c4 100644 --- a/src/beam_search_scorer.cpp +++ b/src/beam_search_scorer.cpp @@ -27,7 +27,7 @@ void BeamHypotheses::Add(std::span hypothesis, float sum_logprobs) { return; } } else { - beams_used_++; + beams_used_++; } // Rotate existing elements over while the new element scores higher diff --git a/src/generators.h b/src/generators.h index c84358b0e..185c04583 100644 --- a/src/generators.h +++ b/src/generators.h @@ -26,7 +26,6 @@ #include "leakcheck.h" #include "make_string.h" -#include "smartptrs.h" #include "models/onnxruntime_api.h" #include "smartptrs.h" #include "models/debugging.h" diff --git a/src/models/windowed_kv_cache.cpp b/src/models/windowed_kv_cache.cpp index a143d86b3..8b147f65d 100644 --- a/src/models/windowed_kv_cache.cpp +++ b/src/models/windowed_kv_cache.cpp @@ -47,21 +47,21 @@ WindowedKeyValueCache::WindowedKeyValueCache(State& state) for (int i = 0; i < layer_count_; ++i) { key_caches_in_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, key_cache_shape_in_, type_)); + OrtValue::CreateTensor(Allocator(), key_cache_shape_in_, type_)); std::fill_n(key_caches_in_[i]->GetTensorMutableData(), ElementCountFromShape(key_cache_shape_in_), static_cast(model_.config_->model.decoder.sliding_window->pad_value)); value_caches_in_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, value_cache_shape_in_, type_)); + OrtValue::CreateTensor(Allocator(), value_cache_shape_in_, type_)); std::fill_n(value_caches_in_[i]->GetTensorMutableData(), ElementCountFromShape(value_cache_shape_in_), static_cast(model_.config_->model.decoder.sliding_window->pad_value)); key_caches_out_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, key_cache_shape_out_, type_)); + OrtValue::CreateTensor(Allocator(), key_cache_shape_out_, type_)); value_caches_out_.push_back( - OrtValue::CreateTensor(*model_.allocator_device_, value_cache_shape_out_, type_)); + OrtValue::CreateTensor(Allocator(), value_cache_shape_out_, type_)); } } @@ -187,7 +187,7 @@ void WindowedKeyValueCache::Update(DeviceSpan /* beam_indices */, int c ThreadPool thread_pool{static_cast(layer_count_)}; thread_pool.Compute([&](size_t layer_idx) { - std::unique_ptr key_cache = OrtValue::CreateTensor(*model_.allocator_device_, updated_key_cache_shape_in, type_); + std::unique_ptr key_cache = OrtValue::CreateTensor(Allocator(), updated_key_cache_shape_in, type_); uint8_t* key_cache_data = key_cache->GetTensorMutableData(); uint8_t* key_cache_in_data = key_caches_in_[layer_idx]->GetTensorMutableData(); @@ -213,9 +213,9 @@ void WindowedKeyValueCache::Update(DeviceSpan /* beam_indices */, int c } key_caches_in_[layer_idx] = std::move(key_cache); - key_caches_out_[layer_idx] = OrtValue::CreateTensor(*model_.allocator_device_, updated_key_cache_shape_out, type_); + key_caches_out_[layer_idx] = OrtValue::CreateTensor(Allocator(), updated_key_cache_shape_out, type_); - std::unique_ptr value_cache = OrtValue::CreateTensor(*model_.allocator_device_, updated_value_cache_shape_in, 
type_); + std::unique_ptr value_cache = OrtValue::CreateTensor(Allocator(), updated_value_cache_shape_in, type_); uint8_t* value_cache_data = value_cache->GetTensorMutableData(); uint8_t* value_cache_in_data = value_caches_in_[layer_idx]->GetTensorMutableData(); @@ -241,7 +241,7 @@ void WindowedKeyValueCache::Update(DeviceSpan /* beam_indices */, int c } value_caches_in_[layer_idx] = std::move(value_cache); - value_caches_out_[layer_idx] = OrtValue::CreateTensor(*model_.allocator_device_, updated_value_cache_shape_out, type_); + value_caches_out_[layer_idx] = OrtValue::CreateTensor(Allocator(), updated_value_cache_shape_out, type_); }); window_size_ = 1; diff --git a/src/models/windowed_kv_cache.h b/src/models/windowed_kv_cache.h index a842eb04d..4e7de10ea 100644 --- a/src/models/windowed_kv_cache.h +++ b/src/models/windowed_kv_cache.h @@ -31,6 +31,9 @@ struct WindowedKeyValueCache : KeyValueCache { void SlideAllLayers(); void SlideLayers(std::span layer_indices); + DeviceInterface& Device() { return *model_.p_device_kvcache_; } + Ort::Allocator& Allocator() { return model_.p_device_kvcache_->GetAllocator(); } + State& state_; const Model& model_{state_.model_}; int layer_count_{}; From 12e2f76c97d2585d7d6230798ac56985a1b9e1bf Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 12 Feb 2025 18:27:45 -0800 Subject: [PATCH 31/31] Formatting --- src/beam_search_scorer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/beam_search_scorer.cpp b/src/beam_search_scorer.cpp index 22765b6c4..b9cbceffe 100644 --- a/src/beam_search_scorer.cpp +++ b/src/beam_search_scorer.cpp @@ -27,7 +27,7 @@ void BeamHypotheses::Add(std::span hypothesis, float sum_logprobs) { return; } } else { - beams_used_++; + beams_used_++; } // Rotate existing elements over while the new element scores higher
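WindowedKeyValueCache::Update above rebuilds every layer's key/value tensors in parallel through its ThreadPool helper. A stand-alone sketch of that one-task-per-layer dispatch using std::thread, with a placeholder body instead of the real cache copies:

// Sketch of the run-one-task-per-layer pattern; the real per-layer body rebuilds that
// layer's key/value cache tensors.
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

template <typename Fn>
void ComputePerLayer(size_t layer_count, Fn fn) {
  std::vector<std::thread> workers;
  workers.reserve(layer_count);
  for (size_t layer_idx = 0; layer_idx < layer_count; ++layer_idx)
    workers.emplace_back([&fn, layer_idx] { fn(layer_idx); });
  for (auto& worker : workers)
    worker.join();
}

int main() {
  ComputePerLayer(4, [](size_t layer_idx) {
    std::printf("updating caches for layer %zu\n", layer_idx);
  });
}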