From 948b0343d549ca7e97309f7cbc923ea81b5c3efe Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Fri, 1 Nov 2024 16:52:40 -0700
Subject: [PATCH 1/7] Capacity Aware Partitioning begins

Implement GetSizeFromTensorTypeProto
Wire in accounting
Make CUDA EP resource aware and account on assignment
Fix missing accountant for Ort format
Remove redundant functions
Remove unnecessary interface
Fix DML issue, minor fixes
Fix alert
DEMO changes
Implement node memory stats collection
Place container in the session.
Support nested graphs
Add synchronization
Update stats for the max consumption.
Introduce input sizes computation.
---
 .../core/framework/execution_provider.h | 5 +-
 .../core/framework/op_kernel_context.h | 4 +
 .../core/framework/resource_accountant.h | 100 +++++++
 include/onnxruntime/core/graph/graph.h | 7 +
 .../core/graph/indexed_sub_graph.h | 49 ++++
 .../onnxruntime_session_options_config_keys.h | 25 ++
 onnxruntime/core/framework/execution_frame.cc | 8 +
 onnxruntime/core/framework/execution_frame.h | 20 +-
 .../core/framework/execution_provider.cc | 3 +-
 .../core/framework/graph_partitioner.cc | 159 +++++++++--
 onnxruntime/core/framework/op_kernel.cc | 10 +
 .../framework/op_kernel_context_internal.h | 67 +++++
 .../core/framework/resource_accountant.cc | 47 ++++
 .../core/framework/sequential_executor.cc | 61 ++++-
 onnxruntime/core/framework/session_state.h | 22 ++
 .../core/framework/tensorprotoutils.cc | 60 +++--
 onnxruntime/core/framework/tensorprotoutils.h | 3 +
 onnxruntime/core/graph/graph.cc | 36 +++
 .../providers/acl/acl_execution_provider.cc | 3 +-
 .../providers/acl/acl_execution_provider.h | 3 +-
 .../providers/cann/cann_execution_provider.cc | 3 +-
 .../providers/cann/cann_execution_provider.h | 3 +-
 .../coreml/coreml_execution_provider.cc | 3 +-
 .../coreml/coreml_execution_provider.h | 3 +-
 .../providers/cuda/cuda_execution_provider.cc | 72 ++++-
 .../providers/cuda/cuda_execution_provider.h | 3 +-
 .../src/ExecutionProvider.cpp | 10 +-
 .../src/ExecutionProvider.h | 10 +-
 .../providers/dnnl/dnnl_execution_provider.cc | 3 +-
 .../providers/dnnl/dnnl_execution_provider.h | 3 +-
 .../providers/js/js_execution_provider.cc | 3 +-
 .../core/providers/js/js_execution_provider.h | 3 +-
 .../migraphx/migraphx_execution_provider.cc | 3 +-
 .../migraphx/migraphx_execution_provider.h | 3 +-
 .../nnapi_builtin/nnapi_execution_provider.cc | 5 +-
 .../nnapi_builtin/nnapi_execution_provider.h | 3 +-
 .../openvino/openvino_execution_provider.cc | 3 +-
 .../openvino/openvino_execution_provider.h | 3 +-
 .../providers/qnn/qnn_execution_provider.cc | 3 +-
 .../providers/qnn/qnn_execution_provider.h | 3 +-
 .../rknpu/rknpu_execution_provider.cc | 3 +-
 .../rknpu/rknpu_execution_provider.h | 3 +-
 .../providers/rocm/rocm_execution_provider.cc | 3 +-
 .../providers/rocm/rocm_execution_provider.h | 3 +-
 .../provider_bridge_provider.cc | 5 +-
 .../shared_library/provider_interfaces.h | 8 +-
 .../shared_library/provider_wrappedtypes.h | 10 +
 .../providers/snpe/snpe_execution_provider.cc | 3 +-
 .../providers/snpe/snpe_execution_provider.h | 3 +-
 .../tensorrt/tensorrt_execution_provider.cc | 3 +-
 .../tensorrt/tensorrt_execution_provider.h | 3 +-
 .../vitisai/vitisai_execution_provider.cc | 2 +-
 .../vitisai/vitisai_execution_provider.h | 3 +-
 .../vsinpu/vsinpu_execution_provider.cc | 4 +-
 .../vsinpu/vsinpu_execution_provider.h | 3 +-
 .../webgpu/webgpu_execution_provider.cc | 3 +-
 .../webgpu/webgpu_execution_provider.h | 3 +-
 .../webnn/webnn_execution_provider.cc | 3 +-
 .../webnn/webnn_execution_provider.h | 3 +-
.../xnnpack/xnnpack_execution_provider.cc | 3 +- .../xnnpack/xnnpack_execution_provider.h | 3 +- onnxruntime/core/session/inference_session.cc | 38 +++ onnxruntime/core/session/inference_session.h | 31 +++ onnxruntime/core/session/onnxruntime_c_api.cc | 1 + .../core/session/provider_bridge_ort.cc | 15 +- .../test/framework/inference_session_test.cc | 204 ++++++++++---- .../test/framework/session_state_test.cc | 253 ++++++++++++++++++ onnxruntime/test/framework/test_utils.h | 41 ++- .../internal_testing_execution_provider.cc | 3 +- .../internal_testing_execution_provider.h | 3 +- .../test/providers/qnn/qnn_test_utils.cc | 4 +- onnxruntime/test/shared_lib/test_inference.cc | 13 + 72 files changed, 1366 insertions(+), 154 deletions(-) create mode 100644 include/onnxruntime/core/framework/resource_accountant.h create mode 100644 onnxruntime/core/framework/resource_accountant.cc diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 0d9e6db1a7748..c9a15de9ef897 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -38,6 +38,8 @@ struct OrtRunOptions; namespace onnxruntime { +class IResourceAccountant; + /** Logical device representation. */ @@ -130,7 +132,8 @@ class IExecutionProvider { */ virtual std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant = nullptr) const; /** Get kernel registry per execution provider type. diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index ac22d9130983a..a67d7b8ae0174 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -204,6 +204,10 @@ class OpKernelContext { virtual OrtValue* GetOrCreateOutputMLValue(int index); + virtual int GetOrtValueIndexForInput(int input_index) const; + + virtual int GetOrtValueIndexForOutput(int output_index) const; + private: ORT_DISALLOW_COPY_AND_ASSIGNMENT(OpKernelContext); int GetInputArgIndex(int index) const; diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h new file mode 100644 index 0000000000000..982b37c969fe7 --- /dev/null +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "core/common/common.h" + +namespace onnxruntime { + +// Common holder for potentially different resource accounting +// for different EPs +using ResourceCount = std::variant; + +/// +/// This class is used for graph partitioning by EPs +/// It stores the cumulative amount of the resource such as +/// memory that would be consumed by the graph nodes if it is assigned to the EP. +/// +/// It provides interfaces to add, remove and query the resource consumption. +/// +/// Each provider may assign its own meaning to the resource according to its constraints. 
+/// +class IResourceAccountant { + protected: + IResourceAccountant() = default; + IResourceAccountant(const ResourceCount& threshold) : threshold_(threshold) {} + + public: + virtual ~IResourceAccountant() = default; + virtual ResourceCount GetConsumedAmount() const = 0; + virtual void AddConsumedAmount(const ResourceCount& amount) = 0; + virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0; + virtual ResourceCount ComputeResourceCount(const std::string& node_name) const = 0; + + std::optional GetThreshold() const { + return threshold_; + } + + void SetStopAssignment() noexcept { + stop_assignment_ = true; + } + + bool IsStopIssued() const noexcept { return stop_assignment_; } + + private: + bool stop_assignment_ = false; + std::optional threshold_; +}; + +// This struct keeps accounting of the memory allocation stats +// for a kernel during runtime if enabled. +struct NodeAllocationStats { + size_t input_sizes = 0; + size_t initializers_sizes = 0; + size_t total_dynamic_sizes = 0; + size_t total_temp_allocations = 0; + + NodeAllocationStats& operator+=(const NodeAllocationStats& other) { + input_sizes += other.input_sizes; + initializers_sizes += other.initializers_sizes; + total_dynamic_sizes += other.total_dynamic_sizes; + total_temp_allocations += other.total_temp_allocations; + return *this; + } + + void UpdateIfGreater(const NodeAllocationStats& other) { + input_sizes = std::max(input_sizes, other.input_sizes); + initializers_sizes = std::max(initializers_sizes, other.initializers_sizes); + total_dynamic_sizes = std::max(total_dynamic_sizes, other.total_dynamic_sizes); + total_temp_allocations = std::max(total_temp_allocations, other.total_temp_allocations); + } +}; + +class NodeStatsRecorder { + public: + explicit NodeStatsRecorder(const std::filesystem::path& stats_file_name); + ~NodeStatsRecorder(); + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NodeStatsRecorder); + + const std::filesystem::path& GetNodeStatsFileName() const noexcept; + + void ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats); + + void DumpStats(std::ostream& os) const; + + private: + // We would like to hide certain things that may not compile + // with some device compilers + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace onnxruntime diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 7798394b045dc..1eaf2119f34fe 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -883,6 +883,13 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return ConstGraphNodes(nodes_, std::move(filter_func)); } + /** Compute node memory requirements, which is mostly initializers + and large attributes that are copied on device (special cases for some nodes) + + Returns no value if the node was not found. + */ + size_t ComputeNodeMemoryUsage(NodeIndex) const; + /** Gets the maximum NodeIndex value used in the Graph. WARNING: This actually returns the max index value used + 1. 
*/
diff --git a/include/onnxruntime/core/graph/indexed_sub_graph.h b/include/onnxruntime/core/graph/indexed_sub_graph.h
index c57db41254159..959b183e272ea 100644
--- a/include/onnxruntime/core/graph/indexed_sub_graph.h
+++ b/include/onnxruntime/core/graph/indexed_sub_graph.h
@@ -7,6 +7,8 @@
 #include
 #include
+#include "core/common/inlined_containers_fwd.h"
+#include "core/framework/resource_accountant.h"
 #include "core/graph/basic_types.h"
 #include "core/graph/onnx_protobuf.h"
@@ -70,9 +72,56 @@ struct IndexedSubGraph {
    return meta_def_.get();
  }
+
+  // Check if the accounting is enabled for the current EP
+  bool IsAccountingEnabled() const {
+    return resource_accountant != nullptr &&
+           nodes_costs.size() == nodes.size();
+  }
+
+  // Should call IsAccountingEnabled() first.
+  // Takes the ResourceCount previously computed for the node
+  // (usually during GetCapability()) and, if present, adds it to the consumed amount.
+  void AccountForNode(size_t cost_index) const {
+    assert(cost_index < nodes_costs.size());
+    if (nodes_costs[cost_index].has_value()) {
+      resource_accountant->AddConsumedAmount(*nodes_costs[cost_index]);
+    }
+  }
+
+  // Computes and accounts for the resource cost of a node that has just been
+  // fused from other nodes, for which the EP did not have a chance to compute the cost.
+  void ComputeAndAccountForNode(const std::string& node_name) const {
+    assert(resource_accountant != nullptr);
+    resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node_name));
+  }
+
+  void SetAccountant(IResourceAccountant* res_accountant) {
+    resource_accountant = res_accountant;
+  }
+
+  // Append resource count to the list of costs for the nodes.
+  void AppendNodeCost(const ResourceCount& cost) {
+    assert(resource_accountant != nullptr);
+    nodes_costs.emplace_back(cost);
+  }
+
+  // Append an absent cost for a node that was already accounted for.
+  void AppendNodeEmptyCost() {
+    assert(resource_accountant != nullptr);
+    nodes_costs.emplace_back();
+  }
+
  private:
  // subgraph meta definition.
  std::unique_ptr<MetaDef> meta_def_;
+  // Optional resource accountant for this subgraph.
+  IResourceAccountant* resource_accountant = nullptr;
+  // Vector with resource costs for the nodes above. Should have the same size as nodes.
+  // Nodes that were previously accounted for, for example because they were already assigned
+  // to an EP during earlier calls to GetCapability(), will not have a resource count present;
+  // such entries are skipped.
+  InlinedVector<std::optional<ResourceCount>> nodes_costs;
 };
 } // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 64a4dd19c12b0..5d59380f7d643 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -261,6 +261,31 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMin
 static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
    "session.save_external_prepacked_constant_initializers";
+
+// Use this config when you want to collect memory stats for each node in the graph.
+// The output is a CSV file with the following columns:
+// node_name, input_sizes, initializers_memory, dynamic_outputs_sizes, temp_allocations_size
+// The file will be created if it does not exist, and will be overwritten if it does.
+//
+// The content of the file can be used to estimate memory requirements at run time including
+// the temporary allocations. This operation is preferably done on a CPU device, as the model may exceed
+// device memory limits in constrained environments. When enabling this option, it is important to disable
+// memory patterns, as they tend to allocate large blocks to avoid fragmentation and accommodate the needs of
+// multiple kernels. Memory patterns may make it difficult to allocate on a device with limited memory.
+//
+// The collected stats can then be used to partition the graph among the devices in a way that only the
+// required memory is allocated on each device.
+//
+// - "full path to file": there is no default for this option. If the file cannot be opened for writing, an error will be returned.
+static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "session.collect_node_memory_stats_to_file";
+
+/// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats"
+/// "limit > 0": enables Capacity Aware Partitioning for the CUDA EP. The EP will place nodes on the device
+///   until the accumulated memory consumption of the assigned nodes reaches the limit.
+/// "file name": this file is expected to be found in the same folder as the model. The file contains
+///   pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enabled (see above).
+static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings =
+    "session.resource_cuda_partitioning_settings";
+
 // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 894e0daae94b6..bc13c30294875 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -23,6 +23,8 @@
 #include "core/framework/bfc_arena.h"
+
+#include "core/session/onnxruntime_session_options_config_keys.h"
+
 using namespace onnxruntime::common;
 namespace onnxruntime {
@@ -614,6 +616,12 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
 #endif
  }
+
+#if !defined(ORT_MINIMAL_BUILD)
+  if (session_state_.GetNodeStatsRecorder() != nullptr) {
+    ort_value_to_dynamic_allocations_size_.insert_or_assign(ort_value_index, size);
+  }
+#endif
+
  return Status::OK();
 }
diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h
index de571f86f1c77..7b5a8fd8a4b01 100644
--- a/onnxruntime/core/framework/execution_frame.h
+++ b/onnxruntime/core/framework/execution_frame.h
@@ -92,10 +92,10 @@ class IExecutionFrame {
  Status ReleaseMLValue(int ort_value_idx);
- protected:
  // get the ort_value_idx from NodeIndexInfo
  int GetNodeIdxToMLValueIdx(int index) const;
+ protected:
  OrtValue& GetMutableMLValue(int ort_value_index) { return const_cast(GetMLValue(ort_value_index)); }
  virtual Status ReleaseMLValueImpl(int ort_value_idx);
@@ -103,6 +103,8 @@ class IExecutionFrame {
  // returns true if the ort_value_idx is an output from the graph
  bool IsOutput(int ort_value_idx) const;
+  const OrtValueNameIdxMap& GetOrtValueNameIdxMap() const noexcept { return ort_value_idx_map_; }
+
  private:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(IExecutionFrame);
@@ -166,6 +168,16 @@ class ExecutionFrame final : public IExecutionFrame {
    return planner_.has_value();
  }
+#if !defined(ORT_MINIMAL_BUILD)
+  std::optional<size_t>
GetOrtValueDynamicAllocation(int ort_value_index) const { + auto it = ort_value_to_dynamic_allocations_size_.find(ort_value_index); + if (it != ort_value_to_dynamic_allocations_size_.end()) { + return it->second; + } + return std::nullopt; + } +#endif + // This function try retrieve the inferred shapes for the given NodeArg index. // If the retrival is successful, this function returns true and false otherwise. bool TryGetInferredShape(int index, TensorShape& shape) const override; @@ -258,10 +270,14 @@ class ExecutionFrame final : public IExecutionFrame { // This field is not physical memory size. // dynamic_activation_memory_sizes_in_byte_[location] is the dynamic memory consumption on "location". std::unordered_map dynamic_activation_memory_sizes_in_byte_; +#endif +#if !defined(ORT_MINIMAL_BUILD) + // OrtValue index to the size of dynamic memory allocation. + std::unordered_map ort_value_to_dynamic_allocations_size_; +#endif // Mutex which should be acquired when executing non-thread-safe member functions. // A current example is the tracker of dynamic memory allocation. mutable std::mutex mtx_; -#endif }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index b39924d4c3ff9..3a937a119d03b 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -13,7 +13,8 @@ namespace onnxruntime { std::vector> IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant*) const { std::vector> result; for (const auto& node : graph.Nodes()) { if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node); diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index b97cf03e3bf59..8a01e3973cdc6 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -5,13 +5,17 @@ #include #include +#include +#include "core/common/inlined_containers.h" +#include "core/common/string_utils.h" #include "core/framework/compute_capability.h" #include "core/framework/execution_providers.h" #include "core/framework/func_kernel.h" #include "core/framework/kernel_lookup.h" #include "core/framework/kernel_registry_manager.h" #include "core/framework/kernel_registry.h" +#include "core/framework/resource_accountant.h" #include "core/graph/function.h" #include "core/graph/function_utils.h" #include "core/graph/graph_viewer.h" @@ -49,6 +53,9 @@ namespace onnxruntime { namespace { +// A map of Ep Type to a resource accountant for this EP +using ResourceAccountantMap = InlinedHashMap>; + // contains some common parameters used by the partitioning helper functions struct PartitionParams { std::reference_wrapper graph; @@ -60,6 +67,72 @@ struct PartitionParams { std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) }; + +// Use this accountant if your resource can be counted with size_t type +class SizeTAccountant : public IResourceAccountant { + public: + SizeTAccountant() = default; + ~SizeTAccountant() = default; + + explicit SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} + + ResourceCount GetConsumedAmount() const noexcept override { + return consumed_amount_; + } + void 
AddConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ += std::get(amount); + } + } + void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ -= std::get<0>(amount); + } + } + + ResourceCount ComputeResourceCount(const std::string& node_name) const override { + auto hit = node_stats_.find(node_name); + if (hit != node_stats_.end()) { + const auto& stats = hit->second; + return stats.input_sizes + stats.initializers_sizes + + stats.total_dynamic_sizes + stats.total_temp_allocations; + } + return static_cast(0U); + } + + private: + size_t consumed_amount_ = 0; + InlinedHashMap node_stats_; +}; + +InlinedHashMap LoadNodeAllocationStats(const std::filesystem::path& model_path, + const std::filesystem::path& file_name) { + InlinedHashMap node_stats; + std::filesystem::path file_path = model_path; + if (file_path.has_filename()) { + file_path = file_path.parent_path(); + } + + file_path /= file_name; + + std::ifstream file(file_path); + ORT_ENFORCE(file.is_open(), "Failed to open file ", file_path); + std::string line; + // Read and load a CSV file line by line + while (std::getline(file, line)) { + auto splits = utils::SplitString(line, ",", false); + ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); + std::string node_name{splits[0]}; + size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); + size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); + size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); + size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); + node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, + total_dynamic_sizes, total_temp_allocations}); + } + return node_stats; +} } // namespace #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -92,11 +165,14 @@ static bool TryAssignNodes(Graph& graph, const IndexedSubGraph& capability, } } - for (auto node_index : capability.nodes) { - auto* node = graph.GetNode(node_index); + const bool acc_enabled = capability.IsAccountingEnabled(); + for (size_t i = 0, limit = capability.nodes.size(); i < limit; ++i) { + auto* node = graph.GetNode(capability.nodes[i]); node->SetExecutionProviderType(provider_type); + if (acc_enabled) { + capability.AccountForNode(i); + } } - return true; } @@ -113,6 +189,9 @@ static bool TryAssignSingleNode(Graph& graph, if (nullptr != node && node->GetExecutionProviderType().empty()) { // The node was not fused or assigned. Assign it to . node->SetExecutionProviderType(provider_type); + if (indexed_sub_graph.IsAccountingEnabled()) { + indexed_sub_graph.AccountForNode(0); + } return true; } @@ -131,12 +210,14 @@ struct GetCapabilityForEPParams { std::reference_wrapper transform_layout; std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + IResourceAccountant* resource_accountant; }; auto get_capabilities = [](const IExecutionProvider& ep, const GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) { - auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup); + const IExecutionProvider::IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) { + auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup, resource_accountant); // In theory an EP could return an empty capability. Remove those. 
capabilities.erase(std::remove_if(capabilities.begin(), capabilities.end(), @@ -173,7 +254,7 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l { const GraphViewer graph_viewer(graph); - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant); if (capabilities.empty()) { return Status::OK(); @@ -211,7 +292,7 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l capabilities.clear(); const GraphViewer graph_viewer(graph); - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant); // all nodes with an index >= first_new_node with domain of kMSInternalNHWCDomain should be in the capabilities InlinedHashSet new_nodes_in_capabilities; @@ -260,7 +341,7 @@ static Status GetCapabilityForEPForAotInlining(const GraphViewer& graph_viewer, logger}; // TODO: Provide EP with a capability to look inside the functions. - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, nullptr); return Status::OK(); } @@ -318,6 +399,7 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, } if (sub_graph_available_for_assignment) { + const bool acc_enabled = capability.IsAccountingEnabled(); if (mode == GraphPartitioner::Mode::kNormal) { std::ostringstream oss; oss << provider_type << "_" << capability.GetMetaDef()->name << "_" << fused_node_unique_id++; @@ -333,6 +415,13 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, } fused_node->SetExecutionProviderType(provider_type); + if (acc_enabled) { + // We account for the fused node. We operate under assumption + // that the fused node would use no more memory when the nodes we are fusing. + // and potentially less than that, and therefore, no threshold check is needed here. + // All threshold checks are done within the EP. + capability.ComputeAndAccountForNode(fused_node->Name()); + } result = fused_node; } else { @@ -340,10 +429,13 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, // This is used when exporting an ORT format model to maintain the original nodes and re-do the fusion // at runtime. The original nodes provide a fallback if fewer nodes can be fused at runtime due to device // capabilities. - for (auto node_index : capability.nodes) { - auto* node = graph.GetNode(node_index); + for (size_t i = 0, limit = capability.nodes.size(); i < limit; ++i) { + auto* node = graph.GetNode(capability.nodes[i]); if (node != nullptr) { node->SetExecutionProviderType(provider_type); + if (acc_enabled) { + capability.AccountForNode(i); + } } } } @@ -363,7 +455,7 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, int& fused_node_unique_id, const layout_transformation::TransformLayoutFunction& transform_layout_fn, const layout_transformation::DebugGraphFn& debug_graph_fn, - const logging::Logger& logger) { + const logging::Logger& logger, IResourceAccountant* resource_accountant) { // handle testing edge case where optimizers or constant lifting results in graph with no nodes. 
// doing it here saves all providers checking for this in GetCapability if (graph.NumberOfNodes() == 0) { @@ -377,7 +469,7 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, // we pass through the FuncManager from the top level graph ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(*subgraph, func_mgr, kernel_registry_mgr, fused_kernel_registry, current_ep, mode, fused_node_unique_id, - transform_layout_fn, debug_graph_fn, logger)); + transform_layout_fn, debug_graph_fn, logger, resource_accountant)); } } @@ -400,7 +492,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, std::ref(capabilities), mode, std::cref(transform_layout_fn), - std::cref(debug_graph_fn)}; + std::cref(debug_graph_fn), + resource_accountant}; ORT_RETURN_IF_ERROR(GetCapabilityForEP(get_capability_params, logger)); if (capabilities.empty()) { @@ -735,7 +828,7 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager, - const logging::Logger& logger) { + const ResourceAccountantMap& acc_map, const logging::Logger& logger) { bool modified_graph = false; auto& graph = partition_params.graph.get(); @@ -747,11 +840,16 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, do { // process full graph with each EP for (const auto& ep : execution_providers) { + IResourceAccountant* resource_accountant = nullptr; + auto hit = acc_map.find(ep->Type()); + if (hit != acc_map.end()) { + resource_accountant = hit->second.get(); + } ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(graph, func_mgr, kernel_registry_manager, fused_kernel_registry, *ep, mode, fused_node_unique_id, transform_layout_function, partition_params.debug_graph_fn, - logger)); + logger, resource_accountant)); } // expand any nodes that have an ONNX function definition but no matching ORT kernel. @@ -786,8 +884,8 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param auto& subgraph = *entry.second; PartitionParams subgraph_partition_params = partition_params; subgraph_partition_params.graph = std::ref(subgraph); - ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr, current_ep, - logger)); + ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr, + current_ep, logger)); } } @@ -803,6 +901,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param std::cref(partition_params.transform_layout_function), std::cref(partition_params.debug_graph_fn), #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + nullptr }; // clang-format on @@ -835,6 +934,9 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param Node& fused_node = graph.BeginFuseSubGraph(indexed_sub_graph, node_name); fused_node.SetExecutionProviderType(type); + if (indexed_sub_graph.IsAccountingEnabled()) { + indexed_sub_graph.ComputeAndAccountForNode(fused_node.Name()); + } // create filtered graph viewer for this set of nodes // @@ -851,6 +953,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // We will compile the fused nodes one by one, and fuse the subgraph if successful. 
for (const auto& compilation_entry : compilation_entries) { + const bool acc_enabled = compilation_entry.capability.get().sub_graph->IsAccountingEnabled(); Node& node = compilation_entry.fused_node; std::vector single_node_compute_func; ORT_RETURN_IF_ERROR(current_ep.Compile({IExecutionProvider::FusedNodeAndGraph{node, *compilation_entry.viewer}}, @@ -878,6 +981,9 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, node); + if (acc_enabled) { + compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node.Name()); + } } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -988,9 +1094,26 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + // We use this only if Resource Aware Partitioning is enabled for any of the EPs + ResourceAccountantMap ep_acc_map; + // Zero, it is disabled by default + const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( + kOrtSessionOptionsResourceCudaPartitioningSettings, ""); + if (!resource_partitioning_settings.empty()) { + auto splits = utils::SplitString(resource_partitioning_settings, ",", false); + if (splits.size() == 4) { + SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); + cuda_memory_limit *= 1024; // to bytes + auto node_to_stats = LoadNodeAllocationStats(graph.ModelPath(), splits[1]); + ep_acc_map[kCudaExecutionProvider] = std::make_unique(cuda_memory_limit, + std::move(node_to_stats)); + } + } + if (mode == Mode::kNormal || mode == Mode::kAssignOnly) { #if !defined(ORT_MINIMAL_BUILD) - ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger)); + ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, + ep_acc_map, logger)); bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc index 94b6224440ed0..1d05cb4e5e818 100644 --- a/onnxruntime/core/framework/op_kernel.cc +++ b/onnxruntime/core/framework/op_kernel.cc @@ -130,6 +130,16 @@ OrtValue* OpKernelContext::GetOrCreateOutputMLValue(int index) { return value; } +int OpKernelContext::GetOrtValueIndexForInput(int input_index) const { + int input_arg_index = GetInputArgIndex(input_index); + return execution_frame_->GetNodeIdxToMLValueIdx(input_arg_index); +} + +int OpKernelContext::GetOrtValueIndexForOutput(int output_index) const { + int output_arg_index = GetOutputArgIndex(output_index); + return execution_frame_->GetNodeIdxToMLValueIdx(output_arg_index); +} + int OpKernelContext::GetInputArgIndex(int index) const { return node_input_start_index_ + index; } diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index 64bd70465a1c7..c970243ba461e 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -36,6 +36,15 @@ class OpKernelContextInternal : public OpKernelContext { implicit_inputs[i]->Name(), " does not."); 
implicit_input_values_.push_back(entry); } + +#if !defined(ORT_MINIMAL_BUILD) + if (session_state_.GetNodeStatsRecorder() != nullptr) { + auto alloc = OpKernelContext::GetAllocator(kernel.GetDevice(OrtMemTypeDefault)); + if (alloc != nullptr) { + accounting_allocator_ = std::make_shared(std::move(alloc)); + } + } +#endif } bool GetUseDeterministicCompute() const override { @@ -69,9 +78,67 @@ class OpKernelContextInternal : public OpKernelContext { return implicit_input_values_; } + int GetOrtValueIndexForInput(int input_index) const override { + return OpKernelContext::GetOrtValueIndexForInput(input_index); + } + + int GetOrtValueIndexForOutput(int output_index) const override { + return OpKernelContext::GetOrtValueIndexForOutput(output_index); + } + +#if !defined(ORT_MINIMAL_BUILD) + Status GetTempSpaceAllocator(AllocatorPtr* output) const override { + if (accounting_allocator_) { + *output = accounting_allocator_; + return Status::OK(); + } + return OpKernelContext::GetTempSpaceAllocator(output); + } +#endif + +#if !defined(ORT_MINIMAL_BUILD) + bool GetAllocatorStats(AllocatorStats& stats) { + if (accounting_allocator_ == nullptr) { + return false; + } + accounting_allocator_->GetStats(&stats); + return true; + } +#endif + const bool& GetTerminateFlag() const noexcept { return terminate_flag_; } private: +#if !defined(ORT_MINIMAL_BUILD) + class AccountingAllocator : public IAllocator { + public: + AccountingAllocator(AllocatorPtr alloc) : IAllocator(alloc->Info()), allocator_(std::move(alloc)) { + } + + void* Alloc(size_t size) override { + void* p = allocator_->Alloc(size); + if (p != nullptr) { + stats_.total_allocated_bytes += size; + } + return p; + } + + void Free(void* p) override { + allocator_->Free(p); + } + + void GetStats(AllocatorStats* stats) override { + *stats = stats_; + } + + private: + AllocatorPtr allocator_; + AllocatorStats stats_; + }; + + AllocatorPtr accounting_allocator_; +#endif + const SessionState& session_state_; const bool& terminate_flag_; std::vector implicit_input_values_; diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc new file mode 100644 index 0000000000000..5c2d4feaaf126 --- /dev/null +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/framework/resource_accountant.h" +#include "core/common/inlined_containers.h" + +#include + +namespace onnxruntime { + +struct NodeStatsRecorder::Impl { + std::filesystem::path node_stats_path_; + // This is a node name to allocation stats map + InlinedHashMap node_stats_; + mutable std::mutex mut_; +}; + +NodeStatsRecorder::NodeStatsRecorder(const std::filesystem::path& node_stats_path) + : impl_(std::make_unique()) { + impl_->node_stats_path_ = node_stats_path; +} + +NodeStatsRecorder::~NodeStatsRecorder() = default; + +const std::filesystem::path& NodeStatsRecorder::GetNodeStatsFileName() const noexcept { + return impl_->node_stats_path_; +} + +void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats) { + std::lock_guard lock(impl_->mut_); + auto result = impl_->node_stats_.emplace(node_name, stats); + if (!result.second) { + // Node already exists, update the stats + result.first->second.UpdateIfGreater(stats); + } +} + +void NodeStatsRecorder::DumpStats(std::ostream& os) const { + std::lock_guard lock(impl_->mut_); + for (const auto& [name, stats] : impl_->node_stats_) { + os << name << "," << stats.input_sizes << "," << stats.initializers_sizes << "," + << stats.total_dynamic_sizes << "," + << stats.total_temp_allocations << "\n"; + } +} + +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 61fd9b08655b7..8a7564c7d4236 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -11,6 +11,7 @@ #include "core/common/logging/logging.h" #include "core/framework/allocation_planner.h" #include "core/framework/execution_frame.h" +#include "core/framework/resource_accountant.h" #include "core/framework/stream_execution_context.h" #include "core/framework/session_state.h" #include "core/framework/op_kernel_context_internal.h" @@ -104,7 +105,7 @@ static void CalculateTotalInputSizes(const OpKernelContextInternal* op_kernel_co const int input_count = op_kernel_context->InputCount(); for (auto i = 0; i < input_count; i++) { const OrtValue* p_input = op_kernel_context->GetInputMLValue(i); - if (p_input != nullptr && p_input->IsTensor() && p_input->IsAllocated()) { + if (p_input != nullptr && p_input->IsAllocated() && p_input->IsTensor()) { const OpKernelInfo& op_kernel_info = p_op_kernel->Info(); const Tensor* p_tensor = nullptr; bool is_param = op_kernel_info.TryGetConstantInput(i, &p_tensor); @@ -256,6 +257,8 @@ class SessionScope { TimePoint session_start_; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) const ExecutionFrame& frame_; +#endif +#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Whether memory profiler need create events and flush to file. // For partial graph run, when the last subgraph of the whole graph is executing, we need flush to file. bool flush_memory_info_ = true; @@ -487,6 +490,61 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, } #else status = p_kernel->Compute(&kernel_ctx); + +#if !defined(ORT_MINIMAL_BUILD) + auto* node_stats_recorder = ctx.GetSessionState().GetNodeStatsRecorder(); + if (node_stats_recorder != nullptr) { + // Lets first check if any inputs are initializers, + // if so we need to account for their memory usage. 
+ const auto& const_initializers = ctx.GetSessionState().GetConstantInitializedTensors(); + SafeInt initializers_size = 0; + SafeInt input_sizes = 0; + for (int i = 0, lim = kernel_ctx.InputCount(); i < lim; ++i) { + // Need to get ort_value_index for each input. + int ort_vaue_index = kernel_ctx.GetOrtValueIndexForInput(i); + auto hit = const_initializers.find(ort_vaue_index); + if (hit != const_initializers.end()) { + const auto& ort_value = hit->second; + initializers_size += ort_value.Get().SizeInBytes(); + } else { + // If the input is not an initializer, we account it as something that had to be + // on the same device with this kernel + const OrtValue* ort_value = kernel_ctx.GetInputMLValue(i); + if (ort_value != nullptr && ort_value->IsAllocated() && ort_value->IsTensor()) { + input_sizes += ort_value->Get().SizeInBytes(); + } + } + } + + // XXX: Should we account for implicit inputs? + + // Get outputs and see if any were allocated dynamically + SafeInt total_dynamic_sizes = 0; + const auto& exec_frame = ctx.GetExecutionFrame(); + for (int i = 0, lim = kernel_ctx.OutputCount(); i < lim; ++i) { + int ort_vaue_index = kernel_ctx.GetOrtValueIndexForOutput(i); + auto maybe_val = exec_frame.GetOrtValueDynamicAllocation(ort_vaue_index); + if (maybe_val.has_value()) { + total_dynamic_sizes += *maybe_val; + } + } + + NodeAllocationStats node_stats; + node_stats.input_sizes = static_cast(input_sizes); + node_stats.initializers_sizes = static_cast(initializers_size); + node_stats.total_dynamic_sizes = total_dynamic_sizes; + + // Get the temporary allocations + AllocatorStats temp_stats; + if (kernel_ctx.GetAllocatorStats(temp_stats)) { + node_stats.total_temp_allocations = narrow(temp_stats.total_allocated_bytes); + } + + // Record node allocation stats + const auto& node = p_kernel->Node(); + node_stats_recorder->ReportNodeStats(node.Name(), node_stats); + } +#endif #endif } ORT_CATCH(const std::exception& ex) { @@ -510,6 +568,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, LOGS(logger, ERROR) << msg_string; return Status(status.Category(), status.Code(), msg_string); } + ctx.RecycleNodeInputs(idx); VLOGS(logger, 0) << "stream " << stream_idx << " launch kernel with idx " << idx; return Status::OK(); diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index 82f520f4a4252..964c059e529f9 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -375,6 +375,24 @@ class SessionState { /// true of false bool GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format); +#if !defined(ORT_MINIMAL_BUILD) + + void SetNodeStatsRecorder(NodeStatsRecorder* node_stats_recorder) { + node_stats_recorder_ = node_stats_recorder; + } + + /** + * Returns a pointer to the NodeStatsRecorder object if it was enabled for the session. + * The object pointer is only present at the root SessionState object + */ + NodeStatsRecorder* GetNodeStatsRecorder() const { + if (parent_ != nullptr) { + return parent_->GetNodeStatsRecorder(); + } + return node_stats_recorder_; + } +#endif + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState); @@ -502,6 +520,10 @@ class SessionState { MemoryProfiler* memory_profiler_; #endif +#if !defined(ORT_MINIMAL_BUILD) + NodeStatsRecorder* node_stats_recorder_ = nullptr; +#endif + // switch for enable memory pattern optimization or not. 
bool enable_mem_pattern_; diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 097ce436f4419..17c37b8882168 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -844,18 +844,9 @@ INSTANTIATE_UNPACK_TENSOR(UInt4x2) break; template -common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out) { - const auto& dims = tensor_proto.dims(); - size_t size = 1; - for (google::protobuf::int64 dim : dims) { - if (dim < 0 || static_cast(dim) >= std::numeric_limits::max()) { - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid TensorProto"); - } - if (!IAllocator::CalcMemSizeForArray(size, static_cast(dim), &size)) { - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid TensorProto"); - } - } - switch (tensor_proto.data_type()) { +common::Status GetSizeInBytesFromTensorShapeAndType(const TensorShape& shape, int32_t element_type, size_t* out) { + const auto size = narrow(shape.Size()); + switch (element_type) { CASE_PROTO_TRACE(FLOAT, float); CASE_PROTO_TRACE(DOUBLE, double); CASE_PROTO_TRACE(BOOL, bool); @@ -884,24 +875,61 @@ common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& return Status::OK(); } +template +common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out) { + TensorShape tensor_shape = GetTensorShapeFromTensorProto(tensor_proto); + + bool any_out_of_bounds = std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), + [](int64_t dim) { + if (dim < 0 || + static_cast(dim) >= std::numeric_limits::max()) { + return true; + } + return false; + }); + + ORT_RETURN_IF(any_out_of_bounds, "Out of bounds dimensions in TypeProto_Tensor"); + + return GetSizeInBytesFromTensorShapeAndType(tensor_shape, tensor_proto.data_type(), out); +} + +template +common::Status GetSizeInBytesFromTensorTypeProto(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_proto, size_t* out) { + ORT_RETURN_IF_NOT(HasShape(tensor_proto), "TypeProto_Tensor does not have shape"); + ORT_RETURN_IF_NOT(HasElemType(tensor_proto), "TypeProto_Tensor does not have element type"); + + TensorShape tensor_shape = GetTensorShapeFromTensorShapeProto(tensor_proto.shape()); + + bool any_out_of_bounds = std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), + [](int64_t dim) { + return dim < 0 || + static_cast(dim) >= std::numeric_limits::max(); + }); + ORT_RETURN_IF(any_out_of_bounds, "Out of bounds dimensions in TypeProto_Tensor"); + + return GetSizeInBytesFromTensorShapeAndType(tensor_shape, tensor_proto.elem_type(), out); +} + +template Status GetSizeInBytesFromTensorTypeProto<0>(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_proto, size_t* out); + TensorShape GetTensorShapeFromTensorShapeProto(const ONNX_NAMESPACE::TensorShapeProto& tensor_shape_proto) { const auto& dims = tensor_shape_proto.dim(); - std::vector tensor_shape_vec(static_cast(dims.size())); + TensorShapeVector tensor_shape_vec(static_cast(dims.size())); for (int i = 0; i < dims.size(); ++i) { tensor_shape_vec[i] = HasDimValue(dims[i]) ? 
dims[i].dim_value() : -1; /* symbolic dimensions are represented as -1 in onnxruntime*/ } - return TensorShape(std::move(tensor_shape_vec)); + return TensorShape(tensor_shape_vec); } TensorShape GetTensorShapeFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto) { const auto& dims = tensor_proto.dims(); - std::vector tensor_shape_vec(static_cast(dims.size())); + TensorShapeVector tensor_shape_vec(static_cast(dims.size())); for (int i = 0; i < dims.size(); ++i) { tensor_shape_vec[i] = dims[i]; } - return TensorShape(std::move(tensor_shape_vec)); + return TensorShape(tensor_shape_vec); } struct UnInitializeParam { diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 7b9a47842388c..f5dec7ae988f2 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -157,6 +157,9 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto template common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out); +template +Status GetSizeInBytesFromTensorTypeProto(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_proto, size_t* out); + /** Special marker used to indicate an existing memory buffer contains the TensorProto external data. If the 'location' field of the external data info is set to this marker, the 'offset' field should contain the diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 7ee794ccbd2e8..6949eec7f6347 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -5522,6 +5522,42 @@ Graph::Graph(const Model& owning_model, is_loaded_from_model_file_(true) { // true as the Graph isn't manually constructed from scratch } +size_t Graph::ComputeNodeMemoryUsage(NodeIndex node_idx) const { + /// XXX: In some cases some kernels can copy its attributes to a device + // those are edge cases which we currently do not account for. 
+ const Node* node = GetNode(node_idx); + if (node != nullptr) { + SafeInt result = 0; + for (const auto* input : node->InputDefs()) { + if (input->Exists()) { + // Let's see if this is an initializer + constexpr const bool check_outer_scope_true = true; + const ONNX_NAMESPACE::TensorProto* initializer = + GetConstantInitializer(input->Name(), check_outer_scope_true); + if (initializer != nullptr) { + size_t out; + if (utils::GetSizeInBytesFromTensorProto<0>(*initializer, &out).IsOK()) { + result += out; + } + } else { + const auto* proto = input->TypeAsProto(); + if (proto != nullptr && utils::HasTensorType(*proto)) { + const auto& tensor_type = proto->tensor_type(); + if (utils::HasElemType(tensor_type) && utils::HasShape(tensor_type)) { + size_t size; + if (utils::GetSizeInBytesFromTensorTypeProto<0>(tensor_type, &size).IsOK()) { + result += size; + } + } + } + } + } + } + return static_cast(result); + } + return 0; +} + common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, const OrtFormatLoadOptions& load_options) { // We deserialize the graph from ORT format in the following order: diff --git a/onnxruntime/core/providers/acl/acl_execution_provider.cc b/onnxruntime/core/providers/acl/acl_execution_provider.cc index 8d34e36fe7cd6..ede476ff74d1b 100644 --- a/onnxruntime/core/providers/acl/acl_execution_provider.cc +++ b/onnxruntime/core/providers/acl/acl_execution_provider.cc @@ -152,7 +152,8 @@ std::shared_ptr ACLExecutionProvider::GetKernelRegistry() const std::vector> ACLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant*) const { std::vector> result; for (const auto& node : graph.Nodes()) { if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node); diff --git a/onnxruntime/core/providers/acl/acl_execution_provider.h b/onnxruntime/core/providers/acl/acl_execution_provider.h index 1c267d8713673..d635e56add30b 100755 --- a/onnxruntime/core/providers/acl/acl_execution_provider.h +++ b/onnxruntime/core/providers/acl/acl_execution_provider.h @@ -38,7 +38,8 @@ class ACLExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const override; Status OnRunStart(const onnxruntime::RunOptions&) override; diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc index f954baf3eabae..07e83933a890c 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -1253,7 +1253,8 @@ GetSubGraphPartition(const std::vector& topological_order, const std: std::vector> CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant*) const { std::vector> result; // TODO(FFFrog): Feature Enhancement diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h index 7debfa72778fd..5ff935463a1c1 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.h +++ b/onnxruntime/core/providers/cann/cann_execution_provider.h @@ -55,7 +55,8 @@ class CANNExecutionProvider : public IExecutionProvider { 
std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const override; Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index b6bb4f2c1d66a..3fa3868267c9b 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -38,7 +38,8 @@ CoreMLExecutionProvider::~CoreMLExecutionProvider() {} std::vector> CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; const auto& logger = *GetLogger(); diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.h b/onnxruntime/core/providers/coreml/coreml_execution_provider.h index 650d81a4fecf7..0609bf6af726d 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.h +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.h @@ -19,7 +19,8 @@ class CoreMLExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* resource_accountant) const override; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) common::Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d4013a7dc3d57..2fb1bc35630fa 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -5,6 +5,7 @@ #include "core/common/inlined_containers.h" #include "core/common/parse_string.h" #include "core/framework/int4.h" +#include "core/framework/resource_accountant.h" #include "core/providers/shared_library/provider_api.h" #include "core/platform/env_var_utils.h" #include "core/providers/cuda/cuda_execution_provider.h" @@ -2626,11 +2627,43 @@ std::unique_ptr CUDAExecutionProvider::GetDataTransf std::vector> CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const { + std::vector> result; + const logging::Logger& logger = *GetLogger(); + + // Figure out the memory limit if accountant is available + size_t memory_threshold = std::numeric_limits::max(); + SafeInt consumed_memory = 0; + if (resource_accountant != nullptr) { + if (resource_accountant->IsStopIssued()) { + LOGS(logger, WARNING) << "CUDA_EP returning due to Stop Set"; + return result; + } + + auto threshold = resource_accountant->GetThreshold(); + if (!threshold.has_value()) { + // info_.gpu_mem_limit is for BFC arena + size_t free_memory, total_memory; + if (0 != cudaMemGetInfo(&free_memory, &total_memory)) { + memory_threshold = info_.gpu_mem_limit; + } else { + memory_threshold = std::min(free_memory, info_.gpu_mem_limit); + } + } else { + memory_threshold = std::get<0>(threshold.value()); + } + + consumed_memory = 
std::get<0>(resource_accountant->GetConsumedAmount()); + } + + InlinedHashSet previously_assigned_nodes; + // On repeated calls to this function, we may have most of the nodes already + // assigned to a CUDA EP capability. We'll skip accounting for these nodes. + previously_assigned_nodes.reserve(graph.NumberOfNodes()); InlinedVector candidates; // A subset of the above vector. A subset of the tentative_nodes might be moved to CPU. InlinedVector tentative_nodes; - const logging::Logger& logger = *GetLogger(); for (auto& node_index : graph.GetNodesInTopologicalOrder()) { const auto* p_node = graph.GetNode(node_index); if (p_node == nullptr) @@ -2640,6 +2673,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, if (!node.GetExecutionProviderType().empty()) { if (node.GetExecutionProviderType() == kCudaExecutionProvider) { candidates.push_back(node.Index()); + previously_assigned_nodes.insert(node.Index()); } continue; } @@ -2694,14 +2728,40 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, // These are usually shape related computation subgraphs // Following logic can be extended for other EPs auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, logger); - std::vector> result; for (auto& node_index : candidates) { if (cpu_nodes.count(node_index) > 0) continue; - auto sub_graph = IndexedSubGraph::Create(); - sub_graph->Nodes().push_back(node_index); - result.push_back(ComputeCapability::Create(std::move(sub_graph))); + // Previously assigned nodes have been accounted before + if (previously_assigned_nodes.count(node_index) > 0 || resource_accountant == nullptr) { + auto sub_graph = IndexedSubGraph::Create(); + sub_graph->Nodes().push_back(node_index); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } else { + auto* node = graph.GetNode(node_index); + auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(node->Name())); + const auto would_be_consumed = resource_count + consumed_memory; + LOGS(logger, INFO) << "CUDA_EP Node: " << node_index << " Memory usage : " << resource_count + << " would be consumed " << static_cast(would_be_consumed) + << " threshold: " << memory_threshold; + if (would_be_consumed < memory_threshold) { + consumed_memory = would_be_consumed; + auto sub_graph = IndexedSubGraph::Create(); + sub_graph->SetAccountant(resource_accountant); + sub_graph->Nodes().push_back(node_index); + sub_graph->AppendNodeCost(resource_count); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } else { + // We break here so we do not have patches of CUDA assigned nodes. 
+ auto* node = graph.GetNode(node_index); + if (node != nullptr) { + LOGS(logger, WARNING) << "CUDA_EP Halting assignment due to capacity threshold at node: " + << node->Name() << " index: " << node_index; + } + resource_accountant->SetStopAssignment(); + break; + } + } } /* std::vector> result; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index bd2be2eac2181..79a48e7cb89e1 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -72,7 +72,8 @@ class CUDAExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const override; int GetDeviceId() const override { return info_.device_id; } const cudaDeviceProp& GetDeviceProp() const { return device_prop_; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 826f48b5f7a68..dd868ddd8307a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -92,12 +92,13 @@ namespace Dml std::vector> ExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup) const + const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + onnxruntime::IResourceAccountant* resource_accountant) const { #ifdef ENABLE_GRAPH_COMPILATION - return m_impl->GetCapability(graph, kernel_lookup, *GetLogger()); + return m_impl->GetCapability(graph, kernel_lookup, resource_accountant, *GetLogger()); #else - return onnxruntime::IExecutionProvider::GetCapability(graph, kernel_lookup); + return onnxruntime::IExecutionProvider::GetCapability(graph, kernel_lookup, resource_accountant); #endif } @@ -877,8 +878,7 @@ namespace Dml ExecutionProviderImpl::GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger) const - { + const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant*) const { uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask(); // Each bit corresponds to each DML_TENSOR_DATA_TYPE. 
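Editor's note on the CUDA changes above: they implement a greedy, threshold-bounded assignment. Nodes already placed on the CUDA EP in an earlier GetCapability call are accepted as-is; new nodes are accepted only while the running total of their estimated memory cost stays below the threshold (the accountant's configured limit, or min(free GPU memory, gpu_mem_limit) when none is set), and the first node that would overflow halts further assignment so the EP does not end up with scattered patches of CUDA nodes. A minimal, self-contained sketch of that loop follows; it is illustrative only, with simplified types, and it elides SafeInt, ComputeCapability creation, logging, and the SetStopAssignment() call made on the accountant.

#include <cstddef>
#include <utility>
#include <vector>

using NodeIndex = std::size_t;

// Accepts nodes in order, accumulating their estimated cost, and stops at the
// first node that would push consumption past the threshold, so the accepted
// nodes form one contiguous prefix rather than isolated islands.
std::vector<NodeIndex> AssignUntilThreshold(
    const std::vector<std::pair<NodeIndex, std::size_t>>& node_costs,
    std::size_t already_consumed,
    std::size_t memory_threshold) {
  std::vector<NodeIndex> assigned;
  std::size_t consumed = already_consumed;
  for (const auto& [index, cost] : node_costs) {
    if (consumed + cost >= memory_threshold) {
      break;  // the real code also flags the accountant to stop assignment here
    }
    consumed += cost;
    assigned.push_back(index);
  }
  return assigned;
}

For example, with per-node costs of {5, 7, 9} MB, nothing consumed yet, and a 15 MB threshold, only the first two nodes are assigned.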
std::vector> result; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index e7d859c5764de..3002177db13f4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -11,6 +11,10 @@ #include #include +namespace onnxruntime { +class IResourceAccountant; +} + namespace WRL { template using Base = Microsoft::WRL::RuntimeClass< @@ -89,8 +93,7 @@ namespace Dml GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger - ) const; + const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant* resource_accountant) const; uint32_t GetSupportedDeviceDataTypeMask() const; @@ -283,7 +286,8 @@ namespace Dml std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup) const final override; + const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + onnxruntime::IResourceAccountant* resource_accountant) const final override; onnxruntime::common::Status OnSessionInitializationEnd() override { diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc index c96f9cc1ff400..4da82b351f1d6 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc @@ -146,7 +146,8 @@ std::vector> DnnlExecutionProvider::GetSupportedNodes(con std::vector> DnnlExecutionProvider::GetCapability( const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { // follow from coreml ep's Getcapability std::vector> result; diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h index b7fcbb7765180..bde18e139f2a3 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h @@ -24,7 +24,8 @@ class DnnlExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + onnxruntime::IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index d2d1d5e6fdd03..5a753d1ccf79a 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -790,7 +790,8 @@ std::vector JsExecutionProvider::CreatePreferredAllocators() { std::vector> JsExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // `tenative_candidates` is a subset of `candidates`. 
InlinedVector tenative_candidates; diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index 966f9c6980212..4bead50fc782e 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -44,7 +44,8 @@ class JsExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; std::unique_ptr GetDataTransfer() const override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 95fbe7ab58ce2..1558d22137c05 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -992,7 +992,8 @@ GetPartitionedSubgraphs(const std::vector& topological_order, std::vector> MIGraphXExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; auto model = graph_viewer.CreateModel(*GetLogger()); auto model_proto = model->ToProto(); diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 91b6a4741b55e..d6af991f9b77e 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -68,7 +68,8 @@ class MIGraphXExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc index f92c9592742d5..27bd584e2d3c6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc @@ -80,9 +80,10 @@ NnapiExecutionProvider::~NnapiExecutionProvider() {} std::vector> NnapiExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { - const auto& logger = *GetLogger(); + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; + const logging::Logger& logger = *GetLogger(); // TODO: Task 812756: NNAPI EP, add support for subgraph (If and Loop operators) if (graph_viewer.IsSubgraph()) { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h index 460616c41991f..ebf9372eb668d 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h @@ -25,7 +25,8 
@@ class NnapiExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) common::Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 72a188108adef..0cda59ef4eb19 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -82,7 +82,8 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; std::string openvino_sdk_version = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." + diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d5c22a4e2a9e4..1d7d3db95bb1d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -183,7 +183,8 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index e9d6884b8c8ca..fd3ab8622dc76 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -670,7 +670,8 @@ static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer, std::vector> QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; if (graph_viewer.IsSubgraph()) { diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 317b34e66a6e4..c717bafa41398 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -37,7 +37,8 @@ class QNNExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc index 44b34f4b4ce6c..10fd81786f977 100644 --- 
a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc @@ -50,7 +50,8 @@ std::vector> RknpuExecutionProvider::GetSupportedNodes( std::vector> RknpuExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { // Find inputs, initializers and outputs for each supported subgraph std::vector> result; diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h index 1289c8569f8e8..ce16d63e111d9 100644 --- a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h @@ -19,7 +19,8 @@ class RknpuExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 0a427b146dcaa..9d6e9df907ce3 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -2440,7 +2440,8 @@ std::unique_ptr ROCMExecutionProvider::GetDataTransf std::vector> ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // A subset of the above vector. A subset of the tentative_nodes might be moved to CPU. 
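Editor's note: the remaining provider files in this patch change in the same mechanical way; every GetCapability override gains the IResourceAccountant* parameter and, apart from the CUDA EP, simply ignores it. A hedged sketch of such an adapted override is below. MyExecutionProvider is a made-up name rather than a provider in the tree, and the base-class constructor argument is illustrative.

#include "core/framework/compute_capability.h"
#include "core/framework/execution_provider.h"

namespace onnxruntime {

// Illustrative provider: accepts the new parameter but performs no accounting.
class MyExecutionProvider : public IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider{"MyExecutionProvider"} {}

  std::vector<std::unique_ptr<ComputeCapability>>
  GetCapability(const GraphViewer& /*graph_viewer*/,
                const IKernelLookup& /*kernel_lookup*/,
                IResourceAccountant* /*resource_accountant*/) const override {
    // Existing partitioning logic would go here; only capacity-aware EPs
    // consult the accountant.
    return {};
  }
};

}  // namespace onnxruntime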
InlinedVector tentative_nodes; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index be467869248ea..ff2bff7c98723 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -61,7 +61,8 @@ class ROCMExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const override { return info_.device_id; } const hipDeviceProp_t& GetDeviceProp() const { return device_prop_; }; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8c367d25d51..e9f8c061d9ef3 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -331,8 +331,9 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz } std::vector> IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const { - return g_host->IExecutionProvider__GetCapability(this, graph_viewer, kernel_lookup); + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const { + return g_host->IExecutionProvider__GetCapability(this, graph_viewer, kernel_lookup, resource_accountant); } common::Status IExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) { diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5a179ec622f8c..49c514e121178 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -11,6 +11,7 @@ #include "core/providers/shared_library/provider_host_api.h" #include "core/common/inlined_containers_fwd.h" +#include "core/framework/resource_accountant.h" #include "core/providers/shared/common.h" #define PROVIDER_DISALLOW_ALL(TypeName) \ @@ -246,7 +247,8 @@ struct ProviderHost { // IExecutionProvider virtual std::vector> IExecutionProvider__GetCapability(const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) = 0; + const IExecutionProvider::IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) = 0; virtual common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) = 0; @@ -628,6 +630,7 @@ struct ProviderHost { virtual std::unique_ptr IndexedSubGraph__construct() = 0; virtual void IndexedSubGraph__operator_delete(IndexedSubGraph* p) = 0; + virtual const std::vector& IndexedSubGraph__Nodes(const IndexedSubGraph* p) = 0; virtual std::vector& IndexedSubGraph__Nodes(IndexedSubGraph* p) = 0; virtual void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) = 0; @@ -635,6 +638,9 @@ struct ProviderHost { virtual void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) = 0; virtual IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) = 0; + 
virtual void IndexedSubGraph__SetAccountant(IndexedSubGraph* p, IResourceAccountant*) = 0; + virtual void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& count) = 0; + virtual void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) = 0; // KernelDef virtual void KernelDef__operator_delete(KernelDef* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 76b6d8063fd66..bf75507b3c6b3 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -553,6 +553,7 @@ struct IndexedSubGraph final { static std::unique_ptr Create() { return g_host->IndexedSubGraph__construct(); } static void operator delete(void* p) { g_host->IndexedSubGraph__operator_delete(reinterpret_cast(p)); } + gsl::span Nodes() const { return g_host->IndexedSubGraph__Nodes(this); } std::vector& Nodes() { return g_host->IndexedSubGraph__Nodes(this); } void SetMetaDef(std::unique_ptr&& meta_def_) { return g_host->IndexedSubGraph__SetMetaDef(this, std::move(*reinterpret_cast*>(&meta_def_))); } @@ -560,6 +561,15 @@ struct IndexedSubGraph final { void SetSchemaSource(IndexedSubGraph_SourceOfSchema schema_source) { return g_host->IndexedSubGraph__SetSchemaSource(this, schema_source); } IndexedSubGraph_SourceOfSchema GetSchemaSource() const { return g_host->IndexedSubGraph__GetSchemaSource(this); } + void SetAccountant(IResourceAccountant* resource_accountant) { + g_host->IndexedSubGraph__SetAccountant(this, resource_accountant); + } + void AppendNodeCost(const ResourceCount& resource_count) { + g_host->IndexedSubGraph__AppendNodeCost(this, resource_count); + } + void AppendNodeEmptyCost() { + g_host->IndexedSubGraph__AppendNodeEmptyCost(this); + } IndexedSubGraph() = delete; IndexedSubGraph(const IndexedSubGraph&) = delete; diff --git a/onnxruntime/core/providers/snpe/snpe_execution_provider.cc b/onnxruntime/core/providers/snpe/snpe_execution_provider.cc index fb9ce580ea2dc..c7fc6d3a556a7 100644 --- a/onnxruntime/core/providers/snpe/snpe_execution_provider.cc +++ b/onnxruntime/core/providers/snpe/snpe_execution_provider.cc @@ -71,7 +71,8 @@ SNPEExecutionProvider::~SNPEExecutionProvider() {} std::vector> SNPEExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { std::vector candidates; for (auto& node_index : graph.GetNodesInTopologicalOrder()) { const auto* p_node = graph.GetNode(node_index); diff --git a/onnxruntime/core/providers/snpe/snpe_execution_provider.h b/onnxruntime/core/providers/snpe/snpe_execution_provider.h index c0a62eea11a25..99033649fcbbf 100644 --- a/onnxruntime/core/providers/snpe/snpe_execution_provider.h +++ b/onnxruntime/core/providers/snpe/snpe_execution_provider.h @@ -18,7 +18,8 @@ class SNPEExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; std::unordered_map GetRuntimeOptions() const { return runtime_options_; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc 
index c583598bbcc52..0ee5cef7cbaa1 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2451,7 +2451,8 @@ bool TensorrtExecutionProvider::DetectTensorRTGraphCycles(SubGraphCollection_t& std::vector> TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { // Construct subgraph capability from node list std::vector> result; // Get ModelPath diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index d3e0b0fba8891..92fdcbd3d950c 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -247,7 +247,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const { return device_id_; } diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 3a99f56bb732a..5d2204b0b1979 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -51,7 +51,7 @@ const InlinedVector VitisAIExecutionProvider::GetEpContextNodes() c return ep_context_node_ptrs; } std::vector> VitisAIExecutionProvider::GetCapability( - const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup) const { + const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, IResourceAccountant* /* resource_accountant */) const { if (graph_viewer.IsSubgraph()) { // VITIS AI EP not support sungraph. Assigned to CPU. 
return {}; diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index f0d1a289a2a73..5b031ab882839 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -28,7 +28,8 @@ class VitisAIExecutionProvider : public IExecutionProvider { ~VitisAIExecutionProvider() = default; std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const { return 0; } common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) override; diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc index 7da7cc6cb63ba..4b9f6fae86423 100644 --- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc +++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc @@ -61,8 +61,8 @@ VSINPUExecutionProvider::~VSINPUExecutionProvider() {} std::vector> VSINPUExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { - const auto& logger = *GetLogger(); + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; if (graph_viewer.IsSubgraph()) { diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h index c2605eb65faee..16cfbc8a9c581 100644 --- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h +++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h @@ -39,7 +39,8 @@ class VSINPUExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index dec7e48786bf5..7909084e7177a 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -760,7 +760,8 @@ std::vector WebGpuExecutionProvider::CreatePreferredAllocators() { std::vector> WebGpuExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // `tenative_candidates` is a subset of `candidates`. 
InlinedVector tenative_candidates; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index ad81924e06901..5df276fa2d8a0 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -42,7 +42,8 @@ class WebGpuExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; std::unique_ptr GetDataTransfer() const override; diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 00fbb26b731f8..df95b653bd863 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -55,7 +55,8 @@ WebNNExecutionProvider::~WebNNExecutionProvider() {} std::vector> WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_registries*/) const { + const IKernelLookup& /*kernel_registries*/, + IResourceAccountant* /* resource_accountant */) const { // For subgraph which is the attribute of the control flow nodes, part of its initializers are stored in its // ancestor graphs as common initializers shared for other subgraphs. We need to collect all of them used for // identifying the required initializer names and storing into 'meta_def->constant_initializers'. diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.h b/onnxruntime/core/providers/webnn/webnn_execution_provider.h index 26c5e476bcc4f..e806dc340d53e 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.h +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.h @@ -24,7 +24,8 @@ class WebNNExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_registries*/) const override; + const IKernelLookup& /*kernel_registries*/, + IResourceAccountant* /* resource_accountant */) const override; DataLayout GetPreferredLayout() const override { return preferred_layout_; } diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index ee4e7be0f1f49..641f8b0729d0a 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -257,7 +257,8 @@ static void AddComputeCapabilityForEachNodeInNodeUnit( std::vector> XnnpackExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { const auto& logger = *GetLogger(); std::vector> capabilities; diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h index 395dc2f90070e..152bef1a1c52c 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h @@ -32,7 +32,8 @@ class XnnpackExecutionProvider : public IExecutionProvider { 
std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 223eed248800e..f3ca991c0e1e1 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1594,6 +1594,17 @@ common::Status InferenceSession::AddPrePackedWeightsContainer(PrepackedWeightsCo return Status::OK(); } +#if !defined(ORT_MINIMAL_BUILD) +Status onnxruntime::InferenceSession::CreateNodeStatsRecorder(const std::filesystem::path& node_stats_file) { + if (node_stats_recorder_.has_value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "The session already has an instance of NodeStatsRecorder"); + } + node_stats_recorder_.emplace(node_stats_file); + return Status::OK(); +} +#endif + namespace { Status PartitionOrtFormatModel(onnxruntime::Graph& graph, const ExecutionProviders& providers, @@ -1795,6 +1806,17 @@ common::Status InferenceSession::Initialize() { } } +#if !defined(ORT_MINIMAL_BUILD) + const std::string node_stats_file = session_options_.config_options.GetConfigOrDefault( + kOrtSessionOptionsCollectNodeMemoryStatsToFile, ""); + + if (!node_stats_file.empty()) { + ORT_RETURN_IF_ERROR_SESSIONID_(CreateNodeStatsRecorder(node_stats_file)); + } + + session_state_->SetNodeStatsRecorder(GetNodeStatsRecorder()); +#endif + #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Don't want to pollute SessionState constructor since memory profile is enabled optionally. session_state_->SetMemoryProfiler(&memory_profiler_); @@ -2726,6 +2748,22 @@ Status InferenceSession::Run(const RunOptions& run_options, TraceLoggingWriteStop(ortrun_activity, "OrtRun"); #endif +#if !defined(ORT_MINIMAL_BUILD) + if (GetNodeStatsRecorder() != nullptr && retval.IsOK()) { + // Dump node stats if the run was successful + const auto* node_stats_recorder = GetNodeStatsRecorder(); + auto node_stats_file = session_state_->GetGraphViewer().ModelPath(); + if (node_stats_file.has_filename()) { + node_stats_file = node_stats_file.parent_path(); + } + node_stats_file /= node_stats_recorder->GetNodeStatsFileName(); + std::ofstream ofs(node_stats_file, std::ofstream::out); + ORT_ENFORCE(ofs.is_open(), "Failed to open file: ", node_stats_file); + node_stats_recorder->DumpStats(ofs); + ofs.close(); + } +#endif + // As N+1 inference runs (N for memory allocation and 1 for graph capturing) // are needed before replaying the captured graph, here run N inference runs recursively until graph captured, // so that users just need one session run to capture the graph. 
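Editor's note on the session changes above: the recorder is driven entirely by configuration. Initialize() reads kOrtSessionOptionsCollectNodeMemoryStatsToFile, creates the NodeStatsRecorder, and a successful Run() dumps the collected stats into the model's directory; the feature is compiled only in non-minimal builds. A hedged usage sketch through the public C++ API follows, assuming the new key behaves like the other entries in onnxruntime_session_options_config_keys.h; the model path and output file name are placeholders.

// Enable node memory statistics collection for a session (illustrative sketch).
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "node_stats_demo"};
  Ort::SessionOptions so;
  // The stats file is created next to the model after a successful Run().
  so.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, "model_node_stats.txt");

  Ort::Session session{env, ORT_TSTR("model.onnx"), so};
  // ... create input tensors and call session.Run(...);
  // the per-node stats are written once Run() returns successfully.
  return 0;
}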
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index e28ff75345785..2c0c09dfd3e51 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -21,6 +21,7 @@ #include "core/framework/external_data_loader_manager.h" #include "core/framework/kernel_registry_manager.h" #include "core/framework/prepacked_weights_container.h" +#include "core/framework/resource_accountant.h" #include "core/framework/session_state.h" #include "core/framework/tuning_results.h" #include "core/framework/framework_provider_common.h" @@ -545,6 +546,31 @@ class InferenceSession { */ Status AddPrePackedWeightsContainer(PrepackedWeightsContainer* prepacked_weights_container); +#if !defined(ORT_MINIMAL_BUILD) + /** + * CreateNodeStats recorder and enable collection of node statistics that is useful + * for resource constrained partitioning and otherwise. + * + * @param node_stats_file - this file will be created at the same folder where the model file is present. + */ + Status CreateNodeStatsRecorder(const std::filesystem::path& node_stats_file); + + /** + * Returns true if collection is enabled + */ + bool IsNodeStatsCollectionEnabled() const noexcept { + return node_stats_recorder_.has_value(); + } + + /** + * NodeStatsRecorder pointer. If not present, returns nullptr + */ + NodeStatsRecorder* GetNodeStatsRecorder() noexcept { + return node_stats_recorder_.has_value() ? &*node_stats_recorder_ : nullptr; + } + +#endif + protected: #if !defined(ORT_MINIMAL_BUILD) @@ -911,6 +937,11 @@ class InferenceSession { }; CachedExecutionProviderForGraphReplay cached_execution_provider_for_graph_replay_; + +#if !defined(ORT_MINIMAL_BUILD) + // Enable nodestats collection + std::optional node_stats_recorder_; +#endif }; struct SessionIOBinding { diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index ca6950af0227a..3761b4ca0ec41 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -5,6 +5,7 @@ #include "core/session/allocator_adapters.h" #include "core/session/inference_session_utils.h" #include "core/session/IOBinding.h" +#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/framework/allocator.h" #include "core/framework/error_code_helper.h" #include "core/framework/execution_provider.h" diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index af39edae2074d..3208c5634b438 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -336,8 +336,9 @@ struct ProviderHostImpl : ProviderHost { // IExecutionProvider (direct) std::vector> IExecutionProvider__GetCapability( const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) override { - return p->IExecutionProvider::GetCapability(graph_viewer, kernel_lookup); + const IExecutionProvider::IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) override { + return p->IExecutionProvider::GetCapability(graph_viewer, kernel_lookup, resource_accountant); } common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override { @@ -761,6 +762,9 @@ struct ProviderHostImpl : ProviderHost { std::unique_ptr IndexedSubGraph__construct() override { return 
std::make_unique(); } void IndexedSubGraph__operator_delete(IndexedSubGraph* p) override { delete p; } + const std::vector& IndexedSubGraph__Nodes(const IndexedSubGraph* p) override { + return p->nodes; + } std::vector& IndexedSubGraph__Nodes(IndexedSubGraph* p) override { return p->nodes; } void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) override { p->SetMetaDef(std::move(meta_def_)); } @@ -768,6 +772,13 @@ struct ProviderHostImpl : ProviderHost { void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) override { p->schema_source = schema_source; } IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) override { return p->schema_source; } + void IndexedSubGraph__SetAccountant(IndexedSubGraph* p, IResourceAccountant* resource_accountant) override { + p->SetAccountant(resource_accountant); + } + void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& resource_count) override { + p->AppendNodeCost(resource_count); + } + void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) override { p->AppendNodeEmptyCost(); } // KernelDef (wrapped) void KernelDef__operator_delete(KernelDef* p) override { delete p; } diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 740c566794f15..7ac0aaa291f67 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "core/common/denormal.h" @@ -59,7 +60,6 @@ #include "gtest/gtest.h" #include "gmock/gmock.h" -using namespace std; using namespace ONNX_NAMESPACE; using namespace onnxruntime::logging; using namespace onnxruntime::concurrency; @@ -137,7 +137,8 @@ class FuseExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override { // Fuse two add into one. 
std::vector> result; std::unique_ptr sub_graph = std::make_unique(); @@ -283,7 +284,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, ProviderType allocation_provider, IExecutionProvider* gpu_provider, OrtDevice* output_device) { - unique_ptr io_binding; + std::unique_ptr io_binding; Status st = session_object.NewIOBinding(&io_binding); ASSERT_TRUE(st.IsOK()); auto input_allocator = io_binding->GetCPUAllocator(bind_provider_type); @@ -358,7 +359,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, (output_device && output_device->Type() == OrtDevice::GPU)) { #if defined(USE_CUDA) || defined(USE_ROCM) // in this case we need to copy the tensor from cuda to cpu - vector& outputs = io_binding->GetOutputs(); + std::vector& outputs = io_binding->GetOutputs(); ASSERT_EQ(1u, outputs.size()); auto& rtensor = outputs.front().Get(); auto element_type = rtensor.DataType(); @@ -388,6 +389,106 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, } } +#if 0 +namespace { +// generate random inputs +template +InlinedVector GenerateRandomInput(size_t size) { + InlinedVector values(size); + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution distribution(1, 100); + std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); + return values; +} + +template <> +InlinedVector GenerateRandomInput(size_t size) { + InlinedVector values(size); + std::random_device dev; + std::default_random_engine rng(dev()); + std::uniform_real_distribution distribution(-1.f, 1.f); + std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); + return values; +} + +template +void CreateMLValueFromRandom(const AllocatorPtr& alloc, gsl::span shape, + OrtValue& ort_value) { + const auto elements = narrow(std::accumulate(shape.begin(), shape.end(), + static_cast(1), + std::multiplies())); + const auto values = GenerateRandomInput(elements); + CreateMLValue(alloc, shape, values, &ort_value); +} + +} // namespace + +TEST(InferenceSessionTests, GenerateNodeStatsWithRandomInput) { + static constexpr const ORTCHAR_T* STAT_MODEL = + ORT_TSTR("D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_mask2d_fp32.onnx"); + + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, + "attention_mask2d_fp32_node_stats.txt")); + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(STAT_MODEL)); + ASSERT_STATUS_OK(session_object.Initialize()); + + auto allocators = TestCPUExecutionProvider()->CreatePreferredAllocators(); + auto inputs_defs = session_object.GetModelInputs(); + ASSERT_STATUS_OK(inputs_defs.first); + NameMLValMap feeds; + for (const auto* def : *inputs_defs.second) { + if (!def->Exists()) { + continue; + } + + OrtValue ml_value; + const auto* type_proto = def->TypeAsProto(); + ASSERT_TRUE(utils::HasTensorType(*type_proto)); + const auto elem_type = type_proto->tensor_type().elem_type(); + ASSERT_TRUE(utils::HasShape(*type_proto)); + const auto& tensor_shape_proto = type_proto->tensor_type().shape(); + + TensorShapeVector input_dims; + for (const auto& dim : tensor_shape_proto.dim()) { + input_dims.push_back(dim.dim_value()); + } + + switch (elem_type) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { + CreateMLValueFromRandom(allocators[0], input_dims, ml_value); + break; + } + case ONNX_NAMESPACE::TensorProto_DataType_INT32: { + CreateMLValueFromRandom(allocators[0], 
input_dims, ml_value); + break; + } + case ONNX_NAMESPACE::TensorProto_DataType_INT64: { + CreateMLValueFromRandom(allocators[0], input_dims, ml_value); + break; + } + + default: + ASSERT_TRUE(false) << "Unsupported type: " << elem_type; + } + feeds.insert_or_assign(def->Name(), std::move(ml_value)); + } + + InlinedVector output_names; + auto outputs = session_object.GetModelOutputs(); + ASSERT_STATUS_OK(outputs.first); + for (const auto& output : *outputs.second) { + output_names.push_back(output->Name()); + } + + RunOptions run_options; + std::vector fetches; + ASSERT_STATUS_OK(session_object.Run(run_options, feeds, output_names, &fetches)); +} +#endif + TEST(InferenceSessionTests, NoTimeout) { SessionOptions so; @@ -438,7 +539,7 @@ TEST(InferenceSessionTests, TestModelSerialization) { // Load model with level 0 transform level // and assert that the model has Identity nodes. SessionOptions so; - const string test_model = "testdata/transform/abs-id-max.onnx"; + const std::string test_model = "testdata/transform/abs-id-max.onnx"; so.session_logid = "InferenceSessionTests.TestModelSerialization"; so.graph_optimization_level = TransformerLevel::Default; InferenceSessionWrapper session_object_noopt{so, GetEnvironment()}; @@ -478,9 +579,9 @@ TEST(InferenceSessionTests, TestModelSerialization) { // Assert that re-feed of optimized model with default transform level results // in same runtime model as abs-id-max.onnx with TransformLevel-1. - std::ifstream model_fs_session1(so.optimized_model_filepath, ios::in | ios::binary); + std::ifstream model_fs_session1(so.optimized_model_filepath, std::ios::in | std::ios::binary); ASSERT_TRUE(model_fs_session1.good()); - std::ifstream model_fs_session2(so_opt.optimized_model_filepath, ios::in | ios::binary); + std::ifstream model_fs_session2(so_opt.optimized_model_filepath, std::ios::in | std::ios::binary); ASSERT_TRUE(model_fs_session2.good()); ASSERT_TRUE(model_fs_session1.tellg() == model_fs_session2.tellg()); model_fs_session1.seekg(0, std::ifstream::beg); @@ -499,7 +600,7 @@ TEST(InferenceSessionTests, TestModelSerialization) { #ifdef ORT_RUN_EXTERNAL_ONNX_TESTS static bool Compare(const InputDefList& f_arg, const InputDefList& s_arg) { if (f_arg.size() != s_arg.size()) { - cout << "Sizes differ: f_arg size: " << f_arg.size() << " s_arg size: " << s_arg.size() << endl; + std::cout << "Sizes differ: f_arg size: " << f_arg.size() << " s_arg size: " << s_arg.size() << std::endl; return false; } @@ -564,9 +665,9 @@ TEST(InferenceSessionTests, ModelMetadata) { } auto retval = session_object.GetModelInputs(); - cout << "weights size: " << weights.size() - << " inputs.size(): " << inputs.size() - << " from session: " << retval.second->size() << endl; + std::cout << "weights size: " << weights.size() + << " inputs.size(): " << inputs.size() + << " from session: " << retval.second->size() << std::endl; ASSERT_TRUE(retval.first.IsOK()); ASSERT_TRUE(Compare(inputs_no_weights, *retval.second)); } @@ -617,7 +718,7 @@ TEST(InferenceSessionTests, CheckRunLogger) { bool have_log_entry_with_run_tag = (std::find_if(msgs.begin(), msgs.end(), [&run_options](std::string msg) { - return msg.find(run_options.run_tag) != string::npos; + return msg.find(run_options.run_tag) != std::string::npos; }) != msgs.end()); ASSERT_TRUE(have_log_entry_with_run_tag); @@ -660,18 +761,18 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { auto size = lines.size(); ASSERT_TRUE(size > 1); - ASSERT_TRUE(lines[0].find("[") != string::npos); - 
ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos); - ASSERT_TRUE(lines[size - 1].find("]") != string::npos); + ASSERT_TRUE(lines[0].find("[") != std::string::npos); + ASSERT_TRUE(lines[1].find("model_loading_uri") != std::string::npos); + ASSERT_TRUE(lines[size - 1].find("]") != std::string::npos); std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; bool has_kernel_info = false; for (size_t i = 1; i < size - 1; ++i) { for (auto& s : tags) { - ASSERT_TRUE(lines[i].find(s) != string::npos); - has_kernel_info = has_kernel_info || lines[i].find("Kernel") != string::npos && - lines[i].find("stream") != string::npos && - lines[i].find("block_x") != string::npos; + ASSERT_TRUE(lines[i].find(s) != std::string::npos); + has_kernel_info = has_kernel_info || lines[i].find("Kernel") != std::string::npos && + lines[i].find("stream") != std::string::npos && + lines[i].find("block_x") != std::string::npos; } } @@ -717,25 +818,25 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) { auto size = lines.size(); ASSERT_TRUE(size > 1); - ASSERT_TRUE(lines[0].find("[") != string::npos); - ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos); - ASSERT_TRUE(lines[size - 1].find("]") != string::npos); + ASSERT_TRUE(lines[0].find("[") != std::string::npos); + ASSERT_TRUE(lines[1].find("model_loading_uri") != std::string::npos); + ASSERT_TRUE(lines[size - 1].find("]") != std::string::npos); std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; [[maybe_unused]] bool has_api_info = false; for (size_t i = 1; i < size - 1; ++i) { for (auto& s : tags) { - ASSERT_TRUE(lines[i].find(s) != string::npos); + ASSERT_TRUE(lines[i].find(s) != std::string::npos); #ifdef USE_CUDA - has_api_info = has_api_info || lines[i].find("Api") != string::npos && - lines[i].find("cudaLaunch") != string::npos; + has_api_info = has_api_info || lines[i].find("Api") != std::string::npos && + lines[i].find("cudaLaunch") != std::string::npos; #endif #ifdef USE_ROCM - has_api_info = has_api_info || lines[i].find("Api") != string::npos && - lines[i].find("hipLaunch") != string::npos; + has_api_info = has_api_info || lines[i].find("Api") != std::string::npos && + lines[i].find("hipLaunch") != std::string::npos; #endif #ifdef USE_WEBGPU - has_api_info = has_api_info || lines[i].find("Api") != string::npos; + has_api_info = has_api_info || lines[i].find("Api") != std::string::npos; #endif } } @@ -769,17 +870,17 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) { int count = 0; while (std::getline(profile, line)) { if (count == 0) { - ASSERT_TRUE(line.find("[") != string::npos); + ASSERT_TRUE(line.find("[") != std::string::npos); } else if (count <= 3) { for (auto& s : tags) { - ASSERT_TRUE(line.find(s) != string::npos); + ASSERT_TRUE(line.find(s) != std::string::npos); } } else { - ASSERT_TRUE(line.find("]") != string::npos); + ASSERT_TRUE(line.find("]") != std::string::npos); } if (count == 1) { - ASSERT_TRUE(line.find("mul_1_kernel_time") != string::npos); + ASSERT_TRUE(line.find("mul_1_kernel_time") != std::string::npos); } count++; } @@ -929,7 +1030,7 @@ TEST(InferenceSessionTests, ConfigureVerbosityLevel) { std::copy(msgs.begin(), msgs.end(), std::ostream_iterator(std::cout, "\n")); bool have_log_entry_with_vlog_session_msg = (std::find_if(msgs.begin(), msgs.end(), - [&](std::string msg) { return msg.find("Added input argument with name") != string::npos; }) != + [&](std::string msg) { return msg.find("Added input argument with name") != std::string::npos; }) != 
msgs.end()); ASSERT_TRUE(have_log_entry_with_vlog_session_msg); @@ -942,7 +1043,8 @@ TEST(InferenceSessionTests, ConfigureVerbosityLevel) { // ASSERT_TRUE(have_log_entry_with_vlog_run_msg); bool has_num_streams_msg = - (std::find_if(msgs.begin(), msgs.end(), [&](std::string msg) { return msg.find("Number of streams") != string::npos; }) != msgs.end()); + (std::find_if(msgs.begin(), msgs.end(), [&](std::string msg) { return msg.find("Number of streams") != + std::string::npos; }) != msgs.end()); ASSERT_TRUE(has_num_streams_msg); #endif @@ -983,7 +1085,7 @@ TEST(InferenceSessionTests, UseUserSpecifiedLoggingFunctionInSession) { #ifndef NDEBUG bool have_log_entry_with_vlog_session_msg = (std::find_if(log_msgs.begin(), log_msgs.end(), - [&](std::string msg) { return msg.find("Added input argument with name") != string::npos; }) != + [&](std::string msg) { return msg.find("Added input argument with name") != std::string::npos; }) != log_msgs.end()); ASSERT_TRUE(have_log_entry_with_vlog_session_msg); #endif @@ -996,7 +1098,7 @@ TEST(InferenceSessionTests, TestWithIstream) { InferenceSession session_object{so, GetEnvironment()}; - std::ifstream model_file_stream(MODEL_URI, ios::in | ios::binary); + std::ifstream model_file_stream(MODEL_URI, std::ios::in | std::ios::binary); ASSERT_TRUE(model_file_stream.good()); ASSERT_TRUE(session_object.Load(model_file_stream).IsOK()); ASSERT_STATUS_OK(session_object.Initialize()); @@ -1015,7 +1117,7 @@ TEST(InferenceSessionTests, TestRegisterExecutionProvider) { CPUExecutionProviderInfo epi; ASSERT_TRUE(session_object.RegisterExecutionProvider(std::make_unique(epi)).IsOK()); - std::ifstream model_file_stream(MODEL_URI, ios::in | ios::binary); + std::ifstream model_file_stream(MODEL_URI, std::ios::in | std::ios::binary); ASSERT_TRUE(model_file_stream.good()); ASSERT_TRUE(session_object.Load(model_file_stream).IsOK()); ASSERT_STATUS_OK(session_object.Initialize()); @@ -1092,13 +1194,14 @@ TEST(InferenceSessionTests, TestIOBindingReuse) { std::stringstream sstr(s1); ASSERT_TRUE(session_object.Load(sstr).IsOK()); ASSERT_STATUS_OK(session_object.Initialize()); - unique_ptr io_binding; + std::unique_ptr io_binding; Status st = session_object.NewIOBinding(&io_binding); ASSERT_TRUE(st.IsOK()); OrtValue ml_value1; - vector v1{2.f}; - CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], {1}, v1, &ml_value1); + const std::vector v1{2.f}; + const int64_t shape[] = {1}; + CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], shape, v1, &ml_value1); ASSERT_STATUS_OK(io_binding->BindOutput("foo", ml_value1)); ASSERT_TRUE(io_binding->GetOutputs().size() == 1); auto span = io_binding->GetOutputs()[0].Get().DataAsSpan(); @@ -1108,8 +1211,8 @@ TEST(InferenceSessionTests, TestIOBindingReuse) { } OrtValue ml_value2; - vector v2{3.f}; - CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], {1}, v2, &ml_value2); + const std::vector v2{3.f}; + CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], shape, v2, &ml_value2); ASSERT_STATUS_OK(io_binding->BindOutput("foo", ml_value2)); ASSERT_TRUE(io_binding->GetOutputs().size() == 1); span = io_binding->GetOutputs()[0].Get().DataAsSpan(); @@ -1651,7 +1754,7 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { run_options.run_tag = so.session_logid; std::vector dim = {1}; - std::vector va = {false}; + InlinedVector va = {false}; OrtValue ml_value_x; CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dim, va, &ml_value_x); @@ -1807,8 
+1910,9 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { OrtValue ml_value_input_0; CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dim_input_0, data_input_0, &ml_value_input_0); - std::vector dim_input_1 = {1}; - std::vector data_input_1 = {false}; + + const int64_t dim_input_1[] = {1}; + const bool data_input_1[] = {false}; OrtValue ml_value_input_1; CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dim_input_1, data_input_1, &ml_value_input_1); @@ -2047,7 +2151,7 @@ TEST(InferenceSessionTests, TestCopyToFromDevices) { // It creates and registers a dummy transformer and after session initialize // validates that this transformer was called regardless of the graph optimization level set. TEST(InferenceSessionTests, TestRegisterTransformers) { - string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; + std::string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; for (int i = static_cast(TransformerLevel::Default); i <= static_cast(TransformerLevel::MaxLevel); i++) { SessionOptions so; @@ -2126,7 +2230,7 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { tester.AddInput("data", input_shape, input_data); tester.AddOutput("output", invalid_output_shape, output_data); - const std::unordered_set excluded_provider_types = { + const std::unordered_set excluded_provider_types = { kTensorrtExecutionProvider, // Doesn't handle Unsqueeze. kOpenVINOExecutionProvider}; // Disabled temporarily. @@ -2144,7 +2248,7 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { #ifdef USE_CUDA // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { - string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; + std::string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; SessionOptions so; so.execution_mode = ExecutionMode::ORT_PARALLEL; @@ -2822,10 +2926,10 @@ TEST(InferenceSessionTests, InitializerSharing_EnsureSessionsUseUserAddedInitial std::vector input_data_vec{1., 2., 3., 4., 5., 6.}; auto allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; - CreateMLValue(allocator, {3, 2}, input_data_vec, &val_to_share_from_allocator); + CreateMLValue(allocator, AsSpan({3, 2}), input_data_vec, &val_to_share_from_allocator); OrtMemoryInfo mem_info{CPU, OrtArenaAllocator}; - CreateMLValue(std::array{3, 2}, input_data_vec.data(), mem_info, &val_to_share); + CreateMLValue(AsSpan({3, 2}), input_data_vec.data(), mem_info, &val_to_share); // create sessions to share the allocator SessionOptions so1; diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index e7f8b1aaa49d8..c34b9ac84b259 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -22,6 +22,8 @@ #include "core/util/thread_utils.h" #include "gtest/gtest.h" #include "test/test_environment.h" +#include "test/optimizer/graph_transform_test_builder.h" +#include "test/util/include/test_environment.h" #include "test/util/include/default_providers.h" #include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" @@ -440,6 +442,257 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { } } +#ifdef USE_CUDA +namespace { +void BuildTestModel(Graph& graph, const std::vector& input_shape, + 
size_t approx_init_a_size, + size_t approx_init_b_size) { + ASSERT_EQ(2, input_shape.size()); + + // Create two MatMul nodes each with the initializers, that are going to + // dictate the cost of the nodes + const auto init_a_dim_0 = input_shape[1]; + const int64_t init_a_dim_1 = approx_init_a_size / input_shape[1]; + const std::vector init_a_shape = {init_a_dim_0, init_a_dim_1}; + + // This is also an A input to mm_2 + const std::vector mm_1_output_shape = {input_shape[0], init_a_shape[1]}; + + const int64_t init_b_dim_0 = mm_1_output_shape[1]; + const int64_t init_b_dim_1 = approx_init_b_size / mm_1_output_shape[1]; + const std::vector init_b_shape = {init_b_dim_0, init_b_dim_1}; + + const std::vector output_shape = {mm_1_output_shape[0], init_b_dim_1}; + + ModelTestBuilder builder(graph); + + std::optional> in_shape = input_shape; + NodeArg* model_input = builder.MakeInput(in_shape, "input"); + NodeArg* init_a = builder.MakeInitializer(init_a_shape, 1.f, 10.f); + NodeArg* mm_1_output = builder.MakeIntermediate(mm_1_output_shape); + NodeArg* init_b = builder.MakeIntermediate(init_b_shape); + NodeArg* mm_2_output = builder.MakeOutput(output_shape); + + builder.AddNode("MatMul", {model_input, init_a}, {mm_1_output}); + builder.AddNode("MatMul", {mm_1_output, init_b}, {mm_2_output}); +} +} // namespace + +// Produces node stats for the model. This requires running the model. +// TEST(SessionStateTest, TestResourceAwareParitioningSaveNodeStats) { +// +// const auto& log_manager = DefaultLoggingManager(); +// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); +// const auto& default_logger = log_manager.DefaultLogger(); +// std::unordered_map domain_to_version; +// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter +// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), +// domain_to_version, {}, default_logger); +// +// const std::vector input_shape = {1024, 1024}; +// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb +// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb +// +// auto& graph = model.MainGraph(); +// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); +// ASSERT_STATUS_OK(graph.Resolve()); +// +// auto model_proto = model.ToProto(); +// const auto model_string = model_proto.SerializeAsString(); +// std::ofstream model_file("model.onnx", std::ios::binary); +//} + +/// XXX: Optionally add resource aware parameters +/// This test can only run with CUDA present currently. 
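(For context on the stats file these tests consume: when kOrtSessionOptionsCollectNodeMemoryStatsToFile is set, the NodeStatsRecorder writes one CSV row per node next to the model file, in the form node_name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations, with all sizes in bytes, and the accountant's cost for a node is the sum of the four size columns. For example, the row GptAttention_3,30720,0,36864,165888 from the stats file added later in this series records 30 KB of inputs, no initializer memory, 36 KB of dynamically allocated outputs and about 162 KB of temporary allocations. The tests below feed such a file back in through kOrtSessionOptionsResourceCudaPartitioningSettings.)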
+TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { + const auto& log_manager = DefaultLoggingManager(); + log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); + const auto& default_logger = log_manager.DefaultLogger(); + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 16; // We can make it a parameter + Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, {}, default_logger); + + // Input Shape + const std::vector input_shape = {1024, 1024}; + constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb + constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb + + auto& graph = model.MainGraph(); + BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); + ASSERT_STATUS_OK(graph.Resolve()); + + OrtThreadPoolParams to; + to.thread_pool_size = 1; + auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); + + ExecutionProviders execution_providers; + auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); + tmp_cpu_execution_provider->SetLogger(&default_logger); + ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); + + KernelRegistryManager krm; + ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); + + DataTransferManager dtm; + ExternalDataLoaderManager edlm; + profiling::Profiler profiler; + // Try to load the model without restrictions + // and verify nodes have been placed to CUDA + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + + SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, + default_logger, profiler, sess_options); + + GraphPartitioner partitioner(krm, execution_providers); + layout_transformation::TransformLayoutFunction transform_layout_fn; + layout_transformation::DebugGraphFn debug_graph_fn; + ASSERT_STATUS_OK( + partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, + sess_options.config_options, default_logger, + GraphPartitioner::Mode::kNormal, debug_graph_fn)); + + // All nodes have been placed to CUDA + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); + } +} + +// TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { +// const auto& log_manager = DefaultLoggingManager(); +// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); +// const auto& default_logger = log_manager.DefaultLogger(); +// std::unordered_map domain_to_version; +// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter +// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), +// domain_to_version, {}, default_logger); +// +// // Input Shape +// const std::vector input_shape = {1024, 1024}; +// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb +// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb +// +// auto& graph = model.MainGraph(); +// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); +// ASSERT_STATUS_OK(graph.Resolve()); +// +// OrtThreadPoolParams to; +// to.thread_pool_size = 1; +// auto tp = 
concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); +// +// ExecutionProviders execution_providers; +// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); +// tmp_cpu_execution_provider->SetLogger(&default_logger); +// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); +// +// KernelRegistryManager krm; +// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); +// +// DataTransferManager dtm; +// ExternalDataLoaderManager edlm; +// profiling::Profiler profiler; +// // Try to load the model without restrictions +// // and verify nodes have been placed to CUDA +// SessionOptions sess_options; +// sess_options.enable_mem_pattern = false; +// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; +// sess_options.use_deterministic_compute = false; +// sess_options.enable_mem_reuse = false; +// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, +// "4206592")); +// +// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, +// default_logger, profiler, sess_options); +// +// GraphPartitioner partitioner(krm, execution_providers); +// layout_transformation::TransformLayoutFunction transform_layout_fn; +// layout_transformation::DebugGraphFn debug_graph_fn; +// ASSERT_STATUS_OK( +// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, +// sess_options.config_options, default_logger, +// GraphPartitioner::Mode::kNormal, debug_graph_fn)); +// +// // All nodes have been placed to CUDA +// const auto& graph_nodes = graph.Nodes(); +// for (const auto& node : graph_nodes) { +// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); +// } +// } + +// TEST(SessionStateTest, TestResourceAwarePartitioning_SecondNodeCutOff) { +// const auto& log_manager = DefaultLoggingManager(); +// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); +// const auto& default_logger = log_manager.DefaultLogger(); +// std::unordered_map domain_to_version; +// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter +// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), +// domain_to_version, {}, default_logger); +// +// // Input Shape +// const std::vector input_shape = {1024, 1024}; +// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb +// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb +// +// auto& graph = model.MainGraph(); +// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); +// ASSERT_STATUS_OK(graph.Resolve()); +// +// OrtThreadPoolParams to; +// to.thread_pool_size = 1; +// auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); +// +// ExecutionProviders execution_providers; +// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); +// tmp_cpu_execution_provider->SetLogger(&default_logger); +// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); +// +// KernelRegistryManager krm; +// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); +// +// DataTransferManager dtm; +// ExternalDataLoaderManager edlm; +// profiling::Profiler profiler; +// // Try to load the model without restrictions +// // and verify nodes have been placed to CUDA +// SessionOptions sess_options; +// 
sess_options.enable_mem_pattern = false; +// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; +// sess_options.use_deterministic_compute = false; +// sess_options.enable_mem_reuse = false; +// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, +// "16383")); +// +// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, +// default_logger, profiler, sess_options); +// +// GraphPartitioner partitioner(krm, execution_providers); +// layout_transformation::TransformLayoutFunction transform_layout_fn; +// layout_transformation::DebugGraphFn debug_graph_fn; +// ASSERT_STATUS_OK( +// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, +// sess_options.config_options, default_logger, +// GraphPartitioner::Mode::kNormal, debug_graph_fn)); +// +// // Second node did not make it to CUDA +// const auto& graph_nodes = graph.Nodes(); +// size_t count = 0; +// for (const auto& node : graph_nodes) { +// if (count == 0) { +// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); +// } else { +// EXPECT_TRUE(node.GetExecutionProviderType().empty()); +// } +// count++; +// } +// } + +#endif // USE_CUDA + INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStateTestP, testing::ValuesIn(param_list)); #ifndef ENABLE_TRAINING_CORE diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h index 51b02ee3e7f8c..9c5893948ff1b 100644 --- a/onnxruntime/test/framework/test_utils.h +++ b/onnxruntime/test/framework/test_utils.h @@ -32,8 +32,13 @@ namespace test { IExecutionProvider* TestCPUExecutionProvider(); template +inline void CopyVectorToTensor(gsl::span value, Tensor& tensor) { + gsl::copy(value, tensor.MutableDataAsSpan()); +} + +template inline void CopyVectorToTensor(const std::vector& value, Tensor& tensor) { - gsl::copy(gsl::make_span(value), tensor.MutableDataAsSpan()); + gsl::copy(AsSpan(value), tensor.MutableDataAsSpan()); } // vector is specialized so we need to handle it separately @@ -45,8 +50,20 @@ inline void CopyVectorToTensor(const std::vector& value, Tensor& ten } } +template +void CreateMLValue(AllocatorPtr alloc, gsl::span dims, const std::vector& value, + OrtValue* p_mlvalue) { + TensorShape shape(dims); + auto element_type = DataTypeImpl::GetType(); + Tensor::InitOrtValue(element_type, shape, std::move(alloc), *p_mlvalue); + if (!value.empty()) { + Tensor& tensor = *p_mlvalue->GetMutable(); + CopyVectorToTensor(value, tensor); + } +} + template -void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const std::vector& value, +void CreateMLValue(AllocatorPtr alloc, gsl::span dims, gsl::span value, OrtValue* p_mlvalue) { TensorShape shape(dims); auto element_type = DataTypeImpl::GetType(); @@ -58,6 +75,24 @@ void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const s } } +template +void CreateMLValue(AllocatorPtr alloc, std::initializer_list dims, gsl::span value, + OrtValue* p_mlvalue) { + CreateMLValue(alloc, AsSpan(dims), value, p_mlvalue); +} + +template +void CreateMLValue(AllocatorPtr alloc, gsl::span dims, std::initializer_list value, + OrtValue* p_mlvalue) { + CreateMLValue(alloc, dims, AsSpan(value), p_mlvalue); +} + +template +void CreateMLValue(AllocatorPtr alloc, std::initializer_list dims, std::initializer_list value, + OrtValue* p_mlvalue) { + CreateMLValue(alloc, AsSpan(dims), AsSpan(value), p_mlvalue); +} + // Lifetime of data_buffer should be managed by the caller. 
template void CreateMLValue(gsl::span dims, T* data_buffer, const OrtMemoryInfo& info, @@ -68,7 +103,7 @@ void CreateMLValue(gsl::span dims, T* data_buffer, const OrtMemor } template -void AllocateMLValue(AllocatorPtr alloc, const std::vector& dims, OrtValue* p_mlvalue) { +void AllocateMLValue(AllocatorPtr alloc, gsl::span dims, OrtValue* p_mlvalue) { TensorShape shape(dims); auto element_type = DataTypeImpl::GetType(); Tensor::InitOrtValue(element_type, shape, std::move(alloc), *p_mlvalue); diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc index 2e073def5d643..b753bc386d722 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc @@ -110,7 +110,8 @@ DataLayout InternalTestingExecutionProvider::GetPreferredLayout() const { std::vector> InternalTestingExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { // find nodes that have ops in our supported list std::unordered_set supported_static_nodes; std::unordered_set supported_compiled_nodes; diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h index 6615eb82f2b05..d2ed8259ee974 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h +++ b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h @@ -19,7 +19,8 @@ class InternalTestingExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index 4feeb5f830508..c8ed550c0625c 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -281,7 +281,7 @@ static BackendSupport GetHTPSupport(const onnxruntime::logging::Logger& logger) {{"backend_path", "QnnHtp.dll"}}); qnn_ep->SetLogger(&logger); - auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup); + auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, nullptr); return result.empty() ? BackendSupport::UNSUPPORTED : BackendSupport::SUPPORTED; } @@ -344,7 +344,7 @@ static BackendSupport GetCPUSupport(const onnxruntime::logging::Logger& logger) {{"backend_path", "QnnCpu.dll"}}); qnn_ep->SetLogger(&logger); - auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup); + auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, nullptr); return result.empty() ? 
BackendSupport::UNSUPPORTED : BackendSupport::SUPPORTED; } diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index dab73d3824d3b..882ef80d76441 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -4768,3 +4768,16 @@ TEST(CApiTest, OrtCustomOp_GetInPlace) { ASSERT_EQ(len, static_cast(2)); mock_gqa.ReleaseAliasMap(input_index, output_index); } + +/*TEST(CApiTest, RunWithNodeStats) { + Ort::Env env(ORT_LOGGING_LEVEL_INFO); + constexpr const ORTCHAR_T* model_path = TSTR("testdata/attention_mask2d_fp32.onnx"); + + Ort::SessionOptions session_options; + session_options.DisableCpuMemArena(); + session_options.DisableMemPattern(); + session_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, + "D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_memory.txt"); + + Ort::Session session(env, model_path, session_options); +}*/ \ No newline at end of file From b8f6b7b6ceb9c0e899686215592e5e1b4ffd927a Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 31 Jan 2025 11:45:08 -0800 Subject: [PATCH 2/7] Adjust CSV parsing --- onnxruntime/core/framework/graph_partitioner.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 8a01e3973cdc6..b955e05ec803b 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -121,8 +121,11 @@ InlinedHashMap LoadNodeAllocationStats(const s std::string line; // Read and load a CSV file line by line while (std::getline(file, line)) { - auto splits = utils::SplitString(line, ",", false); + auto splits = utils::SplitString(line, ",", true); ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); + if (splits[0].empty()) { + continue; + } std::string node_name{splits[0]}; size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); @@ -1101,7 +1104,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, kOrtSessionOptionsResourceCudaPartitioningSettings, ""); if (!resource_partitioning_settings.empty()) { auto splits = utils::SplitString(resource_partitioning_settings, ",", false); - if (splits.size() == 4) { + if (splits.size() == 2) { SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); cuda_memory_limit *= 1024; // to bytes auto node_to_stats = LoadNodeAllocationStats(graph.ModelPath(), splits[1]); From b1d1467bea9d9c159a05e4877c1fd6a6a1b8c4fb Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 5 Feb 2025 18:41:35 -0800 Subject: [PATCH 3/7] Tests pass --- .../core/framework/op_kernel_context.h | 2 +- .../core/framework/resource_accountant.h | 24 +- .../core/framework/graph_partitioner.cc | 104 +------ .../framework/op_kernel_context_internal.h | 4 + .../core/framework/resource_accountant.cc | 148 ++++++++- .../core/framework/sequential_executor.cc | 46 +-- .../shared_library/provider_interfaces.h | 1 - onnxruntime/core/session/inference_session.cc | 14 +- .../test/framework/inference_session_test.cc | 100 ------ .../test/framework/session_state_test.cc | 286 +++++------------- onnxruntime/test/shared_lib/test_inference.cc | 79 ++++- .../tiny_gpt2_beamsearch_node_stats.txt | 56 ++++ 12 files changed, 407 insertions(+), 457 deletions(-) create mode 100644 
onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index a67d7b8ae0174..e9a1490dedc34 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -192,7 +192,7 @@ class OpKernelContext { onnxruntime::NodeIndex GetNodeIndex() const; virtual const OrtValue* GetInputMLValue(int index) const; - const OrtValue* GetImplicitInputMLValue(int index) const; + virtual const OrtValue* GetImplicitInputMLValue(int index) const; OrtValue* GetOutputMLValue(int index); #ifdef ENABLE_ATEN diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h index 982b37c969fe7..1f2e9ea5ccfb0 100644 --- a/include/onnxruntime/core/framework/resource_accountant.h +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -7,15 +7,19 @@ #include #include #include +#include #include #include "core/common/common.h" +#include "core/common/inlined_containers_fwd.h" namespace onnxruntime { +struct ConfigOptions; + // Common holder for potentially different resource accounting // for different EPs -using ResourceCount = std::variant; +using ResourceCount = std::variant; /// /// This class is used for graph partitioning by EPs @@ -53,6 +57,9 @@ class IResourceAccountant { std::optional threshold_; }; +// A map of Ep Type to a resource accountant for this EP +using ResourceAccountantMap = InlinedHashMap>; + // This struct keeps accounting of the memory allocation stats // for a kernel during runtime if enabled. struct NodeAllocationStats { @@ -86,13 +93,22 @@ class NodeStatsRecorder { const std::filesystem::path& GetNodeStatsFileName() const noexcept; + bool ShouldAccountFor(const std::string& input_output_name) const; + + void ResetPerRunNameDeduper(); + void ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats); - void DumpStats(std::ostream& os) const; + void DumpStats(const std::filesystem::path& model_path) const; + + static Status CreateAccountants( + const ConfigOptions& config_options, + const std::filesystem::path& model_path, + std::optional& acc_map); private: - // We would like to hide certain things that may not compile - // with some device compilers + void DumpStats(std::ostream& os) const; + struct Impl; std::unique_ptr impl_; }; diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index d6eaea70565a7..08ddfd872ca78 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -5,7 +5,6 @@ #include #include -#include #include "core/common/inlined_containers.h" #include "core/common/string_utils.h" @@ -54,9 +53,6 @@ namespace onnxruntime { namespace { -// A map of Ep Type to a resource accountant for this EP -using ResourceAccountantMap = InlinedHashMap>; - // contains some common parameters used by the partitioning helper functions struct PartitionParams { std::reference_wrapper graph; @@ -68,75 +64,6 @@ struct PartitionParams { std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) }; - -// Use this accountant if your resource can be counted with size_t type -class SizeTAccountant : public IResourceAccountant { - public: - SizeTAccountant() = default; - ~SizeTAccountant() = default; - - explicit SizeTAccountant(size_t 
threshold, InlinedHashMap&& node_stats) - : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} - - ResourceCount GetConsumedAmount() const noexcept override { - return consumed_amount_; - } - void AddConsumedAmount(const ResourceCount& amount) noexcept override { - if (std::holds_alternative(amount)) { - consumed_amount_ += std::get(amount); - } - } - void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { - if (std::holds_alternative(amount)) { - consumed_amount_ -= std::get<0>(amount); - } - } - - ResourceCount ComputeResourceCount(const std::string& node_name) const override { - auto hit = node_stats_.find(node_name); - if (hit != node_stats_.end()) { - const auto& stats = hit->second; - return stats.input_sizes + stats.initializers_sizes + - stats.total_dynamic_sizes + stats.total_temp_allocations; - } - return static_cast(0U); - } - - private: - size_t consumed_amount_ = 0; - InlinedHashMap node_stats_; -}; - -InlinedHashMap LoadNodeAllocationStats(const std::filesystem::path& model_path, - const std::filesystem::path& file_name) { - InlinedHashMap node_stats; - std::filesystem::path file_path = model_path; - if (file_path.has_filename()) { - file_path = file_path.parent_path(); - } - - file_path /= file_name; - - std::ifstream file(file_path); - ORT_ENFORCE(file.is_open(), "Failed to open file ", file_path); - std::string line; - // Read and load a CSV file line by line - while (std::getline(file, line)) { - auto splits = utils::SplitString(line, ",", true); - ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); - if (splits[0].empty()) { - continue; - } - std::string node_name{splits[0]}; - size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); - size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); - size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); - size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); - node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, - total_dynamic_sizes, total_temp_allocations}); - } - return node_stats; -} } // namespace #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -848,7 +775,8 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager, - const ResourceAccountantMap& acc_map, const logging::Logger& logger) { + const std::optional& acc_map, + const logging::Logger& logger) { bool modified_graph = false; auto& graph = partition_params.graph.get(); @@ -861,9 +789,11 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, // process full graph with each EP for (const auto& ep : execution_providers) { IResourceAccountant* resource_accountant = nullptr; - auto hit = acc_map.find(ep->Type()); - if (hit != acc_map.end()) { - resource_accountant = hit->second.get(); + if (acc_map.has_value()) { + auto hit = acc_map->find(ep->Type()); + if (hit != acc_map->end()) { + resource_accountant = hit->second.get(); + } } ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(graph, func_mgr, kernel_registry_manager, fused_kernel_registry, *ep, mode, fused_node_unique_id, @@ -1114,24 +1044,12 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, #endif // !defined(ORT_MINIMAL_BUILD) || 
defined(ORT_EXTENDED_MINIMAL_BUILD) - // We use this only if Resource Aware Partitioning is enabled for any of the EPs - ResourceAccountantMap ep_acc_map; - // Zero, it is disabled by default - const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( - kOrtSessionOptionsResourceCudaPartitioningSettings, ""); - if (!resource_partitioning_settings.empty()) { - auto splits = utils::SplitString(resource_partitioning_settings, ",", false); - if (splits.size() == 2) { - SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); - cuda_memory_limit *= 1024; // to bytes - auto node_to_stats = LoadNodeAllocationStats(graph.ModelPath(), splits[1]); - ep_acc_map[kCudaExecutionProvider] = std::make_unique(cuda_memory_limit, - std::move(node_to_stats)); - } - } - if (mode == Mode::kNormal || mode == Mode::kAssignOnly) { #if !defined(ORT_MINIMAL_BUILD) + // We use this only if Resource Aware Partitioning is enabled for any of the EPs + std::optional ep_acc_map; + ORT_RETURN_IF_ERROR(NodeStatsRecorder::CreateAccountants(config_options, graph.ModelPath(), ep_acc_map)); + ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, ep_acc_map, logger)); diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index c970243ba461e..64932dce50917 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -59,6 +59,10 @@ class OpKernelContextInternal : public OpKernelContext { return OpKernelContext::GetInputMLValue(index); } + const OrtValue* GetImplicitInputMLValue(int index) const override { + return OpKernelContext::GetImplicitInputMLValue(index); + } + OrtValue* GetOutputMLValue(int index) { return OpKernelContext::GetOutputMLValue(index); } diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 5c2d4feaaf126..786da13e69458 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -2,33 +2,86 @@ // Licensed under the MIT License. 
#include "core/framework/resource_accountant.h" + #include "core/common/inlined_containers.h" +#include "core/common/safeint.h" +#include "core/common/string_utils.h" + +#include "core/framework/config_options.h" +#include "core/graph/constants.h" +#include "core/session/onnxruntime_session_options_config_keys.h" -#include +#include namespace onnxruntime { +// Use this accountant if your resource can be counted with size_t type +class SizeTAccountant : public IResourceAccountant { + public: + SizeTAccountant() = default; + ~SizeTAccountant() = default; + + explicit SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} + + ResourceCount GetConsumedAmount() const noexcept override { + return consumed_amount_; + } + void AddConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ += std::get(amount); + } + } + void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ -= std::get<0>(amount); + } + } + + ResourceCount ComputeResourceCount(const std::string& node_name) const override { + auto hit = node_stats_.find(node_name); + if (hit != node_stats_.end()) { + const auto& stats = hit->second; + return stats.input_sizes + stats.initializers_sizes + + stats.total_dynamic_sizes + stats.total_temp_allocations; + } + return static_cast(0U); + } + + private: + size_t consumed_amount_ = 0; + InlinedHashMap node_stats_; +}; + struct NodeStatsRecorder::Impl { - std::filesystem::path node_stats_path_; + std::filesystem::path node_stats_path; // This is a node name to allocation stats map - InlinedHashMap node_stats_; - mutable std::mutex mut_; + InlinedHashMap node_stats; + // Keeps track of nodes for which input/output sizes are accounted + InlinedHashSet input_output_accounted; }; NodeStatsRecorder::NodeStatsRecorder(const std::filesystem::path& node_stats_path) : impl_(std::make_unique()) { - impl_->node_stats_path_ = node_stats_path; + impl_->node_stats_path = node_stats_path; } NodeStatsRecorder::~NodeStatsRecorder() = default; const std::filesystem::path& NodeStatsRecorder::GetNodeStatsFileName() const noexcept { - return impl_->node_stats_path_; + return impl_->node_stats_path; +} + +bool NodeStatsRecorder::ShouldAccountFor(const std::string& input_output_name) const { + return impl_->input_output_accounted.insert(input_output_name).second; +} + +void NodeStatsRecorder::ResetPerRunNameDeduper() { + impl_->input_output_accounted.clear(); } void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats) { - std::lock_guard lock(impl_->mut_); - auto result = impl_->node_stats_.emplace(node_name, stats); + auto result = impl_->node_stats.emplace(node_name, stats); if (!result.second) { // Node already exists, update the stats result.first->second.UpdateIfGreater(stats); @@ -36,12 +89,87 @@ void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const Node } void NodeStatsRecorder::DumpStats(std::ostream& os) const { - std::lock_guard lock(impl_->mut_); - for (const auto& [name, stats] : impl_->node_stats_) { + for (const auto& [name, stats] : impl_->node_stats) { os << name << "," << stats.input_sizes << "," << stats.initializers_sizes << "," << stats.total_dynamic_sizes << "," << stats.total_temp_allocations << "\n"; } } +void NodeStatsRecorder::DumpStats(const std::filesystem::path& model_path) const { + auto node_stats_file 
= model_path; + if (node_stats_file.has_filename()) { + node_stats_file = node_stats_file.parent_path(); + } + node_stats_file /= GetNodeStatsFileName(); + std::ofstream ofs(node_stats_file, std::ofstream::out); + ORT_ENFORCE(ofs.is_open(), "Failed to open file: ", node_stats_file); + DumpStats(ofs); + ofs.close(); +} + +static Status LoadNodeAllocationStats( + const std::filesystem::path& model_path, const std::filesystem::path& file_name, + InlinedHashMap& result) { + InlinedHashMap node_stats; + std::filesystem::path file_path = model_path; + if (file_path.has_filename()) { + file_path = file_path.parent_path(); + } + + file_path /= file_name; + + std::ifstream file(file_path); + ORT_RETURN_IF_NOT(file.is_open(), "Failed to open file ", file_path); + std::string line; + // Read and load a CSV file line by line + while (std::getline(file, line)) { + auto splits = utils::SplitString(line, ",", true); + ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); + if (splits[0].empty()) { + continue; + } + std::string node_name{splits[0]}; + size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); + size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); + size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); + size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); + node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, + total_dynamic_sizes, total_temp_allocations}); + } + + result.swap(node_stats); + return Status::OK(); +} + +Status NodeStatsRecorder::CreateAccountants( + const ConfigOptions& config_options, + const std::filesystem::path& model_path, + std::optional& acc_map) { + // Check if CUDA partitioning settings are provided + const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( + kOrtSessionOptionsResourceCudaPartitioningSettings, ""); + + if (!resource_partitioning_settings.empty()) { + auto splits = utils::SplitString(resource_partitioning_settings, ",", false); + if (splits.size() == 2) { + SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); + cuda_memory_limit *= 1024; // to bytes + + InlinedHashMap loaded_stats; + ORT_RETURN_IF_ERROR(LoadNodeAllocationStats(model_path, splits[1], loaded_stats)); + + std::optional result; + auto& map = result.emplace(); + + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(cuda_memory_limit, + std::move(loaded_stats))); + acc_map = std::move(result); + } + } + + return Status::OK(); +} + } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 8a7564c7d4236..35ae33328837c 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -494,38 +494,42 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, #if !defined(ORT_MINIMAL_BUILD) auto* node_stats_recorder = ctx.GetSessionState().GetNodeStatsRecorder(); if (node_stats_recorder != nullptr) { + const auto& node = p_kernel->Node(); + const OpKernelInfo& op_kernel_info = p_kernel->Info(); + const auto input_defs = node.InputDefs(); + // Lets first check if any inputs are initializers, // if so we need to account for their memory usage. 
- const auto& const_initializers = ctx.GetSessionState().GetConstantInitializedTensors(); SafeInt initializers_size = 0; SafeInt input_sizes = 0; for (int i = 0, lim = kernel_ctx.InputCount(); i < lim; ++i) { // Need to get ort_value_index for each input. - int ort_vaue_index = kernel_ctx.GetOrtValueIndexForInput(i); - auto hit = const_initializers.find(ort_vaue_index); - if (hit != const_initializers.end()) { - const auto& ort_value = hit->second; - initializers_size += ort_value.Get().SizeInBytes(); - } else { - // If the input is not an initializer, we account it as something that had to be - // on the same device with this kernel - const OrtValue* ort_value = kernel_ctx.GetInputMLValue(i); - if (ort_value != nullptr && ort_value->IsAllocated() && ort_value->IsTensor()) { - input_sizes += ort_value->Get().SizeInBytes(); + const OrtValue* p_input = kernel_ctx.GetInputMLValue(i); + if (p_input != nullptr && p_input->IsAllocated() && p_input->IsTensor()) { + const auto& input_name = input_defs[i]->Name(); + if (node_stats_recorder->ShouldAccountFor(input_name)) { + const Tensor* p_tensor = nullptr; + const bool is_constant = op_kernel_info.TryGetConstantInput(i, &p_tensor); + if (!is_constant) { + p_tensor = &p_input->Get(); + } + input_sizes += p_tensor->SizeInBytes(); } } } - // XXX: Should we account for implicit inputs? - - // Get outputs and see if any were allocated dynamically + // Get outputs and see if anything were allocated dynamically + const auto output_defs = node.OutputDefs(); SafeInt total_dynamic_sizes = 0; const auto& exec_frame = ctx.GetExecutionFrame(); for (int i = 0, lim = kernel_ctx.OutputCount(); i < lim; ++i) { - int ort_vaue_index = kernel_ctx.GetOrtValueIndexForOutput(i); - auto maybe_val = exec_frame.GetOrtValueDynamicAllocation(ort_vaue_index); - if (maybe_val.has_value()) { - total_dynamic_sizes += *maybe_val; + const OrtValue* p_output = kernel_ctx.GetOutputMLValue(i); + if (p_output != nullptr && p_output->IsAllocated() && p_output->IsTensor()) { + int ort_value_index = kernel_ctx.GetOrtValueIndexForOutput(i); + auto maybe_val = exec_frame.GetOrtValueDynamicAllocation(ort_value_index); + if (maybe_val.has_value() && node_stats_recorder->ShouldAccountFor(output_defs[i]->Name())) { + total_dynamic_sizes += *maybe_val; + } } } @@ -541,8 +545,8 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, } // Record node allocation stats - const auto& node = p_kernel->Node(); - node_stats_recorder->ReportNodeStats(node.Name(), node_stats); + const auto& name = (node.Name().empty()) ? 
node.OpType() : node.Name(); + node_stats_recorder->ReportNodeStats(name, node_stats); } #endif #endif diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index e8fe5428612d4..059a722958118 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -9,7 +9,6 @@ // Public wrappers around internal ort interfaces (currently) #include "core/providers/shared_library/provider_host_api.h" - #include "core/common/inlined_containers_fwd.h" #include "core/framework/resource_accountant.h" #include "core/providers/shared/common.h" diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index d2a5b9339eab7..fb0fcd55ffc63 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2749,18 +2749,10 @@ Status InferenceSession::Run(const RunOptions& run_options, #endif #if !defined(ORT_MINIMAL_BUILD) - if (GetNodeStatsRecorder() != nullptr && retval.IsOK()) { + if (node_stats_recorder_.has_value() && retval.IsOK()) { // Dump node stats if the run was successful - const auto* node_stats_recorder = GetNodeStatsRecorder(); - auto node_stats_file = session_state_->GetGraphViewer().ModelPath(); - if (node_stats_file.has_filename()) { - node_stats_file = node_stats_file.parent_path(); - } - node_stats_file /= node_stats_recorder->GetNodeStatsFileName(); - std::ofstream ofs(node_stats_file, std::ofstream::out); - ORT_ENFORCE(ofs.is_open(), "Failed to open file: ", node_stats_file); - node_stats_recorder->DumpStats(ofs); - ofs.close(); + node_stats_recorder_->DumpStats(session_state_->GetGraphViewer().ModelPath()); + node_stats_recorder_->ResetPerRunNameDeduper(); } #endif diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 7ac0aaa291f67..1b06eb55afbd2 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -389,106 +389,6 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, } } -#if 0 -namespace { -// generate random inputs -template -InlinedVector GenerateRandomInput(size_t size) { - InlinedVector values(size); - std::random_device dev; - std::mt19937 rng(dev()); - std::uniform_int_distribution distribution(1, 100); - std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); - return values; -} - -template <> -InlinedVector GenerateRandomInput(size_t size) { - InlinedVector values(size); - std::random_device dev; - std::default_random_engine rng(dev()); - std::uniform_real_distribution distribution(-1.f, 1.f); - std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); - return values; -} - -template -void CreateMLValueFromRandom(const AllocatorPtr& alloc, gsl::span shape, - OrtValue& ort_value) { - const auto elements = narrow(std::accumulate(shape.begin(), shape.end(), - static_cast(1), - std::multiplies())); - const auto values = GenerateRandomInput(elements); - CreateMLValue(alloc, shape, values, &ort_value); -} - -} // namespace - -TEST(InferenceSessionTests, GenerateNodeStatsWithRandomInput) { - static constexpr const ORTCHAR_T* STAT_MODEL = - ORT_TSTR("D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_mask2d_fp32.onnx"); - - SessionOptions so; - 
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, - "attention_mask2d_fp32_node_stats.txt")); - InferenceSession session_object{so, GetEnvironment()}; - ASSERT_STATUS_OK(session_object.Load(STAT_MODEL)); - ASSERT_STATUS_OK(session_object.Initialize()); - - auto allocators = TestCPUExecutionProvider()->CreatePreferredAllocators(); - auto inputs_defs = session_object.GetModelInputs(); - ASSERT_STATUS_OK(inputs_defs.first); - NameMLValMap feeds; - for (const auto* def : *inputs_defs.second) { - if (!def->Exists()) { - continue; - } - - OrtValue ml_value; - const auto* type_proto = def->TypeAsProto(); - ASSERT_TRUE(utils::HasTensorType(*type_proto)); - const auto elem_type = type_proto->tensor_type().elem_type(); - ASSERT_TRUE(utils::HasShape(*type_proto)); - const auto& tensor_shape_proto = type_proto->tensor_type().shape(); - - TensorShapeVector input_dims; - for (const auto& dim : tensor_shape_proto.dim()) { - input_dims.push_back(dim.dim_value()); - } - - switch (elem_type) { - case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - CreateMLValueFromRandom(allocators[0], input_dims, ml_value); - break; - } - case ONNX_NAMESPACE::TensorProto_DataType_INT32: { - CreateMLValueFromRandom(allocators[0], input_dims, ml_value); - break; - } - case ONNX_NAMESPACE::TensorProto_DataType_INT64: { - CreateMLValueFromRandom(allocators[0], input_dims, ml_value); - break; - } - - default: - ASSERT_TRUE(false) << "Unsupported type: " << elem_type; - } - feeds.insert_or_assign(def->Name(), std::move(ml_value)); - } - - InlinedVector output_names; - auto outputs = session_object.GetModelOutputs(); - ASSERT_STATUS_OK(outputs.first); - for (const auto& output : *outputs.second) { - output_names.push_back(output->Name()); - } - - RunOptions run_options; - std::vector fetches; - ASSERT_STATUS_OK(session_object.Run(run_options, feeds, output_names, &fetches)); -} -#endif - TEST(InferenceSessionTests, NoTimeout) { SessionOptions so; diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index c34b9ac84b259..b6b915f90d99a 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -443,83 +443,21 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { } #ifdef USE_CUDA -namespace { -void BuildTestModel(Graph& graph, const std::vector& input_shape, - size_t approx_init_a_size, - size_t approx_init_b_size) { - ASSERT_EQ(2, input_shape.size()); - - // Create two MatMul nodes each with the initializers, that are going to - // dictate the cost of the nodes - const auto init_a_dim_0 = input_shape[1]; - const int64_t init_a_dim_1 = approx_init_a_size / input_shape[1]; - const std::vector init_a_shape = {init_a_dim_0, init_a_dim_1}; - - // This is also an A input to mm_2 - const std::vector mm_1_output_shape = {input_shape[0], init_a_shape[1]}; - - const int64_t init_b_dim_0 = mm_1_output_shape[1]; - const int64_t init_b_dim_1 = approx_init_b_size / mm_1_output_shape[1]; - const std::vector init_b_shape = {init_b_dim_0, init_b_dim_1}; - const std::vector output_shape = {mm_1_output_shape[0], init_b_dim_1}; - - ModelTestBuilder builder(graph); - - std::optional> in_shape = input_shape; - NodeArg* model_input = builder.MakeInput(in_shape, "input"); - NodeArg* init_a = builder.MakeInitializer(init_a_shape, 1.f, 10.f); - NodeArg* mm_1_output = builder.MakeIntermediate(mm_1_output_shape); - NodeArg* init_b = 
builder.MakeIntermediate(init_b_shape); - NodeArg* mm_2_output = builder.MakeOutput(output_shape); +namespace { - builder.AddNode("MatMul", {model_input, init_a}, {mm_1_output}); - builder.AddNode("MatMul", {mm_1_output, init_b}, {mm_2_output}); -} -} // namespace +using ParitionVerifierFn = std::function; -// Produces node stats for the model. This requires running the model. -// TEST(SessionStateTest, TestResourceAwareParitioningSaveNodeStats) { -// -// const auto& log_manager = DefaultLoggingManager(); -// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); -// const auto& default_logger = log_manager.DefaultLogger(); -// std::unordered_map domain_to_version; -// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter -// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), -// domain_to_version, {}, default_logger); -// -// const std::vector input_shape = {1024, 1024}; -// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb -// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb -// -// auto& graph = model.MainGraph(); -// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); -// ASSERT_STATUS_OK(graph.Resolve()); -// -// auto model_proto = model.ToProto(); -// const auto model_string = model_proto.SerializeAsString(); -// std::ofstream model_file("model.onnx", std::ios::binary); -//} - -/// XXX: Optionally add resource aware parameters -/// This test can only run with CUDA present currently. -TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { +void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, + const SessionOptions& sess_options, + const ParitionVerifierFn& verifier_fn) { const auto& log_manager = DefaultLoggingManager(); log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); const auto& default_logger = log_manager.DefaultLogger(); - std::unordered_map domain_to_version; - domain_to_version[kOnnxDomain] = 16; // We can make it a parameter - Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), - domain_to_version, {}, default_logger); + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, default_logger)); - // Input Shape - const std::vector input_shape = {1024, 1024}; - constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb - constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb - - auto& graph = model.MainGraph(); - BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); + Graph& graph = model->MainGraph(); ASSERT_STATUS_OK(graph.Resolve()); OrtThreadPoolParams to; @@ -537,15 +475,8 @@ TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { DataTransferManager dtm; ExternalDataLoaderManager edlm; profiling::Profiler profiler; - // Try to load the model without restrictions - // and verify nodes have been placed to CUDA - SessionOptions sess_options; - sess_options.enable_mem_pattern = false; - sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; - sess_options.use_deterministic_compute = false; - sess_options.enable_mem_reuse = false; - SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, + SessionState session_state(model->MainGraph(), execution_providers, tp.get(), nullptr, dtm, edlm, default_logger, profiler, sess_options); GraphPartitioner partitioner(krm, execution_providers); @@ -556,140 +487,75 @@ TEST(SessionStateTest, 
TestResourceAwarePartitioning_NoLimit) { sess_options.config_options, default_logger, GraphPartitioner::Mode::kNormal, debug_graph_fn)); - // All nodes have been placed to CUDA - const auto& graph_nodes = graph.Nodes(); - for (const auto& node : graph_nodes) { - EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); - } + verifier_fn(graph); +} +} // namespace + +TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); + + // Try to load the model without restrictions + // and verify nodes have been placed to CUDA + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); + } + }); +} + +TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); + constexpr const char* limit_setting = "10000,tiny_gpt2_beamsearch_node_stats.txt"; + + // Large limit, all nodes are still assigned + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( + kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting)); + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); + } + }); } -// TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { -// const auto& log_manager = DefaultLoggingManager(); -// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); -// const auto& default_logger = log_manager.DefaultLogger(); -// std::unordered_map domain_to_version; -// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter -// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), -// domain_to_version, {}, default_logger); -// -// // Input Shape -// const std::vector input_shape = {1024, 1024}; -// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb -// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb -// -// auto& graph = model.MainGraph(); -// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); -// ASSERT_STATUS_OK(graph.Resolve()); -// -// OrtThreadPoolParams to; -// to.thread_pool_size = 1; -// auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); -// -// ExecutionProviders execution_providers; -// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); -// tmp_cpu_execution_provider->SetLogger(&default_logger); -// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); -// -// KernelRegistryManager krm; -// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); -// -// DataTransferManager dtm; 
-// ExternalDataLoaderManager edlm; -// profiling::Profiler profiler; -// // Try to load the model without restrictions -// // and verify nodes have been placed to CUDA -// SessionOptions sess_options; -// sess_options.enable_mem_pattern = false; -// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; -// sess_options.use_deterministic_compute = false; -// sess_options.enable_mem_reuse = false; -// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, -// "4206592")); -// -// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, -// default_logger, profiler, sess_options); -// -// GraphPartitioner partitioner(krm, execution_providers); -// layout_transformation::TransformLayoutFunction transform_layout_fn; -// layout_transformation::DebugGraphFn debug_graph_fn; -// ASSERT_STATUS_OK( -// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, -// sess_options.config_options, default_logger, -// GraphPartitioner::Mode::kNormal, debug_graph_fn)); -// -// // All nodes have been placed to CUDA -// const auto& graph_nodes = graph.Nodes(); -// for (const auto& node : graph_nodes) { -// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); -// } -// } - -// TEST(SessionStateTest, TestResourceAwarePartitioning_SecondNodeCutOff) { -// const auto& log_manager = DefaultLoggingManager(); -// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); -// const auto& default_logger = log_manager.DefaultLogger(); -// std::unordered_map domain_to_version; -// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter -// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), -// domain_to_version, {}, default_logger); -// -// // Input Shape -// const std::vector input_shape = {1024, 1024}; -// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb -// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb -// -// auto& graph = model.MainGraph(); -// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); -// ASSERT_STATUS_OK(graph.Resolve()); -// -// OrtThreadPoolParams to; -// to.thread_pool_size = 1; -// auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); -// -// ExecutionProviders execution_providers; -// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); -// tmp_cpu_execution_provider->SetLogger(&default_logger); -// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); -// -// KernelRegistryManager krm; -// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); -// -// DataTransferManager dtm; -// ExternalDataLoaderManager edlm; -// profiling::Profiler profiler; -// // Try to load the model without restrictions -// // and verify nodes have been placed to CUDA -// SessionOptions sess_options; -// sess_options.enable_mem_pattern = false; -// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; -// sess_options.use_deterministic_compute = false; -// sess_options.enable_mem_reuse = false; -// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, -// "16383")); -// -// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, -// default_logger, profiler, sess_options); -// -// GraphPartitioner partitioner(krm, execution_providers); 
-// layout_transformation::TransformLayoutFunction transform_layout_fn; -// layout_transformation::DebugGraphFn debug_graph_fn; -// ASSERT_STATUS_OK( -// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, -// sess_options.config_options, default_logger, -// GraphPartitioner::Mode::kNormal, debug_graph_fn)); -// -// // Second node did not make it to CUDA -// const auto& graph_nodes = graph.Nodes(); -// size_t count = 0; -// for (const auto& node : graph_nodes) { -// if (count == 0) { -// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); -// } else { -// EXPECT_TRUE(node.GetExecutionProviderType().empty()); -// } -// count++; -// } -// } +TEST(SessionStateTest, TestResourceAwarePartitioning_CPUOffloaded) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); + constexpr const char* limit_setting = "5000,tiny_gpt2_beamsearch_node_stats.txt"; + + // Large limit, all nodes are still assigned + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( + kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting)); + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + bool cpu_node_found = false; + for (const auto& node : graph_nodes) { + if (node.GetExecutionProviderType() != kCudaExecutionProvider) { + cpu_node_found = true; + break; + } + } + EXPECT_TRUE(cpu_node_found); + }); +} #endif // USE_CUDA diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 2eee6399960dd..59920487a7248 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -40,6 +40,7 @@ #endif #ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" #include #endif @@ -4778,15 +4779,81 @@ TEST(CApiTest, OrtCustomOp_GetInPlace) { mock_gqa.ReleaseAliasMap(input_index, output_index); } -/*TEST(CApiTest, RunWithNodeStats) { +#if !defined(ORT_MINIMAL_BUILD) && defined(USE_CUDA) + +TEST(CApiTest, GenerateNodeStatsFile) { Ort::Env env(ORT_LOGGING_LEVEL_INFO); - constexpr const ORTCHAR_T* model_path = TSTR("testdata/attention_mask2d_fp32.onnx"); + constexpr const ORTCHAR_T* model_path = TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); Ort::SessionOptions session_options; - session_options.DisableCpuMemArena(); - session_options.DisableMemPattern(); session_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, - "D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_memory.txt"); + "tiny_gpt2_beamsearch_node_stats.txt"); + + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); + + std::vector input_ids_shape{3, 12}; + std::vector input_ids{ + 0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, + 41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, + 0, 0, 0, 52, 328, 219, 328, 206, 288, 227, 896, 328}; + + std::vector parameter_shape{1}; + std::vector max_length{20}; + std::vector min_length{1}; + std::vector num_beams{4}; + std::vector num_return_sequences{1}; + std::vector length_penalty{1.0f}; + std::vector repetition_penalty{1.0f}; + + std::vector 
expected_output_shape{input_ids_shape[0], num_return_sequences[0], max_length[0]}; + std::vector expected_output{ + 0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 131, 131, 131, 181, 638, 638, 638, 638, + 41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, 292, 292, 292, 292, 292, 292, 292, 292, + 0, 0, 0, 52, 328, 219, 328, 206, 288, 227, 896, 328, 328, 669, 669, 669, 669, 669, 669, 669}; + + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + auto input_ids_tensor = Ort::Value::CreateTensor( + info, input_ids.data(), input_ids.size(), input_ids_shape.data(), input_ids_shape.size()); + + auto max_length_tensor = Ort::Value::CreateTensor( + info, max_length.data(), max_length.size(), parameter_shape.data(), parameter_shape.size()); + + auto min_length_tensor = Ort::Value::CreateTensor( + info, min_length.data(), min_length.size(), parameter_shape.data(), parameter_shape.size()); + + auto num_beams_tensor = Ort::Value::CreateTensor( + info, num_beams.data(), num_beams.size(), parameter_shape.data(), parameter_shape.size()); + auto num_return_sequences_tensor = Ort::Value::CreateTensor( + info, num_return_sequences.data(), num_return_sequences.size(), parameter_shape.data(), parameter_shape.size()); + + auto length_penalty_tensor = Ort::Value::CreateTensor( + info, length_penalty.data(), length_penalty.size(), parameter_shape.data(), parameter_shape.size()); + + auto repetition_penalty_tensor = Ort::Value::CreateTensor( + info, repetition_penalty.data(), repetition_penalty.size(), parameter_shape.data(), parameter_shape.size()); + + std::vector ort_inputs; + ort_inputs.push_back(std::move(input_ids_tensor)); + ort_inputs.push_back(std::move(max_length_tensor)); + ort_inputs.push_back(std::move(min_length_tensor)); + ort_inputs.push_back(std::move(num_beams_tensor)); + ort_inputs.push_back(std::move(num_return_sequences_tensor)); + ort_inputs.push_back(std::move(length_penalty_tensor)); + ort_inputs.push_back(std::move(repetition_penalty_tensor)); + const char* input_names[] = {"input_ids", "max_length", "min_length", "num_beams", "num_return_sequences", + "length_penalty", "repetition_penalty"}; + const char* const output_names[] = {"sequences"}; + + // The ONNX model is generated like the following: + // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 + // --output tiny_gpt2_beamsearch_fp16.onnx --use_gpu --max_length 20 + // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) Ort::Session session(env, model_path, session_options); -}*/ \ No newline at end of file + session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(), + output_names, 1); +} + +#endif \ No newline at end of file diff --git a/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt new file mode 100644 index 0000000000000..d9150cf6768f5 --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt @@ -0,0 +1,56 @@ +GptAttention_1_add,18432,0,0,0 +GptAttention_0_matmul,4096,0,0,0 +GptAttention_2_matmul,22528,0,0,0 +FullyConnect_MatMul_5,90112,0,0,0 +GptAttention_3,30720,0,36864,165888 +LayerNorm_4,18432,0,0,0 +GptAttention_1_matmul,22528,0,0,0 +FullyConnect_Add_5,18432,0,0,0 +GptAttention_2_add,18432,0,0,0 +FullyConnect_Add_3,18432,0,0,0 +GptAttention_3_add,18432,0,0,0 +Add_689,18432,0,0,0 +Add_886,18432,0,0,0 +LayerNorm_7,18432,0,0,0 +FullyConnect_MatMul_6,34816,0,0,0 
+GptAttention_4,30720,0,36864,165888 +GptAttention_4_add,18432,0,0,0 +SkipLayerNormalization,18432,0,0,0 +LayerNorm_1,18432,0,0,0 +GptAttention_3_matmul,22528,0,0,0 +LayerNorm_8,18432,0,0,0 +FullyConnect_MatMul_8,34816,0,0,0 +FullyConnect_Add_7,18432,0,0,0 +LayerNorm_9,18432,0,0,0 +FastGelu_AddBias_3,73728,0,0,0 +FullyConnect_Add_1,18432,0,0,0 +GptAttention_4_matmul,22528,0,0,0 +GptAttention_0,13248,0,55296,165888 +FullyConnect_MatMul_2,34816,0,0,0 +FullyConnect_MatMul_9,90112,0,0,0 +MatMul_1165,146432,0,576000,0 +GptAttention_2,30720,0,36864,165888 +LayerNorm_6,18432,0,0,0 +BeamSearch_gpt2,24,0,256,1823244 +FastGelu_AddBias_4,73728,0,0,0 +Add_951,18432,0,0,0 +GptAttention_1,30720,0,36864,165888 +LayerNorm_3,18432,0,0,0 +Add_295,18432,0,0,0 +Add_1083,18432,0,0,0 +EmbedLayerNormalization_0,194944,0,37120,0 +GptAttention_0_add,18432,0,0,0 +FullyConnect_MatMul_7,90112,0,0,0 +FastGelu_AddBias_1,73728,0,0,0 +LayerNorm_2,18432,0,0,0 +FastGelu_AddBias_2,73728,0,0,0 +Add_360,18432,0,0,0 +Add_754,18432,0,0,0 +FullyConnect_MatMul_3,90112,0,0,0 +FullyConnect_MatMul_4,34816,0,0,0 +Add_557,18432,0,0,0 +FullyConnect_MatMul_0,34816,0,73728,0 +FastGelu_AddBias_0,512,0,73728,0 +FullyConnect_MatMul_1,16384,0,0,0 +Add_492,18432,0,0,0 +LayerNorm_5,18432,0,0,0 From 08bfa77c63f903f3930deb6f1e2df062ff8118ed Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 6 Feb 2025 10:27:56 -0800 Subject: [PATCH 4/7] Address Dml build problem --- .../providers/dml/DmlExecutionProvider/src/ExecutionProvider.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 3002177db13f4..7f420f8850001 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -93,7 +93,8 @@ namespace Dml GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant* resource_accountant) const; + onnxruntime::IResourceAccountant* resource_accountant, + const onnxruntime::logging::Logger& logger) const; uint32_t GetSupportedDeviceDataTypeMask() const; From 6d57fa4e71c545f9c81374059b0d15055d1cd5a2 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 6 Feb 2025 14:24:10 -0800 Subject: [PATCH 5/7] Address comments --- include/onnxruntime/core/framework/op_kernel_context.h | 2 -- onnxruntime/core/framework/execution_frame.cc | 2 -- onnxruntime/core/framework/op_kernel.cc | 5 ----- onnxruntime/core/framework/op_kernel_context_internal.h | 8 -------- .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 2 +- onnxruntime/core/session/inference_session.cc | 2 +- onnxruntime/core/session/onnxruntime_c_api.cc | 1 - 7 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index e9a1490dedc34..3fd9ee0d8b292 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -204,8 +204,6 @@ class OpKernelContext { virtual OrtValue* GetOrCreateOutputMLValue(int index); - virtual int GetOrtValueIndexForInput(int input_index) const; - virtual int GetOrtValueIndexForOutput(int output_index) const; private: diff --git 
a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index bc13c30294875..c5046353ba528 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -23,8 +23,6 @@ #include "core/framework/bfc_arena.h" -#include "core/session/onnxruntime_session_options_config_keys.h" - using namespace onnxruntime::common; namespace onnxruntime { diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc index 1d05cb4e5e818..212ce9c5069ea 100644 --- a/onnxruntime/core/framework/op_kernel.cc +++ b/onnxruntime/core/framework/op_kernel.cc @@ -130,11 +130,6 @@ OrtValue* OpKernelContext::GetOrCreateOutputMLValue(int index) { return value; } -int OpKernelContext::GetOrtValueIndexForInput(int input_index) const { - int input_arg_index = GetInputArgIndex(input_index); - return execution_frame_->GetNodeIdxToMLValueIdx(input_arg_index); -} - int OpKernelContext::GetOrtValueIndexForOutput(int output_index) const { int output_arg_index = GetOutputArgIndex(output_index); return execution_frame_->GetNodeIdxToMLValueIdx(output_arg_index); diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index 64932dce50917..4c7ee10a07691 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -59,10 +59,6 @@ class OpKernelContextInternal : public OpKernelContext { return OpKernelContext::GetInputMLValue(index); } - const OrtValue* GetImplicitInputMLValue(int index) const override { - return OpKernelContext::GetImplicitInputMLValue(index); - } - OrtValue* GetOutputMLValue(int index) { return OpKernelContext::GetOutputMLValue(index); } @@ -82,10 +78,6 @@ class OpKernelContextInternal : public OpKernelContext { return implicit_input_values_; } - int GetOrtValueIndexForInput(int input_index) const override { - return OpKernelContext::GetOrtValueIndexForInput(input_index); - } - int GetOrtValueIndexForOutput(int output_index) const override { return OpKernelContext::GetOrtValueIndexForOutput(output_index); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index dd868ddd8307a..9d23b8b950272 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -878,7 +878,7 @@ namespace Dml ExecutionProviderImpl::GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant*) const { + onnxruntime::IResourceAccountant*, const onnxruntime::logging::Logger& logger) const { uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask(); // Each bit corresponds to each DML_TENSOR_DATA_TYPE. 
std::vector> result; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index fb0fcd55ffc63..71b1cad06f3f5 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2749,7 +2749,7 @@ Status InferenceSession::Run(const RunOptions& run_options, #endif #if !defined(ORT_MINIMAL_BUILD) - if (node_stats_recorder_.has_value() && retval.IsOK()) { + if (IsNodeStatsCollectionEnabled() && retval.IsOK()) { // Dump node stats if the run was successful node_stats_recorder_->DumpStats(session_state_->GetGraphViewer().ModelPath()); node_stats_recorder_->ResetPerRunNameDeduper(); diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 3761b4ca0ec41..ca6950af0227a 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -5,7 +5,6 @@ #include "core/session/allocator_adapters.h" #include "core/session/inference_session_utils.h" #include "core/session/IOBinding.h" -#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/framework/allocator.h" #include "core/framework/error_code_helper.h" #include "core/framework/execution_provider.h" From d7504060eb1df87b7b157f6e15b457b2a40775b1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 10 Feb 2025 12:46:06 -0800 Subject: [PATCH 6/7] Make threshold optional --- .../core/framework/resource_accountant.cc | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 786da13e69458..391f010c79f37 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -21,12 +21,16 @@ class SizeTAccountant : public IResourceAccountant { SizeTAccountant() = default; ~SizeTAccountant() = default; - explicit SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} + explicit SizeTAccountant(InlinedHashMap&& node_stats) + : IResourceAccountant(), node_stats_(std::move(node_stats)) {} + ResourceCount GetConsumedAmount() const noexcept override { return consumed_amount_; } + void AddConsumedAmount(const ResourceCount& amount) noexcept override { if (std::holds_alternative(amount)) { consumed_amount_ += std::get(amount); @@ -151,10 +155,11 @@ Status NodeStatsRecorder::CreateAccountants( kOrtSessionOptionsResourceCudaPartitioningSettings, ""); if (!resource_partitioning_settings.empty()) { - auto splits = utils::SplitString(resource_partitioning_settings, ",", false); + auto splits = utils::SplitString(resource_partitioning_settings, ",", true); if (splits.size() == 2) { - SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); - cuda_memory_limit *= 1024; // to bytes + if (splits[1].empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid resource partitioning settings"); + } InlinedHashMap loaded_stats; ORT_RETURN_IF_ERROR(LoadNodeAllocationStats(model_path, splits[1], loaded_stats)); @@ -162,9 +167,17 @@ Status NodeStatsRecorder::CreateAccountants( std::optional result; auto& map = result.emplace(); - map.insert_or_assign(kCudaExecutionProvider, - std::make_unique(cuda_memory_limit, - std::move(loaded_stats))); + if (!splits[0].empty()) { + SafeInt cuda_memory_limit = 
std::stoul(std::string{splits[0]}); + cuda_memory_limit *= 1024; // to bytes + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(cuda_memory_limit, + std::move(loaded_stats))); + } else { + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(std::move(loaded_stats))); + } + acc_map = std::move(result); } } From 128d2fbdff47d559e190048a502aa9ce6ad1c48e Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 11 Feb 2025 17:53:22 -0800 Subject: [PATCH 7/7] Address review comments --- .../core/framework/resource_accountant.h | 11 +- include/onnxruntime/core/graph/graph.h | 7 -- .../core/graph/indexed_sub_graph.h | 19 +-- .../onnxruntime_session_options_config_keys.h | 7 +- .../core/framework/graph_partitioner.cc | 6 +- .../core/framework/resource_accountant.cc | 45 ++++++- .../core/framework/sequential_executor.cc | 2 +- onnxruntime/core/graph/graph.cc | 36 ------ .../providers/cuda/cuda_execution_provider.cc | 2 +- .../shared_library/provider_interfaces.h | 1 - .../shared_library/provider_wrappedtypes.h | 3 - .../core/session/provider_bridge_ort.cc | 1 - .../tiny_gpt2_beamsearch_node_stats.txt | 113 +++++++++--------- 13 files changed, 120 insertions(+), 133 deletions(-) diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h index 1f2e9ea5ccfb0..274750a505fbd 100644 --- a/include/onnxruntime/core/framework/resource_accountant.h +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -16,6 +16,11 @@ namespace onnxruntime { struct ConfigOptions; +#ifndef SHARED_PROVIDER +class Node; +#else +struct Node; +#endif // Common holder for potentially different resource accounting // for different EPs @@ -40,7 +45,7 @@ class IResourceAccountant { virtual ResourceCount GetConsumedAmount() const = 0; virtual void AddConsumedAmount(const ResourceCount& amount) = 0; virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0; - virtual ResourceCount ComputeResourceCount(const std::string& node_name) const = 0; + virtual ResourceCount ComputeResourceCount(const Node& node) const = 0; std::optional GetThreshold() const { return threshold_; @@ -52,6 +57,8 @@ class IResourceAccountant { bool IsStopIssued() const noexcept { return stop_assignment_; } + static std::string MakeUniqueNodeName(const Node& node); + private: bool stop_assignment_ = false; std::optional threshold_; @@ -101,7 +108,7 @@ class NodeStatsRecorder { void DumpStats(const std::filesystem::path& model_path) const; - static Status CreateAccountants( + [[nodiscard]] static Status CreateAccountants( const ConfigOptions& config_options, const std::filesystem::path& model_path, std::optional& acc_map); diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 1eaf2119f34fe..7798394b045dc 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -883,13 +883,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return ConstGraphNodes(nodes_, std::move(filter_func)); } - /** Compute node memory requirements, which is mostly initializers - and large attributes that are copied on device (special cases for some nodes) - - Returns no value if the node was not found. - */ - size_t ComputeNodeMemoryUsage(NodeIndex) const; - /** Gets the maximum NodeIndex value used in the Graph. WARNING: This actually returns the max index value used + 1. 
*/ diff --git a/include/onnxruntime/core/graph/indexed_sub_graph.h b/include/onnxruntime/core/graph/indexed_sub_graph.h index 959b183e272ea..e457d3dcad1f1 100644 --- a/include/onnxruntime/core/graph/indexed_sub_graph.h +++ b/include/onnxruntime/core/graph/indexed_sub_graph.h @@ -84,16 +84,14 @@ struct IndexedSubGraph { // if present and adds it to the consumed amount void AccountForNode(size_t cost_index) const { assert(cost_index < nodes_costs.size()); - if (nodes_costs[cost_index].has_value()) { - resource_accountant->AddConsumedAmount(*nodes_costs[cost_index]); - } + resource_accountant->AddConsumedAmount(nodes_costs[cost_index]); } // This computes and accounts for the resource cost for the node that just // been fused from other nodes, and the EP did not had a chance to compute the costs. - void ComputeAndAccountForNode(const std::string& node_name) const { + void ComputeAndAccountForNode(const Node& node) const { assert(resource_accountant != nullptr); - resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node_name)); + resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node)); } void SetAccountant(IResourceAccountant* res_accountant) { @@ -106,22 +104,13 @@ struct IndexedSubGraph { nodes_costs.emplace_back(cost); } - // Append an absent cost for the node that was already accounted for. - void AppendNodeEmptyCost() { - assert(resource_accountant != nullptr); - nodes_costs.emplace_back(); - } - private: // subgraph meta definition. std::unique_ptr meta_def_; // Optional resource accountant for this subgraph. IResourceAccountant* resource_accountant = nullptr; // Vector with resource costs for nodes above. Should have the same size - // Some nodes that were previously accounted for because they already been assigned to an EP - // for example during multiple calls to GetCapabiility() will not have resource count present. - // may not have a resource count present, we skip it. - InlinedVector> nodes_costs; + InlinedVector nodes_costs; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index a50b19e4a8a56..f97964f49b582 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -286,8 +286,11 @@ static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "session.collect_node_memory_stats_to_file"; /// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats" -/// "limit > 0": enables Capacity Aware Partitioning for Cuda EP. The EP will place nodes on device -/// "file name" : this file is expected to be found at the same folder with the model. The file contains +/// "limit > 0": enables Capacity Aware Partitioning for Cuda EP. `limit` is optional and when absent +/// the provider may attempt to figure out the memory available automatically. +/// The setting with no limit is expected to look like: ",file name for collected stats" +/// The EP will place nodes on the device until the limit is reached. +/// "file name": this file is expected to be found in the same folder as the model.
The file contains /// pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enforce (see above) static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings = "session.resource_cuda_partitioning_settings"; diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 08ddfd872ca78..1ec99f3dc8625 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -351,7 +351,7 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, // that the fused node would use no more memory when the nodes we are fusing. // and potentially less than that, and therefore, no threshold check is needed here. // All threshold checks are done within the EP. - capability.ComputeAndAccountForNode(fused_node->Name()); + capability.ComputeAndAccountForNode(*fused_node); } result = fused_node; @@ -885,7 +885,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param Node& fused_node = graph.BeginFuseSubGraph(indexed_sub_graph, node_name); fused_node.SetExecutionProviderType(type); if (indexed_sub_graph.IsAccountingEnabled()) { - indexed_sub_graph.ComputeAndAccountForNode(fused_node.Name()); + indexed_sub_graph.ComputeAndAccountForNode(fused_node); } // create filtered graph viewer for this set of nodes @@ -932,7 +932,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, node); if (acc_enabled) { - compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node.Name()); + compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node); } } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 391f010c79f37..4d537219ec714 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -4,11 +4,15 @@ #include "core/framework/resource_accountant.h" #include "core/common/inlined_containers.h" +#include "core/common/narrow.h" +#include "core/common/parse_string.h" #include "core/common/safeint.h" #include "core/common/string_utils.h" #include "core/framework/config_options.h" +#include "core/framework/murmurhash3.h" #include "core/graph/constants.h" +#include "core/graph/graph.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include @@ -42,7 +46,8 @@ class SizeTAccountant : public IResourceAccountant { } } - ResourceCount ComputeResourceCount(const std::string& node_name) const override { + ResourceCount ComputeResourceCount(const Node& node) const override { + const auto node_name = MakeUniqueNodeName(node); auto hit = node_stats_.find(node_name); if (hit != node_stats_.end()) { const auto& stats = hit->second; @@ -88,11 +93,13 @@ void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const Node auto result = impl_->node_stats.emplace(node_name, stats); if (!result.second) { // Node already exists, update the stats + // This may happen when the user collects stats from multiple Runs() result.first->second.UpdateIfGreater(stats); } } void NodeStatsRecorder::DumpStats(std::ostream& os) const { + os << 
"#name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations\n"; for (const auto& [name, stats] : impl_->node_stats) { os << name << "," << stats.input_sizes << "," << stats.initializers_sizes << "," << stats.total_dynamic_sizes << "," @@ -128,6 +135,8 @@ static Status LoadNodeAllocationStats( std::string line; // Read and load a CSV file line by line while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + auto splits = utils::SplitString(line, ",", true); ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); if (splits[0].empty()) { @@ -138,8 +147,8 @@ static Status LoadNodeAllocationStats( size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); - node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, - total_dynamic_sizes, total_temp_allocations}); + node_stats.insert_or_assign(std::move(node_name), {input_sizes, initializers_sizes, + total_dynamic_sizes, total_temp_allocations}); } result.swap(node_stats); @@ -168,8 +177,9 @@ Status NodeStatsRecorder::CreateAccountants( auto& map = result.emplace(); if (!splits[0].empty()) { - SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); - cuda_memory_limit *= 1024; // to bytes + size_t cuda_memory_limit = 0; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(std::string{splits[0]}, cuda_memory_limit)); + cuda_memory_limit = SafeInt(cuda_memory_limit) * 1024; // to bytes map.insert_or_assign(kCudaExecutionProvider, std::make_unique(cuda_memory_limit, std::move(loaded_stats))); @@ -179,10 +189,35 @@ Status NodeStatsRecorder::CreateAccountants( } acc_map = std::move(result); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid format for: ", + kOrtSessionOptionsResourceCudaPartitioningSettings, + " : expecting comma separated fields"); } } return Status::OK(); } +std::string IResourceAccountant::MakeUniqueNodeName(const Node& node) { + std::string result; + + uint32_t hash[4] = {0, 0, 0, 0}; + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), narrow(str.size()), hash[0], &hash); + }; + + const auto& node_name = (node.Name().empty()) ? node.OpType() : node.Name(); + + for (const auto& def : node.InputDefs()) { + hash_str(def->Name()); + } + + HashValue node_hash = hash[0] | (uint64_t(hash[1]) << 32); + result.reserve(node_name.size() + 1 + 16); + result.append(node_name).append("_").append(std::to_string(node_hash)); + + return result; +} + } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 35ae33328837c..26a57ec3ea02f 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -545,7 +545,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, } // Record node allocation stats - const auto& name = (node.Name().empty()) ? 
node.OpType() : node.Name(); + const std::string name = IResourceAccountant::MakeUniqueNodeName(node); node_stats_recorder->ReportNodeStats(name, node_stats); } #endif diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index d0a1280ce9895..e4915616b7b7c 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -5537,42 +5537,6 @@ Graph::Graph(const Model& owning_model, is_loaded_from_model_file_(true) { // true as the Graph isn't manually constructed from scratch } -size_t Graph::ComputeNodeMemoryUsage(NodeIndex node_idx) const { - /// XXX: In some cases some kernels can copy its attributes to a device - // those are edge cases which we currently do not account for. - const Node* node = GetNode(node_idx); - if (node != nullptr) { - SafeInt result = 0; - for (const auto* input : node->InputDefs()) { - if (input->Exists()) { - // Let's see if this is an initializer - constexpr const bool check_outer_scope_true = true; - const ONNX_NAMESPACE::TensorProto* initializer = - GetConstantInitializer(input->Name(), check_outer_scope_true); - if (initializer != nullptr) { - size_t out; - if (utils::GetSizeInBytesFromTensorProto<0>(*initializer, &out).IsOK()) { - result += out; - } - } else { - const auto* proto = input->TypeAsProto(); - if (proto != nullptr && utils::HasTensorType(*proto)) { - const auto& tensor_type = proto->tensor_type(); - if (utils::HasElemType(tensor_type) && utils::HasShape(tensor_type)) { - size_t size; - if (utils::GetSizeInBytesFromTensorTypeProto<0>(tensor_type, &size).IsOK()) { - result += size; - } - } - } - } - } - } - return static_cast(result); - } - return 0; -} - common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, const OrtFormatLoadOptions& load_options) { // We deserialize the graph from ORT format in the following order: diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 86909fd272be3..b675c08e5f804 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -2771,7 +2771,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, result.push_back(ComputeCapability::Create(std::move(sub_graph))); } else { auto* node = graph.GetNode(node_index); - auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(node->Name())); + auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(*node)); const auto would_be_consumed = resource_count + consumed_memory; LOGS(logger, INFO) << "CUDA_EP Node: " << node_index << " Memory usage : " << resource_count << " would be consumed " << static_cast(would_be_consumed) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 059a722958118..0dd771f522336 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -670,7 +670,6 @@ struct ProviderHost { virtual IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) = 0; virtual void IndexedSubGraph__SetAccountant(IndexedSubGraph* p, IResourceAccountant*) = 0; virtual void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& count) = 0; - virtual void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) = 0; // KernelDef virtual void 
KernelDef__operator_delete(KernelDef* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 6441547cab914..a502ce9c66f69 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -595,9 +595,6 @@ struct IndexedSubGraph final { void AppendNodeCost(const ResourceCount& resource_count) { g_host->IndexedSubGraph__AppendNodeCost(this, resource_count); } - void AppendNodeEmptyCost() { - g_host->IndexedSubGraph__AppendNodeEmptyCost(this); - } IndexedSubGraph() = delete; IndexedSubGraph(const IndexedSubGraph&) = delete; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 83ba757886ff5..a1cd9af3b5091 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -844,7 +844,6 @@ struct ProviderHostImpl : ProviderHost { void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& resource_count) override { p->AppendNodeCost(resource_count); } - void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) override { p->AppendNodeEmptyCost(); } // KernelDef (wrapped) void KernelDef__operator_delete(KernelDef* p) override { delete p; } diff --git a/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt index d9150cf6768f5..df1e0c48825a0 100644 --- a/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt +++ b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt @@ -1,56 +1,57 @@ -GptAttention_1_add,18432,0,0,0 -GptAttention_0_matmul,4096,0,0,0 -GptAttention_2_matmul,22528,0,0,0 -FullyConnect_MatMul_5,90112,0,0,0 -GptAttention_3,30720,0,36864,165888 -LayerNorm_4,18432,0,0,0 -GptAttention_1_matmul,22528,0,0,0 -FullyConnect_Add_5,18432,0,0,0 -GptAttention_2_add,18432,0,0,0 -FullyConnect_Add_3,18432,0,0,0 -GptAttention_3_add,18432,0,0,0 -Add_689,18432,0,0,0 -Add_886,18432,0,0,0 -LayerNorm_7,18432,0,0,0 -FullyConnect_MatMul_6,34816,0,0,0 -GptAttention_4,30720,0,36864,165888 -GptAttention_4_add,18432,0,0,0 -SkipLayerNormalization,18432,0,0,0 -LayerNorm_1,18432,0,0,0 -GptAttention_3_matmul,22528,0,0,0 -LayerNorm_8,18432,0,0,0 -FullyConnect_MatMul_8,34816,0,0,0 -FullyConnect_Add_7,18432,0,0,0 -LayerNorm_9,18432,0,0,0 -FastGelu_AddBias_3,73728,0,0,0 -FullyConnect_Add_1,18432,0,0,0 -GptAttention_4_matmul,22528,0,0,0 -GptAttention_0,13248,0,55296,165888 -FullyConnect_MatMul_2,34816,0,0,0 -FullyConnect_MatMul_9,90112,0,0,0 -MatMul_1165,146432,0,576000,0 -GptAttention_2,30720,0,36864,165888 -LayerNorm_6,18432,0,0,0 -BeamSearch_gpt2,24,0,256,1823244 -FastGelu_AddBias_4,73728,0,0,0 -Add_951,18432,0,0,0 -GptAttention_1,30720,0,36864,165888 -LayerNorm_3,18432,0,0,0 -Add_295,18432,0,0,0 -Add_1083,18432,0,0,0 -EmbedLayerNormalization_0,194944,0,37120,0 -GptAttention_0_add,18432,0,0,0 -FullyConnect_MatMul_7,90112,0,0,0 -FastGelu_AddBias_1,73728,0,0,0 -LayerNorm_2,18432,0,0,0 -FastGelu_AddBias_2,73728,0,0,0 -Add_360,18432,0,0,0 -Add_754,18432,0,0,0 -FullyConnect_MatMul_3,90112,0,0,0 -FullyConnect_MatMul_4,34816,0,0,0 -Add_557,18432,0,0,0 -FullyConnect_MatMul_0,34816,0,73728,0 -FastGelu_AddBias_0,512,0,73728,0 -FullyConnect_MatMul_1,16384,0,0,0 -Add_492,18432,0,0,0 -LayerNorm_5,18432,0,0,0 
+#name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations +GptAttention_1_matmul_3390928670334833856,22528,0,0,0 +LayerNorm_8_16340230589392852003,18432,0,0,0 +LayerNorm_6_9539917679182944001,18432,0,0,0 +LayerNorm_4_3998281518089755446,18432,0,0,0 +Add_295_12458934867448263403,18432,0,0,0 +GptAttention_1_5945223373512700064,30720,0,36864,165888 +FastGelu_AddBias_0_8293496556664011978,512,0,73728,0 +FullyConnect_MatMul_7_9121431797220490115,90112,0,0,0 +GptAttention_0_7799922821510396356,13248,0,55296,165888 +GptAttention_2_13772881973491265914,30720,0,36864,165888 +LayerNorm_1_10060807585253518719,18432,0,0,0 +LayerNorm_5_12297409543002935527,18432,0,0,0 +Add_492_15870509848159592443,18432,0,0,0 +FullyConnect_MatMul_5_12754193998971094488,90112,0,0,0 +LayerNorm_7_11450735811828114024,18432,0,0,0 +FullyConnect_Add_5_4749853671277160818,18432,0,0,0 +GptAttention_3_add_5419272690383812111,18432,0,0,0 +FullyConnect_MatMul_8_14154070846330210236,34816,0,0,0 +FullyConnect_MatMul_9_9215108924175066058,90112,0,0,0 +GptAttention_2_add_7251589488810842639,18432,0,0,0 +FullyConnect_Add_7_2612800351421913827,18432,0,0,0 +GptAttention_1_add_3894862726029568115,18432,0,0,0 +FullyConnect_MatMul_2_4814122527985171273,34816,0,0,0 +LayerNorm_3_3589946186712403351,18432,0,0,0 +GptAttention_3_8921810316598002134,30720,0,36864,165888 +LayerNorm_9_9113032450990548295,18432,0,0,0 +Add_886_7198133075029541336,18432,0,0,0 +Add_689_16588197583517413999,18432,0,0,0 +GptAttention_3_matmul_14740826065423798917,22528,0,0,0 +FastGelu_AddBias_4_17289691003819959460,73728,0,0,0 +Add_754_3697562882104452642,18432,0,0,0 +FullyConnect_MatMul_4_3508821612885617837,34816,0,0,0 +FastGelu_AddBias_1_17699324882619485158,73728,0,0,0 +FullyConnect_MatMul_3_17781936527365066348,90112,0,0,0 +GptAttention_2_matmul_7328860221231123895,22528,0,0,0 +SkipLayerNormalization_6957325406340516852,18432,0,0,0 +BeamSearch_gpt2_3957842931497654942,24,0,256,1823244 +GptAttention_4_matmul_90143216136586800,22528,0,0,0 +FullyConnect_MatMul_6_11858231833228352542,34816,0,0,0 +GptAttention_0_matmul_16767551145055538728,4096,0,0,0 +FullyConnect_Add_3_17196504264676187520,18432,0,0,0 +GptAttention_0_add_9807374014361508564,18432,0,0,0 +FullyConnect_MatMul_1_17322107022932292417,16384,0,0,0 +GptAttention_4_14364416985904266109,30720,0,36864,165888 +FullyConnect_MatMul_0_3724322618026197588,34816,0,73728,0 +Add_557_10312911821132522354,18432,0,0,0 +Add_360_12940403527838064497,18432,0,0,0 +FastGelu_AddBias_3_13817144420946871274,73728,0,0,0 +EmbedLayerNormalization_0_7260843146120485633,194944,0,37120,0 +FastGelu_AddBias_2_7906787140370676932,73728,0,0,0 +MatMul_1165_4290064500958888402,146432,0,576000,0 +GptAttention_4_add_15131081400494402711,18432,0,0,0 +Add_1083_4580993573699232732,18432,0,0,0 +Add_951_2303460452509012571,18432,0,0,0 +LayerNorm_2_2575702077895349965,18432,0,0,0 +FullyConnect_Add_1_7648227151832366839,18432,0,0,0
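For reference, the two session options introduced in this series are intended to be used as a pair: one run with kOrtSessionOptionsCollectNodeMemoryStatsToFile dumps per-node memory stats, and a later session feeds that file (optionally prefixed with a per-device budget in KB) through kOrtSessionOptionsResourceCudaPartitioningSettings so the CUDA EP stops taking nodes once the budget is exhausted. The sketch below is illustrative only, not part of the patches: it assumes a CUDA-enabled build, the model path, stats file name, and 5000 KB budget are example values, and the config keys are spelled out as the string literals that back the constants defined in onnxruntime_session_options_config_keys.h.

// Illustrative sketch only. Uses the public C++ API; paths and the budget are placeholders.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING);

  // Pass 1: run once on CUDA and record per-node memory stats.
  // The stats file is expected to end up next to the model for the second pass.
  {
    Ort::SessionOptions so;
    so.AddConfigEntry("session.collect_node_memory_stats_to_file", "model_node_stats.txt");
    OrtCUDAProviderOptions cuda_options{};
    so.AppendExecutionProvider_CUDA(cuda_options);
    Ort::Session session(env, ORT_TSTR("model.onnx"), so);
    // ... session.Run(...) with representative inputs; stats are dumped after a successful run ...
  }

  // Pass 2: feed the recorded stats back with a 5000 KB device budget.
  // Nodes whose estimated consumption would exceed the budget are left for the CPU EP.
  {
    Ort::SessionOptions so;
    so.AddConfigEntry("session.resource_cuda_partitioning_settings", "5000,model_node_stats.txt");
    OrtCUDAProviderOptions cuda_options{};
    so.AppendExecutionProvider_CUDA(cuda_options);
    Ort::Session session(env, ORT_TSTR("model.onnx"), so);
    // ... session.Run(...) as usual ...
  }
  return 0;
}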