From 948b0343d549ca7e97309f7cbc923ea81b5c3efe Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Fri, 1 Nov 2024 16:52:40 -0700
Subject: [PATCH 1/7] Capacity Aware Partitioning begins

Implement GetSizeFromTensorTypeProto
Wire in accounting
Make CUDA EP resource aware and account on assignment
Fix missing accountant for Ort format
Remove redundant functions
Remove unnecessary interface
Fix DML issue, minor fixes
Fix alert
DEMO changes
Implement node memory stats collection
Place container in the session.
Support nested graphs
Add synchronization
Update stats for the max consumption.
Introduce input sizes computation.
---
 .../core/framework/execution_provider.h | 5 +-
 .../core/framework/op_kernel_context.h | 4 +
 .../core/framework/resource_accountant.h | 100 +++++++
 include/onnxruntime/core/graph/graph.h | 7 +
 .../core/graph/indexed_sub_graph.h | 49 ++++
 .../onnxruntime_session_options_config_keys.h | 25 ++
 onnxruntime/core/framework/execution_frame.cc | 8 +
 onnxruntime/core/framework/execution_frame.h | 20 +-
 .../core/framework/execution_provider.cc | 3 +-
 .../core/framework/graph_partitioner.cc | 159 +++++++++--
 onnxruntime/core/framework/op_kernel.cc | 10 +
 .../framework/op_kernel_context_internal.h | 67 +++++
 .../core/framework/resource_accountant.cc | 47 ++++
 .../core/framework/sequential_executor.cc | 61 ++++-
 onnxruntime/core/framework/session_state.h | 22 ++
 .../core/framework/tensorprotoutils.cc | 60 +++--
 onnxruntime/core/framework/tensorprotoutils.h | 3 +
 onnxruntime/core/graph/graph.cc | 36 +++
 .../providers/acl/acl_execution_provider.cc | 3 +-
 .../providers/acl/acl_execution_provider.h | 3 +-
 .../providers/cann/cann_execution_provider.cc | 3 +-
 .../providers/cann/cann_execution_provider.h | 3 +-
 .../coreml/coreml_execution_provider.cc | 3 +-
 .../coreml/coreml_execution_provider.h | 3 +-
 .../providers/cuda/cuda_execution_provider.cc | 72 ++++-
 .../providers/cuda/cuda_execution_provider.h | 3 +-
 .../src/ExecutionProvider.cpp | 10 +-
 .../src/ExecutionProvider.h | 10 +-
 .../providers/dnnl/dnnl_execution_provider.cc | 3 +-
 .../providers/dnnl/dnnl_execution_provider.h | 3 +-
 .../providers/js/js_execution_provider.cc | 3 +-
 .../core/providers/js/js_execution_provider.h | 3 +-
 .../migraphx/migraphx_execution_provider.cc | 3 +-
 .../migraphx/migraphx_execution_provider.h | 3 +-
 .../nnapi_builtin/nnapi_execution_provider.cc | 5 +-
 .../nnapi_builtin/nnapi_execution_provider.h | 3 +-
 .../openvino/openvino_execution_provider.cc | 3 +-
 .../openvino/openvino_execution_provider.h | 3 +-
 .../providers/qnn/qnn_execution_provider.cc | 3 +-
 .../providers/qnn/qnn_execution_provider.h | 3 +-
 .../rknpu/rknpu_execution_provider.cc | 3 +-
 .../rknpu/rknpu_execution_provider.h | 3 +-
 .../providers/rocm/rocm_execution_provider.cc | 3 +-
 .../providers/rocm/rocm_execution_provider.h | 3 +-
 .../provider_bridge_provider.cc | 5 +-
 .../shared_library/provider_interfaces.h | 8 +-
 .../shared_library/provider_wrappedtypes.h | 10 +
 .../providers/snpe/snpe_execution_provider.cc | 3 +-
 .../providers/snpe/snpe_execution_provider.h | 3 +-
 .../tensorrt/tensorrt_execution_provider.cc | 3 +-
 .../tensorrt/tensorrt_execution_provider.h | 3 +-
 .../vitisai/vitisai_execution_provider.cc | 2 +-
 .../vitisai/vitisai_execution_provider.h | 3 +-
 .../vsinpu/vsinpu_execution_provider.cc | 4 +-
 .../vsinpu/vsinpu_execution_provider.h | 3 +-
 .../webgpu/webgpu_execution_provider.cc | 3 +-
 .../webgpu/webgpu_execution_provider.h | 3 +-
 .../webnn/webnn_execution_provider.cc | 3 +-
 .../webnn/webnn_execution_provider.h | 3 +-
.../xnnpack/xnnpack_execution_provider.cc | 3 +- .../xnnpack/xnnpack_execution_provider.h | 3 +- onnxruntime/core/session/inference_session.cc | 38 +++ onnxruntime/core/session/inference_session.h | 31 +++ onnxruntime/core/session/onnxruntime_c_api.cc | 1 + .../core/session/provider_bridge_ort.cc | 15 +- .../test/framework/inference_session_test.cc | 204 ++++++++++---- .../test/framework/session_state_test.cc | 253 ++++++++++++++++++ onnxruntime/test/framework/test_utils.h | 41 ++- .../internal_testing_execution_provider.cc | 3 +- .../internal_testing_execution_provider.h | 3 +- .../test/providers/qnn/qnn_test_utils.cc | 4 +- onnxruntime/test/shared_lib/test_inference.cc | 13 + 72 files changed, 1366 insertions(+), 154 deletions(-) create mode 100644 include/onnxruntime/core/framework/resource_accountant.h create mode 100644 onnxruntime/core/framework/resource_accountant.cc diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 0d9e6db1a7748..c9a15de9ef897 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -38,6 +38,8 @@ struct OrtRunOptions; namespace onnxruntime { +class IResourceAccountant; + /** Logical device representation. */ @@ -130,7 +132,8 @@ class IExecutionProvider { */ virtual std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant = nullptr) const; /** Get kernel registry per execution provider type. diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index ac22d9130983a..a67d7b8ae0174 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -204,6 +204,10 @@ class OpKernelContext { virtual OrtValue* GetOrCreateOutputMLValue(int index); + virtual int GetOrtValueIndexForInput(int input_index) const; + + virtual int GetOrtValueIndexForOutput(int output_index) const; + private: ORT_DISALLOW_COPY_AND_ASSIGNMENT(OpKernelContext); int GetInputArgIndex(int index) const; diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h new file mode 100644 index 0000000000000..982b37c969fe7 --- /dev/null +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "core/common/common.h" + +namespace onnxruntime { + +// Common holder for potentially different resource accounting +// for different EPs +using ResourceCount = std::variant; + +/// +/// This class is used for graph partitioning by EPs +/// It stores the cumulative amount of the resource such as +/// memory that would be consumed by the graph nodes if it is assigned to the EP. +/// +/// It provides interfaces to add, remove and query the resource consumption. +/// +/// Each provider may assign its own meaning to the resource according to its constraints. 
+/// +class IResourceAccountant { + protected: + IResourceAccountant() = default; + IResourceAccountant(const ResourceCount& threshold) : threshold_(threshold) {} + + public: + virtual ~IResourceAccountant() = default; + virtual ResourceCount GetConsumedAmount() const = 0; + virtual void AddConsumedAmount(const ResourceCount& amount) = 0; + virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0; + virtual ResourceCount ComputeResourceCount(const std::string& node_name) const = 0; + + std::optional GetThreshold() const { + return threshold_; + } + + void SetStopAssignment() noexcept { + stop_assignment_ = true; + } + + bool IsStopIssued() const noexcept { return stop_assignment_; } + + private: + bool stop_assignment_ = false; + std::optional threshold_; +}; + +// This struct keeps accounting of the memory allocation stats +// for a kernel during runtime if enabled. +struct NodeAllocationStats { + size_t input_sizes = 0; + size_t initializers_sizes = 0; + size_t total_dynamic_sizes = 0; + size_t total_temp_allocations = 0; + + NodeAllocationStats& operator+=(const NodeAllocationStats& other) { + input_sizes += other.input_sizes; + initializers_sizes += other.initializers_sizes; + total_dynamic_sizes += other.total_dynamic_sizes; + total_temp_allocations += other.total_temp_allocations; + return *this; + } + + void UpdateIfGreater(const NodeAllocationStats& other) { + input_sizes = std::max(input_sizes, other.input_sizes); + initializers_sizes = std::max(initializers_sizes, other.initializers_sizes); + total_dynamic_sizes = std::max(total_dynamic_sizes, other.total_dynamic_sizes); + total_temp_allocations = std::max(total_temp_allocations, other.total_temp_allocations); + } +}; + +class NodeStatsRecorder { + public: + explicit NodeStatsRecorder(const std::filesystem::path& stats_file_name); + ~NodeStatsRecorder(); + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NodeStatsRecorder); + + const std::filesystem::path& GetNodeStatsFileName() const noexcept; + + void ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats); + + void DumpStats(std::ostream& os) const; + + private: + // We would like to hide certain things that may not compile + // with some device compilers + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace onnxruntime diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 7798394b045dc..1eaf2119f34fe 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -883,6 +883,13 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return ConstGraphNodes(nodes_, std::move(filter_func)); } + /** Compute node memory requirements, which is mostly initializers + and large attributes that are copied on device (special cases for some nodes) + + Returns no value if the node was not found. + */ + size_t ComputeNodeMemoryUsage(NodeIndex) const; + /** Gets the maximum NodeIndex value used in the Graph. WARNING: This actually returns the max index value used + 1. 
*/
diff --git a/include/onnxruntime/core/graph/indexed_sub_graph.h b/include/onnxruntime/core/graph/indexed_sub_graph.h
index c57db41254159..959b183e272ea 100644
--- a/include/onnxruntime/core/graph/indexed_sub_graph.h
+++ b/include/onnxruntime/core/graph/indexed_sub_graph.h
@@ -7,6 +7,8 @@
 #include
 #include
+#include "core/common/inlined_containers_fwd.h"
+#include "core/framework/resource_accountant.h"
 #include "core/graph/basic_types.h"
 #include "core/graph/onnx_protobuf.h"
@@ -70,9 +72,56 @@ struct IndexedSubGraph {
    return meta_def_.get();
  }
+
+  // Check if the accounting is enabled for the current EP
+  bool IsAccountingEnabled() const {
+    return resource_accountant != nullptr &&
+           nodes_costs.size() == nodes.size();
+  }
+
+  // Should call IsAccountingEnabled() first.
+  // Takes the ResourceCount previously computed for the node
+  // (usually during GetCapability()) and, if present, adds it to the consumed amount.
+  void AccountForNode(size_t cost_index) const {
+    assert(cost_index < nodes_costs.size());
+    if (nodes_costs[cost_index].has_value()) {
+      resource_accountant->AddConsumedAmount(*nodes_costs[cost_index]);
+    }
+  }
+
+  // Computes and accounts for the resource cost of a node that has just been
+  // fused from other nodes, for which the EP did not have a chance to compute the cost.
+  void ComputeAndAccountForNode(const std::string& node_name) const {
+    assert(resource_accountant != nullptr);
+    resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node_name));
+  }
+
+  void SetAccountant(IResourceAccountant* res_accountant) {
+    resource_accountant = res_accountant;
+  }
+
+  // Append resource count to the list of costs for the nodes.
+  void AppendNodeCost(const ResourceCount& cost) {
+    assert(resource_accountant != nullptr);
+    nodes_costs.emplace_back(cost);
+  }
+
+  // Append an absent cost for a node that was already accounted for.
+  void AppendNodeEmptyCost() {
+    assert(resource_accountant != nullptr);
+    nodes_costs.emplace_back();
+  }
+
  private:
  // subgraph meta definition.
  std::unique_ptr<MetaDef> meta_def_;
+  // Optional resource accountant for this subgraph.
+  IResourceAccountant* resource_accountant = nullptr;
+  // Vector with resource costs for the nodes above. Should have the same size as nodes.
+  // Nodes that were previously accounted for, for example because they were already assigned
+  // to an EP during earlier calls to GetCapability(), will not have a resource count present;
+  // such entries are skipped.
+  InlinedVector<std::optional<ResourceCount>> nodes_costs;
 };
 } // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 64a4dd19c12b0..5d59380f7d643 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -261,6 +261,31 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMin
 static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
    "session.save_external_prepacked_constant_initializers";
+
+// Use this config when you want to collect memory stats for each node in the graph.
+// The output is a CSV file with the following columns:
+// node_name, input_sizes, initializers_memory, dynamic_outputs_sizes, temp_allocations_size
+// The file will be created if it does not exist, and will be overwritten if it does.
+//
+// The content of the file can be used to estimate memory requirements at run time including
+// the temporary allocations. This operation is preferably done on a CPU device, as the model may exceed
+// device memory limits in constrained environments. When enabling this option, it is important to disable
+// memory patterns, as they tend to allocate large blocks to avoid fragmentation and accommodate the needs of
+// multiple kernels. Memory patterns may make it difficult to allocate on a device with limited memory.
+//
+// The collected stats can then be used to partition the graph among the devices in a way that only the
+// required memory is allocated on each device.
+//
+// - "full path to file": there is no default for this option. If the file cannot be opened for writing, an error will be returned.
+static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "session.collect_node_memory_stats_to_file";
+
+/// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats"
+/// "limit > 0": enables Capacity Aware Partitioning for the CUDA EP. The EP will place nodes on the device
+///   until the accumulated memory consumption of the assigned nodes reaches the limit.
+/// "file name": this file is expected to be found in the same folder as the model. The file contains
+///   pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enabled (see above).
+static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings =
+    "session.resource_cuda_partitioning_settings";
+
 // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 894e0daae94b6..bc13c30294875 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -23,6 +23,8 @@
 #include "core/framework/bfc_arena.h"
+
+#include "core/session/onnxruntime_session_options_config_keys.h"
+
 using namespace onnxruntime::common;
 namespace onnxruntime {
@@ -614,6 +616,12 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
 #endif
  }
+
+#if !defined(ORT_MINIMAL_BUILD)
+  if (session_state_.GetNodeStatsRecorder() != nullptr) {
+    ort_value_to_dynamic_allocations_size_.insert_or_assign(ort_value_index, size);
+  }
+#endif
+
  return Status::OK();
 }
diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h
index de571f86f1c77..7b5a8fd8a4b01 100644
--- a/onnxruntime/core/framework/execution_frame.h
+++ b/onnxruntime/core/framework/execution_frame.h
@@ -92,10 +92,10 @@ class IExecutionFrame {
  Status ReleaseMLValue(int ort_value_idx);
- protected:
  // get the ort_value_idx from NodeIndexInfo
  int GetNodeIdxToMLValueIdx(int index) const;
+ protected:
  OrtValue& GetMutableMLValue(int ort_value_index) { return const_cast(GetMLValue(ort_value_index)); }
  virtual Status ReleaseMLValueImpl(int ort_value_idx);
@@ -103,6 +103,8 @@ class IExecutionFrame {
  // returns true if the ort_value_idx is an output from the graph
  bool IsOutput(int ort_value_idx) const;
+  const OrtValueNameIdxMap& GetOrtValueNameIdxMap() const noexcept { return ort_value_idx_map_; }
+
  private:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(IExecutionFrame);
@@ -166,6 +168,16 @@ class ExecutionFrame final : public IExecutionFrame {
    return planner_.has_value();
  }
+#if !defined(ORT_MINIMAL_BUILD)
+  std::optional<size_t>
GetOrtValueDynamicAllocation(int ort_value_index) const { + auto it = ort_value_to_dynamic_allocations_size_.find(ort_value_index); + if (it != ort_value_to_dynamic_allocations_size_.end()) { + return it->second; + } + return std::nullopt; + } +#endif + // This function try retrieve the inferred shapes for the given NodeArg index. // If the retrival is successful, this function returns true and false otherwise. bool TryGetInferredShape(int index, TensorShape& shape) const override; @@ -258,10 +270,14 @@ class ExecutionFrame final : public IExecutionFrame { // This field is not physical memory size. // dynamic_activation_memory_sizes_in_byte_[location] is the dynamic memory consumption on "location". std::unordered_map dynamic_activation_memory_sizes_in_byte_; +#endif +#if !defined(ORT_MINIMAL_BUILD) + // OrtValue index to the size of dynamic memory allocation. + std::unordered_map ort_value_to_dynamic_allocations_size_; +#endif // Mutex which should be acquired when executing non-thread-safe member functions. // A current example is the tracker of dynamic memory allocation. mutable std::mutex mtx_; -#endif }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index b39924d4c3ff9..3a937a119d03b 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -13,7 +13,8 @@ namespace onnxruntime { std::vector> IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant*) const { std::vector> result; for (const auto& node : graph.Nodes()) { if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node); diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index b97cf03e3bf59..8a01e3973cdc6 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -5,13 +5,17 @@ #include #include +#include +#include "core/common/inlined_containers.h" +#include "core/common/string_utils.h" #include "core/framework/compute_capability.h" #include "core/framework/execution_providers.h" #include "core/framework/func_kernel.h" #include "core/framework/kernel_lookup.h" #include "core/framework/kernel_registry_manager.h" #include "core/framework/kernel_registry.h" +#include "core/framework/resource_accountant.h" #include "core/graph/function.h" #include "core/graph/function_utils.h" #include "core/graph/graph_viewer.h" @@ -49,6 +53,9 @@ namespace onnxruntime { namespace { +// A map of Ep Type to a resource accountant for this EP +using ResourceAccountantMap = InlinedHashMap>; + // contains some common parameters used by the partitioning helper functions struct PartitionParams { std::reference_wrapper graph; @@ -60,6 +67,72 @@ struct PartitionParams { std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) }; + +// Use this accountant if your resource can be counted with size_t type +class SizeTAccountant : public IResourceAccountant { + public: + SizeTAccountant() = default; + ~SizeTAccountant() = default; + + explicit SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} + + ResourceCount GetConsumedAmount() const noexcept override { + return consumed_amount_; + } + void 
AddConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ += std::get(amount); + } + } + void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ -= std::get<0>(amount); + } + } + + ResourceCount ComputeResourceCount(const std::string& node_name) const override { + auto hit = node_stats_.find(node_name); + if (hit != node_stats_.end()) { + const auto& stats = hit->second; + return stats.input_sizes + stats.initializers_sizes + + stats.total_dynamic_sizes + stats.total_temp_allocations; + } + return static_cast(0U); + } + + private: + size_t consumed_amount_ = 0; + InlinedHashMap node_stats_; +}; + +InlinedHashMap LoadNodeAllocationStats(const std::filesystem::path& model_path, + const std::filesystem::path& file_name) { + InlinedHashMap node_stats; + std::filesystem::path file_path = model_path; + if (file_path.has_filename()) { + file_path = file_path.parent_path(); + } + + file_path /= file_name; + + std::ifstream file(file_path); + ORT_ENFORCE(file.is_open(), "Failed to open file ", file_path); + std::string line; + // Read and load a CSV file line by line + while (std::getline(file, line)) { + auto splits = utils::SplitString(line, ",", false); + ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); + std::string node_name{splits[0]}; + size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); + size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); + size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); + size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); + node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, + total_dynamic_sizes, total_temp_allocations}); + } + return node_stats; +} } // namespace #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -92,11 +165,14 @@ static bool TryAssignNodes(Graph& graph, const IndexedSubGraph& capability, } } - for (auto node_index : capability.nodes) { - auto* node = graph.GetNode(node_index); + const bool acc_enabled = capability.IsAccountingEnabled(); + for (size_t i = 0, limit = capability.nodes.size(); i < limit; ++i) { + auto* node = graph.GetNode(capability.nodes[i]); node->SetExecutionProviderType(provider_type); + if (acc_enabled) { + capability.AccountForNode(i); + } } - return true; } @@ -113,6 +189,9 @@ static bool TryAssignSingleNode(Graph& graph, if (nullptr != node && node->GetExecutionProviderType().empty()) { // The node was not fused or assigned. Assign it to . node->SetExecutionProviderType(provider_type); + if (indexed_sub_graph.IsAccountingEnabled()) { + indexed_sub_graph.AccountForNode(0); + } return true; } @@ -131,12 +210,14 @@ struct GetCapabilityForEPParams { std::reference_wrapper transform_layout; std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + IResourceAccountant* resource_accountant; }; auto get_capabilities = [](const IExecutionProvider& ep, const GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) { - auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup); + const IExecutionProvider::IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) { + auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup, resource_accountant); // In theory an EP could return an empty capability. Remove those. 
capabilities.erase(std::remove_if(capabilities.begin(), capabilities.end(), @@ -173,7 +254,7 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l { const GraphViewer graph_viewer(graph); - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant); if (capabilities.empty()) { return Status::OK(); @@ -211,7 +292,7 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l capabilities.clear(); const GraphViewer graph_viewer(graph); - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant); // all nodes with an index >= first_new_node with domain of kMSInternalNHWCDomain should be in the capabilities InlinedHashSet new_nodes_in_capabilities; @@ -260,7 +341,7 @@ static Status GetCapabilityForEPForAotInlining(const GraphViewer& graph_viewer, logger}; // TODO: Provide EP with a capability to look inside the functions. - capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, nullptr); return Status::OK(); } @@ -318,6 +399,7 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, } if (sub_graph_available_for_assignment) { + const bool acc_enabled = capability.IsAccountingEnabled(); if (mode == GraphPartitioner::Mode::kNormal) { std::ostringstream oss; oss << provider_type << "_" << capability.GetMetaDef()->name << "_" << fused_node_unique_id++; @@ -333,6 +415,13 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, } fused_node->SetExecutionProviderType(provider_type); + if (acc_enabled) { + // We account for the fused node. We operate under assumption + // that the fused node would use no more memory when the nodes we are fusing. + // and potentially less than that, and therefore, no threshold check is needed here. + // All threshold checks are done within the EP. + capability.ComputeAndAccountForNode(fused_node->Name()); + } result = fused_node; } else { @@ -340,10 +429,13 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, // This is used when exporting an ORT format model to maintain the original nodes and re-do the fusion // at runtime. The original nodes provide a fallback if fewer nodes can be fused at runtime due to device // capabilities. - for (auto node_index : capability.nodes) { - auto* node = graph.GetNode(node_index); + for (size_t i = 0, limit = capability.nodes.size(); i < limit; ++i) { + auto* node = graph.GetNode(capability.nodes[i]); if (node != nullptr) { node->SetExecutionProviderType(provider_type); + if (acc_enabled) { + capability.AccountForNode(i); + } } } } @@ -363,7 +455,7 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, int& fused_node_unique_id, const layout_transformation::TransformLayoutFunction& transform_layout_fn, const layout_transformation::DebugGraphFn& debug_graph_fn, - const logging::Logger& logger) { + const logging::Logger& logger, IResourceAccountant* resource_accountant) { // handle testing edge case where optimizers or constant lifting results in graph with no nodes. 
// doing it here saves all providers checking for this in GetCapability if (graph.NumberOfNodes() == 0) { @@ -377,7 +469,7 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, // we pass through the FuncManager from the top level graph ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(*subgraph, func_mgr, kernel_registry_mgr, fused_kernel_registry, current_ep, mode, fused_node_unique_id, - transform_layout_fn, debug_graph_fn, logger)); + transform_layout_fn, debug_graph_fn, logger, resource_accountant)); } } @@ -400,7 +492,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr, std::ref(capabilities), mode, std::cref(transform_layout_fn), - std::cref(debug_graph_fn)}; + std::cref(debug_graph_fn), + resource_accountant}; ORT_RETURN_IF_ERROR(GetCapabilityForEP(get_capability_params, logger)); if (capabilities.empty()) { @@ -735,7 +828,7 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager, - const logging::Logger& logger) { + const ResourceAccountantMap& acc_map, const logging::Logger& logger) { bool modified_graph = false; auto& graph = partition_params.graph.get(); @@ -747,11 +840,16 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, do { // process full graph with each EP for (const auto& ep : execution_providers) { + IResourceAccountant* resource_accountant = nullptr; + auto hit = acc_map.find(ep->Type()); + if (hit != acc_map.end()) { + resource_accountant = hit->second.get(); + } ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(graph, func_mgr, kernel_registry_manager, fused_kernel_registry, *ep, mode, fused_node_unique_id, transform_layout_function, partition_params.debug_graph_fn, - logger)); + logger, resource_accountant)); } // expand any nodes that have an ONNX function definition but no matching ORT kernel. @@ -786,8 +884,8 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param auto& subgraph = *entry.second; PartitionParams subgraph_partition_params = partition_params; subgraph_partition_params.graph = std::ref(subgraph); - ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr, current_ep, - logger)); + ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr, + current_ep, logger)); } } @@ -803,6 +901,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param std::cref(partition_params.transform_layout_function), std::cref(partition_params.debug_graph_fn), #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + nullptr }; // clang-format on @@ -835,6 +934,9 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param Node& fused_node = graph.BeginFuseSubGraph(indexed_sub_graph, node_name); fused_node.SetExecutionProviderType(type); + if (indexed_sub_graph.IsAccountingEnabled()) { + indexed_sub_graph.ComputeAndAccountForNode(fused_node.Name()); + } // create filtered graph viewer for this set of nodes // @@ -851,6 +953,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // We will compile the fused nodes one by one, and fuse the subgraph if successful. 
for (const auto& compilation_entry : compilation_entries) { + const bool acc_enabled = compilation_entry.capability.get().sub_graph->IsAccountingEnabled(); Node& node = compilation_entry.fused_node; std::vector single_node_compute_func; ORT_RETURN_IF_ERROR(current_ep.Compile({IExecutionProvider::FusedNodeAndGraph{node, *compilation_entry.viewer}}, @@ -878,6 +981,9 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, node); + if (acc_enabled) { + compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node.Name()); + } } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -988,9 +1094,26 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + // We use this only if Resource Aware Partitioning is enabled for any of the EPs + ResourceAccountantMap ep_acc_map; + // Zero, it is disabled by default + const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( + kOrtSessionOptionsResourceCudaPartitioningSettings, ""); + if (!resource_partitioning_settings.empty()) { + auto splits = utils::SplitString(resource_partitioning_settings, ",", false); + if (splits.size() == 4) { + SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); + cuda_memory_limit *= 1024; // to bytes + auto node_to_stats = LoadNodeAllocationStats(graph.ModelPath(), splits[1]); + ep_acc_map[kCudaExecutionProvider] = std::make_unique(cuda_memory_limit, + std::move(node_to_stats)); + } + } + if (mode == Mode::kNormal || mode == Mode::kAssignOnly) { #if !defined(ORT_MINIMAL_BUILD) - ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger)); + ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, + ep_acc_map, logger)); bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc index 94b6224440ed0..1d05cb4e5e818 100644 --- a/onnxruntime/core/framework/op_kernel.cc +++ b/onnxruntime/core/framework/op_kernel.cc @@ -130,6 +130,16 @@ OrtValue* OpKernelContext::GetOrCreateOutputMLValue(int index) { return value; } +int OpKernelContext::GetOrtValueIndexForInput(int input_index) const { + int input_arg_index = GetInputArgIndex(input_index); + return execution_frame_->GetNodeIdxToMLValueIdx(input_arg_index); +} + +int OpKernelContext::GetOrtValueIndexForOutput(int output_index) const { + int output_arg_index = GetOutputArgIndex(output_index); + return execution_frame_->GetNodeIdxToMLValueIdx(output_arg_index); +} + int OpKernelContext::GetInputArgIndex(int index) const { return node_input_start_index_ + index; } diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index 64bd70465a1c7..c970243ba461e 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -36,6 +36,15 @@ class OpKernelContextInternal : public OpKernelContext { implicit_inputs[i]->Name(), " does not."); 
implicit_input_values_.push_back(entry); } + +#if !defined(ORT_MINIMAL_BUILD) + if (session_state_.GetNodeStatsRecorder() != nullptr) { + auto alloc = OpKernelContext::GetAllocator(kernel.GetDevice(OrtMemTypeDefault)); + if (alloc != nullptr) { + accounting_allocator_ = std::make_shared(std::move(alloc)); + } + } +#endif } bool GetUseDeterministicCompute() const override { @@ -69,9 +78,67 @@ class OpKernelContextInternal : public OpKernelContext { return implicit_input_values_; } + int GetOrtValueIndexForInput(int input_index) const override { + return OpKernelContext::GetOrtValueIndexForInput(input_index); + } + + int GetOrtValueIndexForOutput(int output_index) const override { + return OpKernelContext::GetOrtValueIndexForOutput(output_index); + } + +#if !defined(ORT_MINIMAL_BUILD) + Status GetTempSpaceAllocator(AllocatorPtr* output) const override { + if (accounting_allocator_) { + *output = accounting_allocator_; + return Status::OK(); + } + return OpKernelContext::GetTempSpaceAllocator(output); + } +#endif + +#if !defined(ORT_MINIMAL_BUILD) + bool GetAllocatorStats(AllocatorStats& stats) { + if (accounting_allocator_ == nullptr) { + return false; + } + accounting_allocator_->GetStats(&stats); + return true; + } +#endif + const bool& GetTerminateFlag() const noexcept { return terminate_flag_; } private: +#if !defined(ORT_MINIMAL_BUILD) + class AccountingAllocator : public IAllocator { + public: + AccountingAllocator(AllocatorPtr alloc) : IAllocator(alloc->Info()), allocator_(std::move(alloc)) { + } + + void* Alloc(size_t size) override { + void* p = allocator_->Alloc(size); + if (p != nullptr) { + stats_.total_allocated_bytes += size; + } + return p; + } + + void Free(void* p) override { + allocator_->Free(p); + } + + void GetStats(AllocatorStats* stats) override { + *stats = stats_; + } + + private: + AllocatorPtr allocator_; + AllocatorStats stats_; + }; + + AllocatorPtr accounting_allocator_; +#endif + const SessionState& session_state_; const bool& terminate_flag_; std::vector implicit_input_values_; diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc new file mode 100644 index 0000000000000..5c2d4feaaf126 --- /dev/null +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/framework/resource_accountant.h" +#include "core/common/inlined_containers.h" + +#include + +namespace onnxruntime { + +struct NodeStatsRecorder::Impl { + std::filesystem::path node_stats_path_; + // This is a node name to allocation stats map + InlinedHashMap node_stats_; + mutable std::mutex mut_; +}; + +NodeStatsRecorder::NodeStatsRecorder(const std::filesystem::path& node_stats_path) + : impl_(std::make_unique()) { + impl_->node_stats_path_ = node_stats_path; +} + +NodeStatsRecorder::~NodeStatsRecorder() = default; + +const std::filesystem::path& NodeStatsRecorder::GetNodeStatsFileName() const noexcept { + return impl_->node_stats_path_; +} + +void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats) { + std::lock_guard lock(impl_->mut_); + auto result = impl_->node_stats_.emplace(node_name, stats); + if (!result.second) { + // Node already exists, update the stats + result.first->second.UpdateIfGreater(stats); + } +} + +void NodeStatsRecorder::DumpStats(std::ostream& os) const { + std::lock_guard lock(impl_->mut_); + for (const auto& [name, stats] : impl_->node_stats_) { + os << name << "," << stats.input_sizes << "," << stats.initializers_sizes << "," + << stats.total_dynamic_sizes << "," + << stats.total_temp_allocations << "\n"; + } +} + +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 61fd9b08655b7..8a7564c7d4236 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -11,6 +11,7 @@ #include "core/common/logging/logging.h" #include "core/framework/allocation_planner.h" #include "core/framework/execution_frame.h" +#include "core/framework/resource_accountant.h" #include "core/framework/stream_execution_context.h" #include "core/framework/session_state.h" #include "core/framework/op_kernel_context_internal.h" @@ -104,7 +105,7 @@ static void CalculateTotalInputSizes(const OpKernelContextInternal* op_kernel_co const int input_count = op_kernel_context->InputCount(); for (auto i = 0; i < input_count; i++) { const OrtValue* p_input = op_kernel_context->GetInputMLValue(i); - if (p_input != nullptr && p_input->IsTensor() && p_input->IsAllocated()) { + if (p_input != nullptr && p_input->IsAllocated() && p_input->IsTensor()) { const OpKernelInfo& op_kernel_info = p_op_kernel->Info(); const Tensor* p_tensor = nullptr; bool is_param = op_kernel_info.TryGetConstantInput(i, &p_tensor); @@ -256,6 +257,8 @@ class SessionScope { TimePoint session_start_; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) const ExecutionFrame& frame_; +#endif +#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Whether memory profiler need create events and flush to file. // For partial graph run, when the last subgraph of the whole graph is executing, we need flush to file. bool flush_memory_info_ = true; @@ -487,6 +490,61 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, } #else status = p_kernel->Compute(&kernel_ctx); + +#if !defined(ORT_MINIMAL_BUILD) + auto* node_stats_recorder = ctx.GetSessionState().GetNodeStatsRecorder(); + if (node_stats_recorder != nullptr) { + // Lets first check if any inputs are initializers, + // if so we need to account for their memory usage. 
+ const auto& const_initializers = ctx.GetSessionState().GetConstantInitializedTensors(); + SafeInt initializers_size = 0; + SafeInt input_sizes = 0; + for (int i = 0, lim = kernel_ctx.InputCount(); i < lim; ++i) { + // Need to get ort_value_index for each input. + int ort_vaue_index = kernel_ctx.GetOrtValueIndexForInput(i); + auto hit = const_initializers.find(ort_vaue_index); + if (hit != const_initializers.end()) { + const auto& ort_value = hit->second; + initializers_size += ort_value.Get().SizeInBytes(); + } else { + // If the input is not an initializer, we account it as something that had to be + // on the same device with this kernel + const OrtValue* ort_value = kernel_ctx.GetInputMLValue(i); + if (ort_value != nullptr && ort_value->IsAllocated() && ort_value->IsTensor()) { + input_sizes += ort_value->Get().SizeInBytes(); + } + } + } + + // XXX: Should we account for implicit inputs? + + // Get outputs and see if any were allocated dynamically + SafeInt total_dynamic_sizes = 0; + const auto& exec_frame = ctx.GetExecutionFrame(); + for (int i = 0, lim = kernel_ctx.OutputCount(); i < lim; ++i) { + int ort_vaue_index = kernel_ctx.GetOrtValueIndexForOutput(i); + auto maybe_val = exec_frame.GetOrtValueDynamicAllocation(ort_vaue_index); + if (maybe_val.has_value()) { + total_dynamic_sizes += *maybe_val; + } + } + + NodeAllocationStats node_stats; + node_stats.input_sizes = static_cast(input_sizes); + node_stats.initializers_sizes = static_cast(initializers_size); + node_stats.total_dynamic_sizes = total_dynamic_sizes; + + // Get the temporary allocations + AllocatorStats temp_stats; + if (kernel_ctx.GetAllocatorStats(temp_stats)) { + node_stats.total_temp_allocations = narrow(temp_stats.total_allocated_bytes); + } + + // Record node allocation stats + const auto& node = p_kernel->Node(); + node_stats_recorder->ReportNodeStats(node.Name(), node_stats); + } +#endif #endif } ORT_CATCH(const std::exception& ex) { @@ -510,6 +568,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, LOGS(logger, ERROR) << msg_string; return Status(status.Category(), status.Code(), msg_string); } + ctx.RecycleNodeInputs(idx); VLOGS(logger, 0) << "stream " << stream_idx << " launch kernel with idx " << idx; return Status::OK(); diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index 82f520f4a4252..964c059e529f9 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -375,6 +375,24 @@ class SessionState { /// true of false bool GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format); +#if !defined(ORT_MINIMAL_BUILD) + + void SetNodeStatsRecorder(NodeStatsRecorder* node_stats_recorder) { + node_stats_recorder_ = node_stats_recorder; + } + + /** + * Returns a pointer to the NodeStatsRecorder object if it was enabled for the session. + * The object pointer is only present at the root SessionState object + */ + NodeStatsRecorder* GetNodeStatsRecorder() const { + if (parent_ != nullptr) { + return parent_->GetNodeStatsRecorder(); + } + return node_stats_recorder_; + } +#endif + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState); @@ -502,6 +520,10 @@ class SessionState { MemoryProfiler* memory_profiler_; #endif +#if !defined(ORT_MINIMAL_BUILD) + NodeStatsRecorder* node_stats_recorder_ = nullptr; +#endif + // switch for enable memory pattern optimization or not. 
bool enable_mem_pattern_; diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 097ce436f4419..17c37b8882168 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -844,18 +844,9 @@ INSTANTIATE_UNPACK_TENSOR(UInt4x2) break; template -common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out) { - const auto& dims = tensor_proto.dims(); - size_t size = 1; - for (google::protobuf::int64 dim : dims) { - if (dim < 0 || static_cast(dim) >= std::numeric_limits::max()) { - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid TensorProto"); - } - if (!IAllocator::CalcMemSizeForArray(size, static_cast(dim), &size)) { - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid TensorProto"); - } - } - switch (tensor_proto.data_type()) { +common::Status GetSizeInBytesFromTensorShapeAndType(const TensorShape& shape, int32_t element_type, size_t* out) { + const auto size = narrow(shape.Size()); + switch (element_type) { CASE_PROTO_TRACE(FLOAT, float); CASE_PROTO_TRACE(DOUBLE, double); CASE_PROTO_TRACE(BOOL, bool); @@ -884,24 +875,61 @@ common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& return Status::OK(); } +template +common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out) { + TensorShape tensor_shape = GetTensorShapeFromTensorProto(tensor_proto); + + bool any_out_of_bounds = std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), + [](int64_t dim) { + if (dim < 0 || + static_cast(dim) >= std::numeric_limits::max()) { + return true; + } + return false; + }); + + ORT_RETURN_IF(any_out_of_bounds, "Out of bounds dimensions in TypeProto_Tensor"); + + return GetSizeInBytesFromTensorShapeAndType(tensor_shape, tensor_proto.data_type(), out); +} + +template +common::Status GetSizeInBytesFromTensorTypeProto(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_proto, size_t* out) { + ORT_RETURN_IF_NOT(HasShape(tensor_proto), "TypeProto_Tensor does not have shape"); + ORT_RETURN_IF_NOT(HasElemType(tensor_proto), "TypeProto_Tensor does not have element type"); + + TensorShape tensor_shape = GetTensorShapeFromTensorShapeProto(tensor_proto.shape()); + + bool any_out_of_bounds = std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), + [](int64_t dim) { + return dim < 0 || + static_cast(dim) >= std::numeric_limits::max(); + }); + ORT_RETURN_IF(any_out_of_bounds, "Out of bounds dimensions in TypeProto_Tensor"); + + return GetSizeInBytesFromTensorShapeAndType(tensor_shape, tensor_proto.elem_type(), out); +} + +template Status GetSizeInBytesFromTensorTypeProto<0>(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_proto, size_t* out); + TensorShape GetTensorShapeFromTensorShapeProto(const ONNX_NAMESPACE::TensorShapeProto& tensor_shape_proto) { const auto& dims = tensor_shape_proto.dim(); - std::vector tensor_shape_vec(static_cast(dims.size())); + TensorShapeVector tensor_shape_vec(static_cast(dims.size())); for (int i = 0; i < dims.size(); ++i) { tensor_shape_vec[i] = HasDimValue(dims[i]) ? 
dims[i].dim_value() : -1; /* symbolic dimensions are represented as -1 in onnxruntime*/ } - return TensorShape(std::move(tensor_shape_vec)); + return TensorShape(tensor_shape_vec); } TensorShape GetTensorShapeFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto) { const auto& dims = tensor_proto.dims(); - std::vector tensor_shape_vec(static_cast(dims.size())); + TensorShapeVector tensor_shape_vec(static_cast(dims.size())); for (int i = 0; i < dims.size(); ++i) { tensor_shape_vec[i] = dims[i]; } - return TensorShape(std::move(tensor_shape_vec)); + return TensorShape(tensor_shape_vec); } struct UnInitializeParam { diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 7b9a47842388c..f5dec7ae988f2 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -157,6 +157,9 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto template common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out); +template +Status GetSizeInBytesFromTensorTypeProto(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_proto, size_t* out); + /** Special marker used to indicate an existing memory buffer contains the TensorProto external data. If the 'location' field of the external data info is set to this marker, the 'offset' field should contain the diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 7ee794ccbd2e8..6949eec7f6347 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -5522,6 +5522,42 @@ Graph::Graph(const Model& owning_model, is_loaded_from_model_file_(true) { // true as the Graph isn't manually constructed from scratch } +size_t Graph::ComputeNodeMemoryUsage(NodeIndex node_idx) const { + /// XXX: In some cases some kernels can copy its attributes to a device + // those are edge cases which we currently do not account for. 
+ const Node* node = GetNode(node_idx); + if (node != nullptr) { + SafeInt result = 0; + for (const auto* input : node->InputDefs()) { + if (input->Exists()) { + // Let's see if this is an initializer + constexpr const bool check_outer_scope_true = true; + const ONNX_NAMESPACE::TensorProto* initializer = + GetConstantInitializer(input->Name(), check_outer_scope_true); + if (initializer != nullptr) { + size_t out; + if (utils::GetSizeInBytesFromTensorProto<0>(*initializer, &out).IsOK()) { + result += out; + } + } else { + const auto* proto = input->TypeAsProto(); + if (proto != nullptr && utils::HasTensorType(*proto)) { + const auto& tensor_type = proto->tensor_type(); + if (utils::HasElemType(tensor_type) && utils::HasShape(tensor_type)) { + size_t size; + if (utils::GetSizeInBytesFromTensorTypeProto<0>(tensor_type, &size).IsOK()) { + result += size; + } + } + } + } + } + } + return static_cast(result); + } + return 0; +} + common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, const OrtFormatLoadOptions& load_options) { // We deserialize the graph from ORT format in the following order: diff --git a/onnxruntime/core/providers/acl/acl_execution_provider.cc b/onnxruntime/core/providers/acl/acl_execution_provider.cc index 8d34e36fe7cd6..ede476ff74d1b 100644 --- a/onnxruntime/core/providers/acl/acl_execution_provider.cc +++ b/onnxruntime/core/providers/acl/acl_execution_provider.cc @@ -152,7 +152,8 @@ std::shared_ptr ACLExecutionProvider::GetKernelRegistry() const std::vector> ACLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant*) const { std::vector> result; for (const auto& node : graph.Nodes()) { if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node); diff --git a/onnxruntime/core/providers/acl/acl_execution_provider.h b/onnxruntime/core/providers/acl/acl_execution_provider.h index 1c267d8713673..d635e56add30b 100755 --- a/onnxruntime/core/providers/acl/acl_execution_provider.h +++ b/onnxruntime/core/providers/acl/acl_execution_provider.h @@ -38,7 +38,8 @@ class ACLExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const override; Status OnRunStart(const onnxruntime::RunOptions&) override; diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc index f954baf3eabae..07e83933a890c 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -1253,7 +1253,8 @@ GetSubGraphPartition(const std::vector& topological_order, const std: std::vector> CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant*) const { std::vector> result; // TODO(FFFrog): Feature Enhancement diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h index 7debfa72778fd..5ff935463a1c1 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.h +++ b/onnxruntime/core/providers/cann/cann_execution_provider.h @@ -55,7 +55,8 @@ class CANNExecutionProvider : public IExecutionProvider { 
std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const override; Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index b6bb4f2c1d66a..3fa3868267c9b 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -38,7 +38,8 @@ CoreMLExecutionProvider::~CoreMLExecutionProvider() {} std::vector> CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; const auto& logger = *GetLogger(); diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.h b/onnxruntime/core/providers/coreml/coreml_execution_provider.h index 650d81a4fecf7..0609bf6af726d 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.h +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.h @@ -19,7 +19,8 @@ class CoreMLExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* resource_accountant) const override; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) common::Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d4013a7dc3d57..2fb1bc35630fa 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -5,6 +5,7 @@ #include "core/common/inlined_containers.h" #include "core/common/parse_string.h" #include "core/framework/int4.h" +#include "core/framework/resource_accountant.h" #include "core/providers/shared_library/provider_api.h" #include "core/platform/env_var_utils.h" #include "core/providers/cuda/cuda_execution_provider.h" @@ -2626,11 +2627,43 @@ std::unique_ptr CUDAExecutionProvider::GetDataTransf std::vector> CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const { + std::vector> result; + const logging::Logger& logger = *GetLogger(); + + // Figure out the memory limit if accountant is available + size_t memory_threshold = std::numeric_limits::max(); + SafeInt consumed_memory = 0; + if (resource_accountant != nullptr) { + if (resource_accountant->IsStopIssued()) { + LOGS(logger, WARNING) << "CUDA_EP returning due to Stop Set"; + return result; + } + + auto threshold = resource_accountant->GetThreshold(); + if (!threshold.has_value()) { + // info_.gpu_mem_limit is for BFC arena + size_t free_memory, total_memory; + if (0 != cudaMemGetInfo(&free_memory, &total_memory)) { + memory_threshold = info_.gpu_mem_limit; + } else { + memory_threshold = std::min(free_memory, info_.gpu_mem_limit); + } + } else { + memory_threshold = std::get<0>(threshold.value()); + } + + consumed_memory = 
std::get<0>(resource_accountant->GetConsumedAmount()); + } + + InlinedHashSet previously_assigned_nodes; + // On repeated calls to this function, we may have most of the nodes already + // assigned to a CUDA EP capability. We'll skip accounting for these nodes. + previously_assigned_nodes.reserve(graph.NumberOfNodes()); InlinedVector candidates; // A subset of the above vector. A subset of the tentative_nodes might be moved to CPU. InlinedVector tentative_nodes; - const logging::Logger& logger = *GetLogger(); for (auto& node_index : graph.GetNodesInTopologicalOrder()) { const auto* p_node = graph.GetNode(node_index); if (p_node == nullptr) @@ -2640,6 +2673,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, if (!node.GetExecutionProviderType().empty()) { if (node.GetExecutionProviderType() == kCudaExecutionProvider) { candidates.push_back(node.Index()); + previously_assigned_nodes.insert(node.Index()); } continue; } @@ -2694,14 +2728,40 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, // These are usually shape related computation subgraphs // Following logic can be extended for other EPs auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, logger); - std::vector> result; for (auto& node_index : candidates) { if (cpu_nodes.count(node_index) > 0) continue; - auto sub_graph = IndexedSubGraph::Create(); - sub_graph->Nodes().push_back(node_index); - result.push_back(ComputeCapability::Create(std::move(sub_graph))); + // Previously assigned nodes have been accounted before + if (previously_assigned_nodes.count(node_index) > 0 || resource_accountant == nullptr) { + auto sub_graph = IndexedSubGraph::Create(); + sub_graph->Nodes().push_back(node_index); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } else { + auto* node = graph.GetNode(node_index); + auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(node->Name())); + const auto would_be_consumed = resource_count + consumed_memory; + LOGS(logger, INFO) << "CUDA_EP Node: " << node_index << " Memory usage : " << resource_count + << " would be consumed " << static_cast(would_be_consumed) + << " threshold: " << memory_threshold; + if (would_be_consumed < memory_threshold) { + consumed_memory = would_be_consumed; + auto sub_graph = IndexedSubGraph::Create(); + sub_graph->SetAccountant(resource_accountant); + sub_graph->Nodes().push_back(node_index); + sub_graph->AppendNodeCost(resource_count); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } else { + // We break here so we do not have patches of CUDA assigned nodes. 
+ auto* node = graph.GetNode(node_index); + if (node != nullptr) { + LOGS(logger, WARNING) << "CUDA_EP Halting assignment due to capacity threshold at node: " + << node->Name() << " index: " << node_index; + } + resource_accountant->SetStopAssignment(); + break; + } + } } /* std::vector> result; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index bd2be2eac2181..79a48e7cb89e1 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -72,7 +72,8 @@ class CUDAExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const override; int GetDeviceId() const override { return info_.device_id; } const cudaDeviceProp& GetDeviceProp() const { return device_prop_; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 826f48b5f7a68..dd868ddd8307a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -92,12 +92,13 @@ namespace Dml std::vector> ExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup) const + const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + onnxruntime::IResourceAccountant* resource_accountant) const { #ifdef ENABLE_GRAPH_COMPILATION - return m_impl->GetCapability(graph, kernel_lookup, *GetLogger()); + return m_impl->GetCapability(graph, kernel_lookup, resource_accountant, *GetLogger()); #else - return onnxruntime::IExecutionProvider::GetCapability(graph, kernel_lookup); + return onnxruntime::IExecutionProvider::GetCapability(graph, kernel_lookup, resource_accountant); #endif } @@ -877,8 +878,7 @@ namespace Dml ExecutionProviderImpl::GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger) const - { + const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant*) const { uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask(); // Each bit corresponds to each DML_TENSOR_DATA_TYPE. 
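Editor's note on the CUDA changes above: they implement a greedy, threshold-bounded assignment. Nodes already placed on the CUDA EP in an earlier GetCapability call are accepted as-is; new nodes are accepted only while the running total of their estimated memory cost stays below the threshold (the accountant's configured limit, or min(free GPU memory, gpu_mem_limit) when none is set), and the first node that would overflow halts further assignment so the EP does not end up with scattered patches of CUDA nodes. A minimal, self-contained sketch of that loop follows; it is illustrative only, with simplified types, and it elides SafeInt, ComputeCapability creation, logging, and the SetStopAssignment() call made on the accountant.

#include <cstddef>
#include <utility>
#include <vector>

using NodeIndex = std::size_t;

// Accepts nodes in order, accumulating their estimated cost, and stops at the
// first node that would push consumption past the threshold, so the accepted
// nodes form one contiguous prefix rather than isolated islands.
std::vector<NodeIndex> AssignUntilThreshold(
    const std::vector<std::pair<NodeIndex, std::size_t>>& node_costs,
    std::size_t already_consumed,
    std::size_t memory_threshold) {
  std::vector<NodeIndex> assigned;
  std::size_t consumed = already_consumed;
  for (const auto& [index, cost] : node_costs) {
    if (consumed + cost >= memory_threshold) {
      break;  // the real code also flags the accountant to stop assignment here
    }
    consumed += cost;
    assigned.push_back(index);
  }
  return assigned;
}

For example, with per-node costs of {5, 7, 9} MB, nothing consumed yet, and a 15 MB threshold, only the first two nodes are assigned.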
std::vector> result; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index e7d859c5764de..3002177db13f4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -11,6 +11,10 @@ #include #include +namespace onnxruntime { +class IResourceAccountant; +} + namespace WRL { template using Base = Microsoft::WRL::RuntimeClass< @@ -89,8 +93,7 @@ namespace Dml GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger - ) const; + const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant* resource_accountant) const; uint32_t GetSupportedDeviceDataTypeMask() const; @@ -283,7 +286,8 @@ namespace Dml std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup) const final override; + const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + onnxruntime::IResourceAccountant* resource_accountant) const final override; onnxruntime::common::Status OnSessionInitializationEnd() override { diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc index c96f9cc1ff400..4da82b351f1d6 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc @@ -146,7 +146,8 @@ std::vector> DnnlExecutionProvider::GetSupportedNodes(con std::vector> DnnlExecutionProvider::GetCapability( const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { // follow from coreml ep's Getcapability std::vector> result; diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h index b7fcbb7765180..bde18e139f2a3 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h @@ -24,7 +24,8 @@ class DnnlExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + onnxruntime::IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index d2d1d5e6fdd03..5a753d1ccf79a 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -790,7 +790,8 @@ std::vector JsExecutionProvider::CreatePreferredAllocators() { std::vector> JsExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // `tenative_candidates` is a subset of `candidates`. 
InlinedVector tenative_candidates; diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index 966f9c6980212..4bead50fc782e 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -44,7 +44,8 @@ class JsExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; std::unique_ptr GetDataTransfer() const override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 95fbe7ab58ce2..1558d22137c05 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -992,7 +992,8 @@ GetPartitionedSubgraphs(const std::vector& topological_order, std::vector> MIGraphXExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; auto model = graph_viewer.CreateModel(*GetLogger()); auto model_proto = model->ToProto(); diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 91b6a4741b55e..d6af991f9b77e 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -68,7 +68,8 @@ class MIGraphXExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc index f92c9592742d5..27bd584e2d3c6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc @@ -80,9 +80,10 @@ NnapiExecutionProvider::~NnapiExecutionProvider() {} std::vector> NnapiExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { - const auto& logger = *GetLogger(); + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; + const logging::Logger& logger = *GetLogger(); // TODO: Task 812756: NNAPI EP, add support for subgraph (If and Loop operators) if (graph_viewer.IsSubgraph()) { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h index 460616c41991f..ebf9372eb668d 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h @@ -25,7 +25,8 
@@ class NnapiExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) common::Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 72a188108adef..0cda59ef4eb19 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -82,7 +82,8 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; std::string openvino_sdk_version = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." + diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d5c22a4e2a9e4..1d7d3db95bb1d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -183,7 +183,8 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index e9d6884b8c8ca..fd3ab8622dc76 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -670,7 +670,8 @@ static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer, std::vector> QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; if (graph_viewer.IsSubgraph()) { diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 317b34e66a6e4..c717bafa41398 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -37,7 +37,8 @@ class QNNExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc index 44b34f4b4ce6c..10fd81786f977 100644 --- 
a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc @@ -50,7 +50,8 @@ std::vector> RknpuExecutionProvider::GetSupportedNodes( std::vector> RknpuExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { // Find inputs, initializers and outputs for each supported subgraph std::vector> result; diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h index 1289c8569f8e8..ce16d63e111d9 100644 --- a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h @@ -19,7 +19,8 @@ class RknpuExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 0a427b146dcaa..9d6e9df907ce3 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -2440,7 +2440,8 @@ std::unique_ptr ROCMExecutionProvider::GetDataTransf std::vector> ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // A subset of the above vector. A subset of the tentative_nodes might be moved to CPU. 
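Editor's note: the remaining provider files in this patch change in the same mechanical way; every GetCapability override gains the IResourceAccountant* parameter and, apart from the CUDA EP, simply ignores it. A hedged sketch of such an adapted override is below. MyExecutionProvider is a made-up name rather than a provider in the tree, and the base-class constructor argument is illustrative.

#include "core/framework/compute_capability.h"
#include "core/framework/execution_provider.h"

namespace onnxruntime {

// Illustrative provider: accepts the new parameter but performs no accounting.
class MyExecutionProvider : public IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider{"MyExecutionProvider"} {}

  std::vector<std::unique_ptr<ComputeCapability>>
  GetCapability(const GraphViewer& /*graph_viewer*/,
                const IKernelLookup& /*kernel_lookup*/,
                IResourceAccountant* /*resource_accountant*/) const override {
    // Existing partitioning logic would go here; only capacity-aware EPs
    // consult the accountant.
    return {};
  }
};

}  // namespace onnxruntime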
InlinedVector tentative_nodes; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index be467869248ea..ff2bff7c98723 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -61,7 +61,8 @@ class ROCMExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const override { return info_.device_id; } const hipDeviceProp_t& GetDeviceProp() const { return device_prop_; }; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8c367d25d51..e9f8c061d9ef3 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -331,8 +331,9 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz } std::vector> IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const { - return g_host->IExecutionProvider__GetCapability(this, graph_viewer, kernel_lookup); + const IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) const { + return g_host->IExecutionProvider__GetCapability(this, graph_viewer, kernel_lookup, resource_accountant); } common::Status IExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) { diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5a179ec622f8c..49c514e121178 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -11,6 +11,7 @@ #include "core/providers/shared_library/provider_host_api.h" #include "core/common/inlined_containers_fwd.h" +#include "core/framework/resource_accountant.h" #include "core/providers/shared/common.h" #define PROVIDER_DISALLOW_ALL(TypeName) \ @@ -246,7 +247,8 @@ struct ProviderHost { // IExecutionProvider virtual std::vector> IExecutionProvider__GetCapability(const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) = 0; + const IExecutionProvider::IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) = 0; virtual common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) = 0; @@ -628,6 +630,7 @@ struct ProviderHost { virtual std::unique_ptr IndexedSubGraph__construct() = 0; virtual void IndexedSubGraph__operator_delete(IndexedSubGraph* p) = 0; + virtual const std::vector& IndexedSubGraph__Nodes(const IndexedSubGraph* p) = 0; virtual std::vector& IndexedSubGraph__Nodes(IndexedSubGraph* p) = 0; virtual void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) = 0; @@ -635,6 +638,9 @@ struct ProviderHost { virtual void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) = 0; virtual IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) = 0; + 
virtual void IndexedSubGraph__SetAccountant(IndexedSubGraph* p, IResourceAccountant*) = 0; + virtual void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& count) = 0; + virtual void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) = 0; // KernelDef virtual void KernelDef__operator_delete(KernelDef* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 76b6d8063fd66..bf75507b3c6b3 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -553,6 +553,7 @@ struct IndexedSubGraph final { static std::unique_ptr Create() { return g_host->IndexedSubGraph__construct(); } static void operator delete(void* p) { g_host->IndexedSubGraph__operator_delete(reinterpret_cast(p)); } + gsl::span Nodes() const { return g_host->IndexedSubGraph__Nodes(this); } std::vector& Nodes() { return g_host->IndexedSubGraph__Nodes(this); } void SetMetaDef(std::unique_ptr&& meta_def_) { return g_host->IndexedSubGraph__SetMetaDef(this, std::move(*reinterpret_cast*>(&meta_def_))); } @@ -560,6 +561,15 @@ struct IndexedSubGraph final { void SetSchemaSource(IndexedSubGraph_SourceOfSchema schema_source) { return g_host->IndexedSubGraph__SetSchemaSource(this, schema_source); } IndexedSubGraph_SourceOfSchema GetSchemaSource() const { return g_host->IndexedSubGraph__GetSchemaSource(this); } + void SetAccountant(IResourceAccountant* resource_accountant) { + g_host->IndexedSubGraph__SetAccountant(this, resource_accountant); + } + void AppendNodeCost(const ResourceCount& resource_count) { + g_host->IndexedSubGraph__AppendNodeCost(this, resource_count); + } + void AppendNodeEmptyCost() { + g_host->IndexedSubGraph__AppendNodeEmptyCost(this); + } IndexedSubGraph() = delete; IndexedSubGraph(const IndexedSubGraph&) = delete; diff --git a/onnxruntime/core/providers/snpe/snpe_execution_provider.cc b/onnxruntime/core/providers/snpe/snpe_execution_provider.cc index fb9ce580ea2dc..c7fc6d3a556a7 100644 --- a/onnxruntime/core/providers/snpe/snpe_execution_provider.cc +++ b/onnxruntime/core/providers/snpe/snpe_execution_provider.cc @@ -71,7 +71,8 @@ SNPEExecutionProvider::~SNPEExecutionProvider() {} std::vector> SNPEExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { std::vector candidates; for (auto& node_index : graph.GetNodesInTopologicalOrder()) { const auto* p_node = graph.GetNode(node_index); diff --git a/onnxruntime/core/providers/snpe/snpe_execution_provider.h b/onnxruntime/core/providers/snpe/snpe_execution_provider.h index c0a62eea11a25..99033649fcbbf 100644 --- a/onnxruntime/core/providers/snpe/snpe_execution_provider.h +++ b/onnxruntime/core/providers/snpe/snpe_execution_provider.h @@ -18,7 +18,8 @@ class SNPEExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; std::unordered_map GetRuntimeOptions() const { return runtime_options_; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc 
index c583598bbcc52..0ee5cef7cbaa1 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2451,7 +2451,8 @@ bool TensorrtExecutionProvider::DetectTensorRTGraphCycles(SubGraphCollection_t& std::vector> TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { // Construct subgraph capability from node list std::vector> result; // Get ModelPath diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index d3e0b0fba8891..92fdcbd3d950c 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -247,7 +247,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const { return device_id_; } diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 3a99f56bb732a..5d2204b0b1979 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -51,7 +51,7 @@ const InlinedVector VitisAIExecutionProvider::GetEpContextNodes() c return ep_context_node_ptrs; } std::vector> VitisAIExecutionProvider::GetCapability( - const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup) const { + const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, IResourceAccountant* /* resource_accountant */) const { if (graph_viewer.IsSubgraph()) { // VITIS AI EP not support sungraph. Assigned to CPU. 
return {}; diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index f0d1a289a2a73..5b031ab882839 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -28,7 +28,8 @@ class VitisAIExecutionProvider : public IExecutionProvider { ~VitisAIExecutionProvider() = default; std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const { return 0; } common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) override; diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc index 7da7cc6cb63ba..4b9f6fae86423 100644 --- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc +++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc @@ -61,8 +61,8 @@ VSINPUExecutionProvider::~VSINPUExecutionProvider() {} std::vector> VSINPUExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { - const auto& logger = *GetLogger(); + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { std::vector> result; if (graph_viewer.IsSubgraph()) { diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h index c2605eb65faee..16cfbc8a9c581 100644 --- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h +++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h @@ -39,7 +39,8 @@ class VSINPUExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const override; + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index dec7e48786bf5..7909084e7177a 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -760,7 +760,8 @@ std::vector WebGpuExecutionProvider::CreatePreferredAllocators() { std::vector> WebGpuExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // `tenative_candidates` is a subset of `candidates`. 
InlinedVector tenative_candidates; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index ad81924e06901..5df276fa2d8a0 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -42,7 +42,8 @@ class WebGpuExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; std::unique_ptr GetDataTransfer() const override; diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 00fbb26b731f8..df95b653bd863 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -55,7 +55,8 @@ WebNNExecutionProvider::~WebNNExecutionProvider() {} std::vector> WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_registries*/) const { + const IKernelLookup& /*kernel_registries*/, + IResourceAccountant* /* resource_accountant */) const { // For subgraph which is the attribute of the control flow nodes, part of its initializers are stored in its // ancestor graphs as common initializers shared for other subgraphs. We need to collect all of them used for // identifying the required initializer names and storing into 'meta_def->constant_initializers'. diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.h b/onnxruntime/core/providers/webnn/webnn_execution_provider.h index 26c5e476bcc4f..e806dc340d53e 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.h +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.h @@ -24,7 +24,8 @@ class WebNNExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_registries*/) const override; + const IKernelLookup& /*kernel_registries*/, + IResourceAccountant* /* resource_accountant */) const override; DataLayout GetPreferredLayout() const override { return preferred_layout_; } diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index ee4e7be0f1f49..641f8b0729d0a 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -257,7 +257,8 @@ static void AddComputeCapabilityForEachNodeInNodeUnit( std::vector> XnnpackExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const { const auto& logger = *GetLogger(); std::vector> capabilities; diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h index 395dc2f90070e..152bef1a1c52c 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h @@ -32,7 +32,8 @@ class XnnpackExecutionProvider : public IExecutionProvider { 
std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 223eed248800e..f3ca991c0e1e1 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1594,6 +1594,17 @@ common::Status InferenceSession::AddPrePackedWeightsContainer(PrepackedWeightsCo return Status::OK(); } +#if !defined(ORT_MINIMAL_BUILD) +Status onnxruntime::InferenceSession::CreateNodeStatsRecorder(const std::filesystem::path& node_stats_file) { + if (node_stats_recorder_.has_value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "The session already has an instance of NodeStatsRecorder"); + } + node_stats_recorder_.emplace(node_stats_file); + return Status::OK(); +} +#endif + namespace { Status PartitionOrtFormatModel(onnxruntime::Graph& graph, const ExecutionProviders& providers, @@ -1795,6 +1806,17 @@ common::Status InferenceSession::Initialize() { } } +#if !defined(ORT_MINIMAL_BUILD) + const std::string node_stats_file = session_options_.config_options.GetConfigOrDefault( + kOrtSessionOptionsCollectNodeMemoryStatsToFile, ""); + + if (!node_stats_file.empty()) { + ORT_RETURN_IF_ERROR_SESSIONID_(CreateNodeStatsRecorder(node_stats_file)); + } + + session_state_->SetNodeStatsRecorder(GetNodeStatsRecorder()); +#endif + #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Don't want to pollute SessionState constructor since memory profile is enabled optionally. session_state_->SetMemoryProfiler(&memory_profiler_); @@ -2726,6 +2748,22 @@ Status InferenceSession::Run(const RunOptions& run_options, TraceLoggingWriteStop(ortrun_activity, "OrtRun"); #endif +#if !defined(ORT_MINIMAL_BUILD) + if (GetNodeStatsRecorder() != nullptr && retval.IsOK()) { + // Dump node stats if the run was successful + const auto* node_stats_recorder = GetNodeStatsRecorder(); + auto node_stats_file = session_state_->GetGraphViewer().ModelPath(); + if (node_stats_file.has_filename()) { + node_stats_file = node_stats_file.parent_path(); + } + node_stats_file /= node_stats_recorder->GetNodeStatsFileName(); + std::ofstream ofs(node_stats_file, std::ofstream::out); + ORT_ENFORCE(ofs.is_open(), "Failed to open file: ", node_stats_file); + node_stats_recorder->DumpStats(ofs); + ofs.close(); + } +#endif + // As N+1 inference runs (N for memory allocation and 1 for graph capturing) // are needed before replaying the captured graph, here run N inference runs recursively until graph captured, // so that users just need one session run to capture the graph. 
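Editor's note on the session changes above: the recorder is driven entirely by configuration. Initialize() reads kOrtSessionOptionsCollectNodeMemoryStatsToFile, creates the NodeStatsRecorder, and a successful Run() dumps the collected stats into the model's directory; the feature is compiled only in non-minimal builds. A hedged usage sketch through the public C++ API follows, assuming the new key behaves like the other entries in onnxruntime_session_options_config_keys.h; the model path and output file name are placeholders.

// Enable node memory statistics collection for a session (illustrative sketch).
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "node_stats_demo"};
  Ort::SessionOptions so;
  // The stats file is created next to the model after a successful Run().
  so.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, "model_node_stats.txt");

  Ort::Session session{env, ORT_TSTR("model.onnx"), so};
  // ... create input tensors and call session.Run(...);
  // the per-node stats are written once Run() returns successfully.
  return 0;
}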
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index e28ff75345785..2c0c09dfd3e51 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -21,6 +21,7 @@ #include "core/framework/external_data_loader_manager.h" #include "core/framework/kernel_registry_manager.h" #include "core/framework/prepacked_weights_container.h" +#include "core/framework/resource_accountant.h" #include "core/framework/session_state.h" #include "core/framework/tuning_results.h" #include "core/framework/framework_provider_common.h" @@ -545,6 +546,31 @@ class InferenceSession { */ Status AddPrePackedWeightsContainer(PrepackedWeightsContainer* prepacked_weights_container); +#if !defined(ORT_MINIMAL_BUILD) + /** + * CreateNodeStats recorder and enable collection of node statistics that is useful + * for resource constrained partitioning and otherwise. + * + * @param node_stats_file - this file will be created at the same folder where the model file is present. + */ + Status CreateNodeStatsRecorder(const std::filesystem::path& node_stats_file); + + /** + * Returns true if collection is enabled + */ + bool IsNodeStatsCollectionEnabled() const noexcept { + return node_stats_recorder_.has_value(); + } + + /** + * NodeStatsRecorder pointer. If not present, returns nullptr + */ + NodeStatsRecorder* GetNodeStatsRecorder() noexcept { + return node_stats_recorder_.has_value() ? &*node_stats_recorder_ : nullptr; + } + +#endif + protected: #if !defined(ORT_MINIMAL_BUILD) @@ -911,6 +937,11 @@ class InferenceSession { }; CachedExecutionProviderForGraphReplay cached_execution_provider_for_graph_replay_; + +#if !defined(ORT_MINIMAL_BUILD) + // Enable nodestats collection + std::optional node_stats_recorder_; +#endif }; struct SessionIOBinding { diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index ca6950af0227a..3761b4ca0ec41 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -5,6 +5,7 @@ #include "core/session/allocator_adapters.h" #include "core/session/inference_session_utils.h" #include "core/session/IOBinding.h" +#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/framework/allocator.h" #include "core/framework/error_code_helper.h" #include "core/framework/execution_provider.h" diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index af39edae2074d..3208c5634b438 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -336,8 +336,9 @@ struct ProviderHostImpl : ProviderHost { // IExecutionProvider (direct) std::vector> IExecutionProvider__GetCapability( const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) override { - return p->IExecutionProvider::GetCapability(graph_viewer, kernel_lookup); + const IExecutionProvider::IKernelLookup& kernel_lookup, + IResourceAccountant* resource_accountant) override { + return p->IExecutionProvider::GetCapability(graph_viewer, kernel_lookup, resource_accountant); } common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override { @@ -761,6 +762,9 @@ struct ProviderHostImpl : ProviderHost { std::unique_ptr IndexedSubGraph__construct() override { return 
std::make_unique(); } void IndexedSubGraph__operator_delete(IndexedSubGraph* p) override { delete p; } + const std::vector& IndexedSubGraph__Nodes(const IndexedSubGraph* p) override { + return p->nodes; + } std::vector& IndexedSubGraph__Nodes(IndexedSubGraph* p) override { return p->nodes; } void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) override { p->SetMetaDef(std::move(meta_def_)); } @@ -768,6 +772,13 @@ struct ProviderHostImpl : ProviderHost { void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) override { p->schema_source = schema_source; } IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) override { return p->schema_source; } + void IndexedSubGraph__SetAccountant(IndexedSubGraph* p, IResourceAccountant* resource_accountant) override { + p->SetAccountant(resource_accountant); + } + void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& resource_count) override { + p->AppendNodeCost(resource_count); + } + void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) override { p->AppendNodeEmptyCost(); } // KernelDef (wrapped) void KernelDef__operator_delete(KernelDef* p) override { delete p; } diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 740c566794f15..7ac0aaa291f67 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "core/common/denormal.h" @@ -59,7 +60,6 @@ #include "gtest/gtest.h" #include "gmock/gmock.h" -using namespace std; using namespace ONNX_NAMESPACE; using namespace onnxruntime::logging; using namespace onnxruntime::concurrency; @@ -137,7 +137,8 @@ class FuseExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override { + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override { // Fuse two add into one. 
std::vector> result; std::unique_ptr sub_graph = std::make_unique(); @@ -283,7 +284,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, ProviderType allocation_provider, IExecutionProvider* gpu_provider, OrtDevice* output_device) { - unique_ptr io_binding; + std::unique_ptr io_binding; Status st = session_object.NewIOBinding(&io_binding); ASSERT_TRUE(st.IsOK()); auto input_allocator = io_binding->GetCPUAllocator(bind_provider_type); @@ -358,7 +359,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, (output_device && output_device->Type() == OrtDevice::GPU)) { #if defined(USE_CUDA) || defined(USE_ROCM) // in this case we need to copy the tensor from cuda to cpu - vector& outputs = io_binding->GetOutputs(); + std::vector& outputs = io_binding->GetOutputs(); ASSERT_EQ(1u, outputs.size()); auto& rtensor = outputs.front().Get(); auto element_type = rtensor.DataType(); @@ -388,6 +389,106 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, } } +#if 0 +namespace { +// generate random inputs +template +InlinedVector GenerateRandomInput(size_t size) { + InlinedVector values(size); + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution distribution(1, 100); + std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); + return values; +} + +template <> +InlinedVector GenerateRandomInput(size_t size) { + InlinedVector values(size); + std::random_device dev; + std::default_random_engine rng(dev()); + std::uniform_real_distribution distribution(-1.f, 1.f); + std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); + return values; +} + +template +void CreateMLValueFromRandom(const AllocatorPtr& alloc, gsl::span shape, + OrtValue& ort_value) { + const auto elements = narrow(std::accumulate(shape.begin(), shape.end(), + static_cast(1), + std::multiplies())); + const auto values = GenerateRandomInput(elements); + CreateMLValue(alloc, shape, values, &ort_value); +} + +} // namespace + +TEST(InferenceSessionTests, GenerateNodeStatsWithRandomInput) { + static constexpr const ORTCHAR_T* STAT_MODEL = + ORT_TSTR("D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_mask2d_fp32.onnx"); + + SessionOptions so; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, + "attention_mask2d_fp32_node_stats.txt")); + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(STAT_MODEL)); + ASSERT_STATUS_OK(session_object.Initialize()); + + auto allocators = TestCPUExecutionProvider()->CreatePreferredAllocators(); + auto inputs_defs = session_object.GetModelInputs(); + ASSERT_STATUS_OK(inputs_defs.first); + NameMLValMap feeds; + for (const auto* def : *inputs_defs.second) { + if (!def->Exists()) { + continue; + } + + OrtValue ml_value; + const auto* type_proto = def->TypeAsProto(); + ASSERT_TRUE(utils::HasTensorType(*type_proto)); + const auto elem_type = type_proto->tensor_type().elem_type(); + ASSERT_TRUE(utils::HasShape(*type_proto)); + const auto& tensor_shape_proto = type_proto->tensor_type().shape(); + + TensorShapeVector input_dims; + for (const auto& dim : tensor_shape_proto.dim()) { + input_dims.push_back(dim.dim_value()); + } + + switch (elem_type) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { + CreateMLValueFromRandom(allocators[0], input_dims, ml_value); + break; + } + case ONNX_NAMESPACE::TensorProto_DataType_INT32: { + CreateMLValueFromRandom(allocators[0], 
input_dims, ml_value); + break; + } + case ONNX_NAMESPACE::TensorProto_DataType_INT64: { + CreateMLValueFromRandom(allocators[0], input_dims, ml_value); + break; + } + + default: + ASSERT_TRUE(false) << "Unsupported type: " << elem_type; + } + feeds.insert_or_assign(def->Name(), std::move(ml_value)); + } + + InlinedVector output_names; + auto outputs = session_object.GetModelOutputs(); + ASSERT_STATUS_OK(outputs.first); + for (const auto& output : *outputs.second) { + output_names.push_back(output->Name()); + } + + RunOptions run_options; + std::vector fetches; + ASSERT_STATUS_OK(session_object.Run(run_options, feeds, output_names, &fetches)); +} +#endif + TEST(InferenceSessionTests, NoTimeout) { SessionOptions so; @@ -438,7 +539,7 @@ TEST(InferenceSessionTests, TestModelSerialization) { // Load model with level 0 transform level // and assert that the model has Identity nodes. SessionOptions so; - const string test_model = "testdata/transform/abs-id-max.onnx"; + const std::string test_model = "testdata/transform/abs-id-max.onnx"; so.session_logid = "InferenceSessionTests.TestModelSerialization"; so.graph_optimization_level = TransformerLevel::Default; InferenceSessionWrapper session_object_noopt{so, GetEnvironment()}; @@ -478,9 +579,9 @@ TEST(InferenceSessionTests, TestModelSerialization) { // Assert that re-feed of optimized model with default transform level results // in same runtime model as abs-id-max.onnx with TransformLevel-1. - std::ifstream model_fs_session1(so.optimized_model_filepath, ios::in | ios::binary); + std::ifstream model_fs_session1(so.optimized_model_filepath, std::ios::in | std::ios::binary); ASSERT_TRUE(model_fs_session1.good()); - std::ifstream model_fs_session2(so_opt.optimized_model_filepath, ios::in | ios::binary); + std::ifstream model_fs_session2(so_opt.optimized_model_filepath, std::ios::in | std::ios::binary); ASSERT_TRUE(model_fs_session2.good()); ASSERT_TRUE(model_fs_session1.tellg() == model_fs_session2.tellg()); model_fs_session1.seekg(0, std::ifstream::beg); @@ -499,7 +600,7 @@ TEST(InferenceSessionTests, TestModelSerialization) { #ifdef ORT_RUN_EXTERNAL_ONNX_TESTS static bool Compare(const InputDefList& f_arg, const InputDefList& s_arg) { if (f_arg.size() != s_arg.size()) { - cout << "Sizes differ: f_arg size: " << f_arg.size() << " s_arg size: " << s_arg.size() << endl; + std::cout << "Sizes differ: f_arg size: " << f_arg.size() << " s_arg size: " << s_arg.size() << std::endl; return false; } @@ -564,9 +665,9 @@ TEST(InferenceSessionTests, ModelMetadata) { } auto retval = session_object.GetModelInputs(); - cout << "weights size: " << weights.size() - << " inputs.size(): " << inputs.size() - << " from session: " << retval.second->size() << endl; + std::cout << "weights size: " << weights.size() + << " inputs.size(): " << inputs.size() + << " from session: " << retval.second->size() << std::endl; ASSERT_TRUE(retval.first.IsOK()); ASSERT_TRUE(Compare(inputs_no_weights, *retval.second)); } @@ -617,7 +718,7 @@ TEST(InferenceSessionTests, CheckRunLogger) { bool have_log_entry_with_run_tag = (std::find_if(msgs.begin(), msgs.end(), [&run_options](std::string msg) { - return msg.find(run_options.run_tag) != string::npos; + return msg.find(run_options.run_tag) != std::string::npos; }) != msgs.end()); ASSERT_TRUE(have_log_entry_with_run_tag); @@ -660,18 +761,18 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { auto size = lines.size(); ASSERT_TRUE(size > 1); - ASSERT_TRUE(lines[0].find("[") != string::npos); - 
ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos); - ASSERT_TRUE(lines[size - 1].find("]") != string::npos); + ASSERT_TRUE(lines[0].find("[") != std::string::npos); + ASSERT_TRUE(lines[1].find("model_loading_uri") != std::string::npos); + ASSERT_TRUE(lines[size - 1].find("]") != std::string::npos); std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; bool has_kernel_info = false; for (size_t i = 1; i < size - 1; ++i) { for (auto& s : tags) { - ASSERT_TRUE(lines[i].find(s) != string::npos); - has_kernel_info = has_kernel_info || lines[i].find("Kernel") != string::npos && - lines[i].find("stream") != string::npos && - lines[i].find("block_x") != string::npos; + ASSERT_TRUE(lines[i].find(s) != std::string::npos); + has_kernel_info = has_kernel_info || lines[i].find("Kernel") != std::string::npos && + lines[i].find("stream") != std::string::npos && + lines[i].find("block_x") != std::string::npos; } } @@ -717,25 +818,25 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) { auto size = lines.size(); ASSERT_TRUE(size > 1); - ASSERT_TRUE(lines[0].find("[") != string::npos); - ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos); - ASSERT_TRUE(lines[size - 1].find("]") != string::npos); + ASSERT_TRUE(lines[0].find("[") != std::string::npos); + ASSERT_TRUE(lines[1].find("model_loading_uri") != std::string::npos); + ASSERT_TRUE(lines[size - 1].find("]") != std::string::npos); std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; [[maybe_unused]] bool has_api_info = false; for (size_t i = 1; i < size - 1; ++i) { for (auto& s : tags) { - ASSERT_TRUE(lines[i].find(s) != string::npos); + ASSERT_TRUE(lines[i].find(s) != std::string::npos); #ifdef USE_CUDA - has_api_info = has_api_info || lines[i].find("Api") != string::npos && - lines[i].find("cudaLaunch") != string::npos; + has_api_info = has_api_info || lines[i].find("Api") != std::string::npos && + lines[i].find("cudaLaunch") != std::string::npos; #endif #ifdef USE_ROCM - has_api_info = has_api_info || lines[i].find("Api") != string::npos && - lines[i].find("hipLaunch") != string::npos; + has_api_info = has_api_info || lines[i].find("Api") != std::string::npos && + lines[i].find("hipLaunch") != std::string::npos; #endif #ifdef USE_WEBGPU - has_api_info = has_api_info || lines[i].find("Api") != string::npos; + has_api_info = has_api_info || lines[i].find("Api") != std::string::npos; #endif } } @@ -769,17 +870,17 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) { int count = 0; while (std::getline(profile, line)) { if (count == 0) { - ASSERT_TRUE(line.find("[") != string::npos); + ASSERT_TRUE(line.find("[") != std::string::npos); } else if (count <= 3) { for (auto& s : tags) { - ASSERT_TRUE(line.find(s) != string::npos); + ASSERT_TRUE(line.find(s) != std::string::npos); } } else { - ASSERT_TRUE(line.find("]") != string::npos); + ASSERT_TRUE(line.find("]") != std::string::npos); } if (count == 1) { - ASSERT_TRUE(line.find("mul_1_kernel_time") != string::npos); + ASSERT_TRUE(line.find("mul_1_kernel_time") != std::string::npos); } count++; } @@ -929,7 +1030,7 @@ TEST(InferenceSessionTests, ConfigureVerbosityLevel) { std::copy(msgs.begin(), msgs.end(), std::ostream_iterator(std::cout, "\n")); bool have_log_entry_with_vlog_session_msg = (std::find_if(msgs.begin(), msgs.end(), - [&](std::string msg) { return msg.find("Added input argument with name") != string::npos; }) != + [&](std::string msg) { return msg.find("Added input argument with name") != std::string::npos; }) != 
msgs.end()); ASSERT_TRUE(have_log_entry_with_vlog_session_msg); @@ -942,7 +1043,8 @@ TEST(InferenceSessionTests, ConfigureVerbosityLevel) { // ASSERT_TRUE(have_log_entry_with_vlog_run_msg); bool has_num_streams_msg = - (std::find_if(msgs.begin(), msgs.end(), [&](std::string msg) { return msg.find("Number of streams") != string::npos; }) != msgs.end()); + (std::find_if(msgs.begin(), msgs.end(), [&](std::string msg) { return msg.find("Number of streams") != + std::string::npos; }) != msgs.end()); ASSERT_TRUE(has_num_streams_msg); #endif @@ -983,7 +1085,7 @@ TEST(InferenceSessionTests, UseUserSpecifiedLoggingFunctionInSession) { #ifndef NDEBUG bool have_log_entry_with_vlog_session_msg = (std::find_if(log_msgs.begin(), log_msgs.end(), - [&](std::string msg) { return msg.find("Added input argument with name") != string::npos; }) != + [&](std::string msg) { return msg.find("Added input argument with name") != std::string::npos; }) != log_msgs.end()); ASSERT_TRUE(have_log_entry_with_vlog_session_msg); #endif @@ -996,7 +1098,7 @@ TEST(InferenceSessionTests, TestWithIstream) { InferenceSession session_object{so, GetEnvironment()}; - std::ifstream model_file_stream(MODEL_URI, ios::in | ios::binary); + std::ifstream model_file_stream(MODEL_URI, std::ios::in | std::ios::binary); ASSERT_TRUE(model_file_stream.good()); ASSERT_TRUE(session_object.Load(model_file_stream).IsOK()); ASSERT_STATUS_OK(session_object.Initialize()); @@ -1015,7 +1117,7 @@ TEST(InferenceSessionTests, TestRegisterExecutionProvider) { CPUExecutionProviderInfo epi; ASSERT_TRUE(session_object.RegisterExecutionProvider(std::make_unique(epi)).IsOK()); - std::ifstream model_file_stream(MODEL_URI, ios::in | ios::binary); + std::ifstream model_file_stream(MODEL_URI, std::ios::in | std::ios::binary); ASSERT_TRUE(model_file_stream.good()); ASSERT_TRUE(session_object.Load(model_file_stream).IsOK()); ASSERT_STATUS_OK(session_object.Initialize()); @@ -1092,13 +1194,14 @@ TEST(InferenceSessionTests, TestIOBindingReuse) { std::stringstream sstr(s1); ASSERT_TRUE(session_object.Load(sstr).IsOK()); ASSERT_STATUS_OK(session_object.Initialize()); - unique_ptr io_binding; + std::unique_ptr io_binding; Status st = session_object.NewIOBinding(&io_binding); ASSERT_TRUE(st.IsOK()); OrtValue ml_value1; - vector v1{2.f}; - CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], {1}, v1, &ml_value1); + const std::vector v1{2.f}; + const int64_t shape[] = {1}; + CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], shape, v1, &ml_value1); ASSERT_STATUS_OK(io_binding->BindOutput("foo", ml_value1)); ASSERT_TRUE(io_binding->GetOutputs().size() == 1); auto span = io_binding->GetOutputs()[0].Get().DataAsSpan(); @@ -1108,8 +1211,8 @@ TEST(InferenceSessionTests, TestIOBindingReuse) { } OrtValue ml_value2; - vector v2{3.f}; - CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], {1}, v2, &ml_value2); + const std::vector v2{3.f}; + CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], shape, v2, &ml_value2); ASSERT_STATUS_OK(io_binding->BindOutput("foo", ml_value2)); ASSERT_TRUE(io_binding->GetOutputs().size() == 1); span = io_binding->GetOutputs()[0].Get().DataAsSpan(); @@ -1651,7 +1754,7 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { run_options.run_tag = so.session_logid; std::vector dim = {1}; - std::vector va = {false}; + InlinedVector va = {false}; OrtValue ml_value_x; CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dim, va, &ml_value_x); @@ -1807,8 
+1910,9 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { OrtValue ml_value_input_0; CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dim_input_0, data_input_0, &ml_value_input_0); - std::vector dim_input_1 = {1}; - std::vector data_input_1 = {false}; + + const int64_t dim_input_1[] = {1}; + const bool data_input_1[] = {false}; OrtValue ml_value_input_1; CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dim_input_1, data_input_1, &ml_value_input_1); @@ -2047,7 +2151,7 @@ TEST(InferenceSessionTests, TestCopyToFromDevices) { // It creates and registers a dummy transformer and after session initialize // validates that this transformer was called regardless of the graph optimization level set. TEST(InferenceSessionTests, TestRegisterTransformers) { - string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; + std::string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; for (int i = static_cast(TransformerLevel::Default); i <= static_cast(TransformerLevel::MaxLevel); i++) { SessionOptions so; @@ -2126,7 +2230,7 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { tester.AddInput("data", input_shape, input_data); tester.AddOutput("output", invalid_output_shape, output_data); - const std::unordered_set excluded_provider_types = { + const std::unordered_set excluded_provider_types = { kTensorrtExecutionProvider, // Doesn't handle Unsqueeze. kOpenVINOExecutionProvider}; // Disabled temporarily. @@ -2144,7 +2248,7 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { #ifdef USE_CUDA // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { - string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; + std::string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; SessionOptions so; so.execution_mode = ExecutionMode::ORT_PARALLEL; @@ -2822,10 +2926,10 @@ TEST(InferenceSessionTests, InitializerSharing_EnsureSessionsUseUserAddedInitial std::vector input_data_vec{1., 2., 3., 4., 5., 6.}; auto allocator = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; - CreateMLValue(allocator, {3, 2}, input_data_vec, &val_to_share_from_allocator); + CreateMLValue(allocator, AsSpan({3, 2}), input_data_vec, &val_to_share_from_allocator); OrtMemoryInfo mem_info{CPU, OrtArenaAllocator}; - CreateMLValue(std::array{3, 2}, input_data_vec.data(), mem_info, &val_to_share); + CreateMLValue(AsSpan({3, 2}), input_data_vec.data(), mem_info, &val_to_share); // create sessions to share the allocator SessionOptions so1; diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index e7f8b1aaa49d8..c34b9ac84b259 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -22,6 +22,8 @@ #include "core/util/thread_utils.h" #include "gtest/gtest.h" #include "test/test_environment.h" +#include "test/optimizer/graph_transform_test_builder.h" +#include "test/util/include/test_environment.h" #include "test/util/include/default_providers.h" #include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" @@ -440,6 +442,257 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { } } +#ifdef USE_CUDA +namespace { +void BuildTestModel(Graph& graph, const std::vector& input_shape, + 
size_t approx_init_a_size, + size_t approx_init_b_size) { + ASSERT_EQ(2, input_shape.size()); + + // Create two MatMul nodes each with the initializers, that are going to + // dictate the cost of the nodes + const auto init_a_dim_0 = input_shape[1]; + const int64_t init_a_dim_1 = approx_init_a_size / input_shape[1]; + const std::vector init_a_shape = {init_a_dim_0, init_a_dim_1}; + + // This is also an A input to mm_2 + const std::vector mm_1_output_shape = {input_shape[0], init_a_shape[1]}; + + const int64_t init_b_dim_0 = mm_1_output_shape[1]; + const int64_t init_b_dim_1 = approx_init_b_size / mm_1_output_shape[1]; + const std::vector init_b_shape = {init_b_dim_0, init_b_dim_1}; + + const std::vector output_shape = {mm_1_output_shape[0], init_b_dim_1}; + + ModelTestBuilder builder(graph); + + std::optional> in_shape = input_shape; + NodeArg* model_input = builder.MakeInput(in_shape, "input"); + NodeArg* init_a = builder.MakeInitializer(init_a_shape, 1.f, 10.f); + NodeArg* mm_1_output = builder.MakeIntermediate(mm_1_output_shape); + NodeArg* init_b = builder.MakeIntermediate(init_b_shape); + NodeArg* mm_2_output = builder.MakeOutput(output_shape); + + builder.AddNode("MatMul", {model_input, init_a}, {mm_1_output}); + builder.AddNode("MatMul", {mm_1_output, init_b}, {mm_2_output}); +} +} // namespace + +// Produces node stats for the model. This requires running the model. +// TEST(SessionStateTest, TestResourceAwareParitioningSaveNodeStats) { +// +// const auto& log_manager = DefaultLoggingManager(); +// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); +// const auto& default_logger = log_manager.DefaultLogger(); +// std::unordered_map domain_to_version; +// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter +// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), +// domain_to_version, {}, default_logger); +// +// const std::vector input_shape = {1024, 1024}; +// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb +// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb +// +// auto& graph = model.MainGraph(); +// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); +// ASSERT_STATUS_OK(graph.Resolve()); +// +// auto model_proto = model.ToProto(); +// const auto model_string = model_proto.SerializeAsString(); +// std::ofstream model_file("model.onnx", std::ios::binary); +//} + +/// XXX: Optionally add resource aware parameters +/// This test can only run with CUDA present currently. 
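(For context on the stats file these tests consume: when kOrtSessionOptionsCollectNodeMemoryStatsToFile is set, the NodeStatsRecorder writes one CSV row per node next to the model file, in the form node_name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations, with all sizes in bytes, and the accountant's cost for a node is the sum of the four size columns. For example, the row GptAttention_3,30720,0,36864,165888 from the stats file added later in this series records 30 KB of inputs, no initializer memory, 36 KB of dynamically allocated outputs and about 162 KB of temporary allocations. The tests below feed such a file back in through kOrtSessionOptionsResourceCudaPartitioningSettings.)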
+TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { + const auto& log_manager = DefaultLoggingManager(); + log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); + const auto& default_logger = log_manager.DefaultLogger(); + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 16; // We can make it a parameter + Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, {}, default_logger); + + // Input Shape + const std::vector input_shape = {1024, 1024}; + constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb + constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb + + auto& graph = model.MainGraph(); + BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); + ASSERT_STATUS_OK(graph.Resolve()); + + OrtThreadPoolParams to; + to.thread_pool_size = 1; + auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); + + ExecutionProviders execution_providers; + auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); + tmp_cpu_execution_provider->SetLogger(&default_logger); + ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); + + KernelRegistryManager krm; + ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); + + DataTransferManager dtm; + ExternalDataLoaderManager edlm; + profiling::Profiler profiler; + // Try to load the model without restrictions + // and verify nodes have been placed to CUDA + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + + SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, + default_logger, profiler, sess_options); + + GraphPartitioner partitioner(krm, execution_providers); + layout_transformation::TransformLayoutFunction transform_layout_fn; + layout_transformation::DebugGraphFn debug_graph_fn; + ASSERT_STATUS_OK( + partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, + sess_options.config_options, default_logger, + GraphPartitioner::Mode::kNormal, debug_graph_fn)); + + // All nodes have been placed to CUDA + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); + } +} + +// TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { +// const auto& log_manager = DefaultLoggingManager(); +// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); +// const auto& default_logger = log_manager.DefaultLogger(); +// std::unordered_map domain_to_version; +// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter +// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), +// domain_to_version, {}, default_logger); +// +// // Input Shape +// const std::vector input_shape = {1024, 1024}; +// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb +// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb +// +// auto& graph = model.MainGraph(); +// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); +// ASSERT_STATUS_OK(graph.Resolve()); +// +// OrtThreadPoolParams to; +// to.thread_pool_size = 1; +// auto tp = 
concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); +// +// ExecutionProviders execution_providers; +// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); +// tmp_cpu_execution_provider->SetLogger(&default_logger); +// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); +// +// KernelRegistryManager krm; +// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); +// +// DataTransferManager dtm; +// ExternalDataLoaderManager edlm; +// profiling::Profiler profiler; +// // Try to load the model without restrictions +// // and verify nodes have been placed to CUDA +// SessionOptions sess_options; +// sess_options.enable_mem_pattern = false; +// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; +// sess_options.use_deterministic_compute = false; +// sess_options.enable_mem_reuse = false; +// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, +// "4206592")); +// +// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, +// default_logger, profiler, sess_options); +// +// GraphPartitioner partitioner(krm, execution_providers); +// layout_transformation::TransformLayoutFunction transform_layout_fn; +// layout_transformation::DebugGraphFn debug_graph_fn; +// ASSERT_STATUS_OK( +// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, +// sess_options.config_options, default_logger, +// GraphPartitioner::Mode::kNormal, debug_graph_fn)); +// +// // All nodes have been placed to CUDA +// const auto& graph_nodes = graph.Nodes(); +// for (const auto& node : graph_nodes) { +// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); +// } +// } + +// TEST(SessionStateTest, TestResourceAwarePartitioning_SecondNodeCutOff) { +// const auto& log_manager = DefaultLoggingManager(); +// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); +// const auto& default_logger = log_manager.DefaultLogger(); +// std::unordered_map domain_to_version; +// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter +// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), +// domain_to_version, {}, default_logger); +// +// // Input Shape +// const std::vector input_shape = {1024, 1024}; +// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb +// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb +// +// auto& graph = model.MainGraph(); +// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); +// ASSERT_STATUS_OK(graph.Resolve()); +// +// OrtThreadPoolParams to; +// to.thread_pool_size = 1; +// auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); +// +// ExecutionProviders execution_providers; +// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); +// tmp_cpu_execution_provider->SetLogger(&default_logger); +// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); +// +// KernelRegistryManager krm; +// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); +// +// DataTransferManager dtm; +// ExternalDataLoaderManager edlm; +// profiling::Profiler profiler; +// // Try to load the model without restrictions +// // and verify nodes have been placed to CUDA +// SessionOptions sess_options; +// 
sess_options.enable_mem_pattern = false; +// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; +// sess_options.use_deterministic_compute = false; +// sess_options.enable_mem_reuse = false; +// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, +// "16383")); +// +// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, +// default_logger, profiler, sess_options); +// +// GraphPartitioner partitioner(krm, execution_providers); +// layout_transformation::TransformLayoutFunction transform_layout_fn; +// layout_transformation::DebugGraphFn debug_graph_fn; +// ASSERT_STATUS_OK( +// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, +// sess_options.config_options, default_logger, +// GraphPartitioner::Mode::kNormal, debug_graph_fn)); +// +// // Second node did not make it to CUDA +// const auto& graph_nodes = graph.Nodes(); +// size_t count = 0; +// for (const auto& node : graph_nodes) { +// if (count == 0) { +// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); +// } else { +// EXPECT_TRUE(node.GetExecutionProviderType().empty()); +// } +// count++; +// } +// } + +#endif // USE_CUDA + INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStateTestP, testing::ValuesIn(param_list)); #ifndef ENABLE_TRAINING_CORE diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h index 51b02ee3e7f8c..9c5893948ff1b 100644 --- a/onnxruntime/test/framework/test_utils.h +++ b/onnxruntime/test/framework/test_utils.h @@ -32,8 +32,13 @@ namespace test { IExecutionProvider* TestCPUExecutionProvider(); template +inline void CopyVectorToTensor(gsl::span value, Tensor& tensor) { + gsl::copy(value, tensor.MutableDataAsSpan()); +} + +template inline void CopyVectorToTensor(const std::vector& value, Tensor& tensor) { - gsl::copy(gsl::make_span(value), tensor.MutableDataAsSpan()); + gsl::copy(AsSpan(value), tensor.MutableDataAsSpan()); } // vector is specialized so we need to handle it separately @@ -45,8 +50,20 @@ inline void CopyVectorToTensor(const std::vector& value, Tensor& ten } } +template +void CreateMLValue(AllocatorPtr alloc, gsl::span dims, const std::vector& value, + OrtValue* p_mlvalue) { + TensorShape shape(dims); + auto element_type = DataTypeImpl::GetType(); + Tensor::InitOrtValue(element_type, shape, std::move(alloc), *p_mlvalue); + if (!value.empty()) { + Tensor& tensor = *p_mlvalue->GetMutable(); + CopyVectorToTensor(value, tensor); + } +} + template -void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const std::vector& value, +void CreateMLValue(AllocatorPtr alloc, gsl::span dims, gsl::span value, OrtValue* p_mlvalue) { TensorShape shape(dims); auto element_type = DataTypeImpl::GetType(); @@ -58,6 +75,24 @@ void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const s } } +template +void CreateMLValue(AllocatorPtr alloc, std::initializer_list dims, gsl::span value, + OrtValue* p_mlvalue) { + CreateMLValue(alloc, AsSpan(dims), value, p_mlvalue); +} + +template +void CreateMLValue(AllocatorPtr alloc, gsl::span dims, std::initializer_list value, + OrtValue* p_mlvalue) { + CreateMLValue(alloc, dims, AsSpan(value), p_mlvalue); +} + +template +void CreateMLValue(AllocatorPtr alloc, std::initializer_list dims, std::initializer_list value, + OrtValue* p_mlvalue) { + CreateMLValue(alloc, AsSpan(dims), AsSpan(value), p_mlvalue); +} + // Lifetime of data_buffer should be managed by the caller. 
template void CreateMLValue(gsl::span dims, T* data_buffer, const OrtMemoryInfo& info, @@ -68,7 +103,7 @@ void CreateMLValue(gsl::span dims, T* data_buffer, const OrtMemor } template -void AllocateMLValue(AllocatorPtr alloc, const std::vector& dims, OrtValue* p_mlvalue) { +void AllocateMLValue(AllocatorPtr alloc, gsl::span dims, OrtValue* p_mlvalue) { TensorShape shape(dims); auto element_type = DataTypeImpl::GetType(); Tensor::InitOrtValue(element_type, shape, std::move(alloc), *p_mlvalue); diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc index 2e073def5d643..b753bc386d722 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc @@ -110,7 +110,8 @@ DataLayout InternalTestingExecutionProvider::GetPreferredLayout() const { std::vector> InternalTestingExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, - const IKernelLookup& kernel_lookup) const { + const IKernelLookup& kernel_lookup, + IResourceAccountant* /* resource_accountant */) const { // find nodes that have ops in our supported list std::unordered_set supported_static_nodes; std::unordered_set supported_compiled_nodes; diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h index 6615eb82f2b05..d2ed8259ee974 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h +++ b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h @@ -19,7 +19,8 @@ class InternalTestingExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/, + IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index 4feeb5f830508..c8ed550c0625c 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -281,7 +281,7 @@ static BackendSupport GetHTPSupport(const onnxruntime::logging::Logger& logger) {{"backend_path", "QnnHtp.dll"}}); qnn_ep->SetLogger(&logger); - auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup); + auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, nullptr); return result.empty() ? BackendSupport::UNSUPPORTED : BackendSupport::SUPPORTED; } @@ -344,7 +344,7 @@ static BackendSupport GetCPUSupport(const onnxruntime::logging::Logger& logger) {{"backend_path", "QnnCpu.dll"}}); qnn_ep->SetLogger(&logger); - auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup); + auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, nullptr); return result.empty() ? 
BackendSupport::UNSUPPORTED : BackendSupport::SUPPORTED; } diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index dab73d3824d3b..882ef80d76441 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -4768,3 +4768,16 @@ TEST(CApiTest, OrtCustomOp_GetInPlace) { ASSERT_EQ(len, static_cast(2)); mock_gqa.ReleaseAliasMap(input_index, output_index); } + +/*TEST(CApiTest, RunWithNodeStats) { + Ort::Env env(ORT_LOGGING_LEVEL_INFO); + constexpr const ORTCHAR_T* model_path = TSTR("testdata/attention_mask2d_fp32.onnx"); + + Ort::SessionOptions session_options; + session_options.DisableCpuMemArena(); + session_options.DisableMemPattern(); + session_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, + "D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_memory.txt"); + + Ort::Session session(env, model_path, session_options); +}*/ \ No newline at end of file From b8f6b7b6ceb9c0e899686215592e5e1b4ffd927a Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 31 Jan 2025 11:45:08 -0800 Subject: [PATCH 2/7] Adjust CSV parsing --- onnxruntime/core/framework/graph_partitioner.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 8a01e3973cdc6..b955e05ec803b 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -121,8 +121,11 @@ InlinedHashMap LoadNodeAllocationStats(const s std::string line; // Read and load a CSV file line by line while (std::getline(file, line)) { - auto splits = utils::SplitString(line, ",", false); + auto splits = utils::SplitString(line, ",", true); ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); + if (splits[0].empty()) { + continue; + } std::string node_name{splits[0]}; size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); @@ -1101,7 +1104,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, kOrtSessionOptionsResourceCudaPartitioningSettings, ""); if (!resource_partitioning_settings.empty()) { auto splits = utils::SplitString(resource_partitioning_settings, ",", false); - if (splits.size() == 4) { + if (splits.size() == 2) { SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); cuda_memory_limit *= 1024; // to bytes auto node_to_stats = LoadNodeAllocationStats(graph.ModelPath(), splits[1]); From b1d1467bea9d9c159a05e4877c1fd6a6a1b8c4fb Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 5 Feb 2025 18:41:35 -0800 Subject: [PATCH 3/7] Tests pass --- .../core/framework/op_kernel_context.h | 2 +- .../core/framework/resource_accountant.h | 24 +- .../core/framework/graph_partitioner.cc | 104 +------ .../framework/op_kernel_context_internal.h | 4 + .../core/framework/resource_accountant.cc | 148 ++++++++- .../core/framework/sequential_executor.cc | 46 +-- .../shared_library/provider_interfaces.h | 1 - onnxruntime/core/session/inference_session.cc | 14 +- .../test/framework/inference_session_test.cc | 100 ------ .../test/framework/session_state_test.cc | 286 +++++------------- onnxruntime/test/shared_lib/test_inference.cc | 79 ++++- .../tiny_gpt2_beamsearch_node_stats.txt | 56 ++++ 12 files changed, 407 insertions(+), 457 deletions(-) create mode 100644 
onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index a67d7b8ae0174..e9a1490dedc34 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -192,7 +192,7 @@ class OpKernelContext { onnxruntime::NodeIndex GetNodeIndex() const; virtual const OrtValue* GetInputMLValue(int index) const; - const OrtValue* GetImplicitInputMLValue(int index) const; + virtual const OrtValue* GetImplicitInputMLValue(int index) const; OrtValue* GetOutputMLValue(int index); #ifdef ENABLE_ATEN diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h index 982b37c969fe7..1f2e9ea5ccfb0 100644 --- a/include/onnxruntime/core/framework/resource_accountant.h +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -7,15 +7,19 @@ #include #include #include +#include #include #include "core/common/common.h" +#include "core/common/inlined_containers_fwd.h" namespace onnxruntime { +struct ConfigOptions; + // Common holder for potentially different resource accounting // for different EPs -using ResourceCount = std::variant; +using ResourceCount = std::variant; /// /// This class is used for graph partitioning by EPs @@ -53,6 +57,9 @@ class IResourceAccountant { std::optional threshold_; }; +// A map of Ep Type to a resource accountant for this EP +using ResourceAccountantMap = InlinedHashMap>; + // This struct keeps accounting of the memory allocation stats // for a kernel during runtime if enabled. struct NodeAllocationStats { @@ -86,13 +93,22 @@ class NodeStatsRecorder { const std::filesystem::path& GetNodeStatsFileName() const noexcept; + bool ShouldAccountFor(const std::string& input_output_name) const; + + void ResetPerRunNameDeduper(); + void ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats); - void DumpStats(std::ostream& os) const; + void DumpStats(const std::filesystem::path& model_path) const; + + static Status CreateAccountants( + const ConfigOptions& config_options, + const std::filesystem::path& model_path, + std::optional& acc_map); private: - // We would like to hide certain things that may not compile - // with some device compilers + void DumpStats(std::ostream& os) const; + struct Impl; std::unique_ptr impl_; }; diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index d6eaea70565a7..08ddfd872ca78 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -5,7 +5,6 @@ #include #include -#include #include "core/common/inlined_containers.h" #include "core/common/string_utils.h" @@ -54,9 +53,6 @@ namespace onnxruntime { namespace { -// A map of Ep Type to a resource accountant for this EP -using ResourceAccountantMap = InlinedHashMap>; - // contains some common parameters used by the partitioning helper functions struct PartitionParams { std::reference_wrapper graph; @@ -68,75 +64,6 @@ struct PartitionParams { std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) }; - -// Use this accountant if your resource can be counted with size_t type -class SizeTAccountant : public IResourceAccountant { - public: - SizeTAccountant() = default; - ~SizeTAccountant() = default; - - explicit SizeTAccountant(size_t 
threshold, InlinedHashMap&& node_stats) - : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} - - ResourceCount GetConsumedAmount() const noexcept override { - return consumed_amount_; - } - void AddConsumedAmount(const ResourceCount& amount) noexcept override { - if (std::holds_alternative(amount)) { - consumed_amount_ += std::get(amount); - } - } - void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { - if (std::holds_alternative(amount)) { - consumed_amount_ -= std::get<0>(amount); - } - } - - ResourceCount ComputeResourceCount(const std::string& node_name) const override { - auto hit = node_stats_.find(node_name); - if (hit != node_stats_.end()) { - const auto& stats = hit->second; - return stats.input_sizes + stats.initializers_sizes + - stats.total_dynamic_sizes + stats.total_temp_allocations; - } - return static_cast(0U); - } - - private: - size_t consumed_amount_ = 0; - InlinedHashMap node_stats_; -}; - -InlinedHashMap LoadNodeAllocationStats(const std::filesystem::path& model_path, - const std::filesystem::path& file_name) { - InlinedHashMap node_stats; - std::filesystem::path file_path = model_path; - if (file_path.has_filename()) { - file_path = file_path.parent_path(); - } - - file_path /= file_name; - - std::ifstream file(file_path); - ORT_ENFORCE(file.is_open(), "Failed to open file ", file_path); - std::string line; - // Read and load a CSV file line by line - while (std::getline(file, line)) { - auto splits = utils::SplitString(line, ",", true); - ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); - if (splits[0].empty()) { - continue; - } - std::string node_name{splits[0]}; - size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); - size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); - size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); - size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); - node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, - total_dynamic_sizes, total_temp_allocations}); - } - return node_stats; -} } // namespace #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -848,7 +775,8 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager, - const ResourceAccountantMap& acc_map, const logging::Logger& logger) { + const std::optional& acc_map, + const logging::Logger& logger) { bool modified_graph = false; auto& graph = partition_params.graph.get(); @@ -861,9 +789,11 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, // process full graph with each EP for (const auto& ep : execution_providers) { IResourceAccountant* resource_accountant = nullptr; - auto hit = acc_map.find(ep->Type()); - if (hit != acc_map.end()) { - resource_accountant = hit->second.get(); + if (acc_map.has_value()) { + auto hit = acc_map->find(ep->Type()); + if (hit != acc_map->end()) { + resource_accountant = hit->second.get(); + } } ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(graph, func_mgr, kernel_registry_manager, fused_kernel_registry, *ep, mode, fused_node_unique_id, @@ -1114,24 +1044,12 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, #endif // !defined(ORT_MINIMAL_BUILD) || 
defined(ORT_EXTENDED_MINIMAL_BUILD) - // We use this only if Resource Aware Partitioning is enabled for any of the EPs - ResourceAccountantMap ep_acc_map; - // Zero, it is disabled by default - const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( - kOrtSessionOptionsResourceCudaPartitioningSettings, ""); - if (!resource_partitioning_settings.empty()) { - auto splits = utils::SplitString(resource_partitioning_settings, ",", false); - if (splits.size() == 2) { - SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); - cuda_memory_limit *= 1024; // to bytes - auto node_to_stats = LoadNodeAllocationStats(graph.ModelPath(), splits[1]); - ep_acc_map[kCudaExecutionProvider] = std::make_unique(cuda_memory_limit, - std::move(node_to_stats)); - } - } - if (mode == Mode::kNormal || mode == Mode::kAssignOnly) { #if !defined(ORT_MINIMAL_BUILD) + // We use this only if Resource Aware Partitioning is enabled for any of the EPs + std::optional ep_acc_map; + ORT_RETURN_IF_ERROR(NodeStatsRecorder::CreateAccountants(config_options, graph.ModelPath(), ep_acc_map)); + ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, ep_acc_map, logger)); diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index c970243ba461e..64932dce50917 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -59,6 +59,10 @@ class OpKernelContextInternal : public OpKernelContext { return OpKernelContext::GetInputMLValue(index); } + const OrtValue* GetImplicitInputMLValue(int index) const override { + return OpKernelContext::GetImplicitInputMLValue(index); + } + OrtValue* GetOutputMLValue(int index) { return OpKernelContext::GetOutputMLValue(index); } diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 5c2d4feaaf126..786da13e69458 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -2,33 +2,86 @@ // Licensed under the MIT License. 
#include "core/framework/resource_accountant.h" + #include "core/common/inlined_containers.h" +#include "core/common/safeint.h" +#include "core/common/string_utils.h" + +#include "core/framework/config_options.h" +#include "core/graph/constants.h" +#include "core/session/onnxruntime_session_options_config_keys.h" -#include +#include namespace onnxruntime { +// Use this accountant if your resource can be counted with size_t type +class SizeTAccountant : public IResourceAccountant { + public: + SizeTAccountant() = default; + ~SizeTAccountant() = default; + + explicit SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} + + ResourceCount GetConsumedAmount() const noexcept override { + return consumed_amount_; + } + void AddConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ += std::get(amount); + } + } + void RemoveConsumedAmount(const ResourceCount& amount) noexcept override { + if (std::holds_alternative(amount)) { + consumed_amount_ -= std::get<0>(amount); + } + } + + ResourceCount ComputeResourceCount(const std::string& node_name) const override { + auto hit = node_stats_.find(node_name); + if (hit != node_stats_.end()) { + const auto& stats = hit->second; + return stats.input_sizes + stats.initializers_sizes + + stats.total_dynamic_sizes + stats.total_temp_allocations; + } + return static_cast(0U); + } + + private: + size_t consumed_amount_ = 0; + InlinedHashMap node_stats_; +}; + struct NodeStatsRecorder::Impl { - std::filesystem::path node_stats_path_; + std::filesystem::path node_stats_path; // This is a node name to allocation stats map - InlinedHashMap node_stats_; - mutable std::mutex mut_; + InlinedHashMap node_stats; + // Keeps track of nodes for which input/output sizes are accounted + InlinedHashSet input_output_accounted; }; NodeStatsRecorder::NodeStatsRecorder(const std::filesystem::path& node_stats_path) : impl_(std::make_unique()) { - impl_->node_stats_path_ = node_stats_path; + impl_->node_stats_path = node_stats_path; } NodeStatsRecorder::~NodeStatsRecorder() = default; const std::filesystem::path& NodeStatsRecorder::GetNodeStatsFileName() const noexcept { - return impl_->node_stats_path_; + return impl_->node_stats_path; +} + +bool NodeStatsRecorder::ShouldAccountFor(const std::string& input_output_name) const { + return impl_->input_output_accounted.insert(input_output_name).second; +} + +void NodeStatsRecorder::ResetPerRunNameDeduper() { + impl_->input_output_accounted.clear(); } void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const NodeAllocationStats& stats) { - std::lock_guard lock(impl_->mut_); - auto result = impl_->node_stats_.emplace(node_name, stats); + auto result = impl_->node_stats.emplace(node_name, stats); if (!result.second) { // Node already exists, update the stats result.first->second.UpdateIfGreater(stats); @@ -36,12 +89,87 @@ void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const Node } void NodeStatsRecorder::DumpStats(std::ostream& os) const { - std::lock_guard lock(impl_->mut_); - for (const auto& [name, stats] : impl_->node_stats_) { + for (const auto& [name, stats] : impl_->node_stats) { os << name << "," << stats.input_sizes << "," << stats.initializers_sizes << "," << stats.total_dynamic_sizes << "," << stats.total_temp_allocations << "\n"; } } +void NodeStatsRecorder::DumpStats(const std::filesystem::path& model_path) const { + auto node_stats_file 
= model_path; + if (node_stats_file.has_filename()) { + node_stats_file = node_stats_file.parent_path(); + } + node_stats_file /= GetNodeStatsFileName(); + std::ofstream ofs(node_stats_file, std::ofstream::out); + ORT_ENFORCE(ofs.is_open(), "Failed to open file: ", node_stats_file); + DumpStats(ofs); + ofs.close(); +} + +static Status LoadNodeAllocationStats( + const std::filesystem::path& model_path, const std::filesystem::path& file_name, + InlinedHashMap& result) { + InlinedHashMap node_stats; + std::filesystem::path file_path = model_path; + if (file_path.has_filename()) { + file_path = file_path.parent_path(); + } + + file_path /= file_name; + + std::ifstream file(file_path); + ORT_RETURN_IF_NOT(file.is_open(), "Failed to open file ", file_path); + std::string line; + // Read and load a CSV file line by line + while (std::getline(file, line)) { + auto splits = utils::SplitString(line, ",", true); + ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); + if (splits[0].empty()) { + continue; + } + std::string node_name{splits[0]}; + size_t input_sizes = SafeInt(std::stoull(std::string{splits[1]})); + size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); + size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); + size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); + node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, + total_dynamic_sizes, total_temp_allocations}); + } + + result.swap(node_stats); + return Status::OK(); +} + +Status NodeStatsRecorder::CreateAccountants( + const ConfigOptions& config_options, + const std::filesystem::path& model_path, + std::optional& acc_map) { + // Check if CUDA partitioning settings are provided + const std::string resource_partitioning_settings = config_options.GetConfigOrDefault( + kOrtSessionOptionsResourceCudaPartitioningSettings, ""); + + if (!resource_partitioning_settings.empty()) { + auto splits = utils::SplitString(resource_partitioning_settings, ",", false); + if (splits.size() == 2) { + SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); + cuda_memory_limit *= 1024; // to bytes + + InlinedHashMap loaded_stats; + ORT_RETURN_IF_ERROR(LoadNodeAllocationStats(model_path, splits[1], loaded_stats)); + + std::optional result; + auto& map = result.emplace(); + + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(cuda_memory_limit, + std::move(loaded_stats))); + acc_map = std::move(result); + } + } + + return Status::OK(); +} + } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 8a7564c7d4236..35ae33328837c 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -494,38 +494,42 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, #if !defined(ORT_MINIMAL_BUILD) auto* node_stats_recorder = ctx.GetSessionState().GetNodeStatsRecorder(); if (node_stats_recorder != nullptr) { + const auto& node = p_kernel->Node(); + const OpKernelInfo& op_kernel_info = p_kernel->Info(); + const auto input_defs = node.InputDefs(); + // Lets first check if any inputs are initializers, // if so we need to account for their memory usage. 
- const auto& const_initializers = ctx.GetSessionState().GetConstantInitializedTensors(); SafeInt initializers_size = 0; SafeInt input_sizes = 0; for (int i = 0, lim = kernel_ctx.InputCount(); i < lim; ++i) { // Need to get ort_value_index for each input. - int ort_vaue_index = kernel_ctx.GetOrtValueIndexForInput(i); - auto hit = const_initializers.find(ort_vaue_index); - if (hit != const_initializers.end()) { - const auto& ort_value = hit->second; - initializers_size += ort_value.Get().SizeInBytes(); - } else { - // If the input is not an initializer, we account it as something that had to be - // on the same device with this kernel - const OrtValue* ort_value = kernel_ctx.GetInputMLValue(i); - if (ort_value != nullptr && ort_value->IsAllocated() && ort_value->IsTensor()) { - input_sizes += ort_value->Get().SizeInBytes(); + const OrtValue* p_input = kernel_ctx.GetInputMLValue(i); + if (p_input != nullptr && p_input->IsAllocated() && p_input->IsTensor()) { + const auto& input_name = input_defs[i]->Name(); + if (node_stats_recorder->ShouldAccountFor(input_name)) { + const Tensor* p_tensor = nullptr; + const bool is_constant = op_kernel_info.TryGetConstantInput(i, &p_tensor); + if (!is_constant) { + p_tensor = &p_input->Get(); + } + input_sizes += p_tensor->SizeInBytes(); } } } - // XXX: Should we account for implicit inputs? - - // Get outputs and see if any were allocated dynamically + // Get outputs and see if anything were allocated dynamically + const auto output_defs = node.OutputDefs(); SafeInt total_dynamic_sizes = 0; const auto& exec_frame = ctx.GetExecutionFrame(); for (int i = 0, lim = kernel_ctx.OutputCount(); i < lim; ++i) { - int ort_vaue_index = kernel_ctx.GetOrtValueIndexForOutput(i); - auto maybe_val = exec_frame.GetOrtValueDynamicAllocation(ort_vaue_index); - if (maybe_val.has_value()) { - total_dynamic_sizes += *maybe_val; + const OrtValue* p_output = kernel_ctx.GetOutputMLValue(i); + if (p_output != nullptr && p_output->IsAllocated() && p_output->IsTensor()) { + int ort_value_index = kernel_ctx.GetOrtValueIndexForOutput(i); + auto maybe_val = exec_frame.GetOrtValueDynamicAllocation(ort_value_index); + if (maybe_val.has_value() && node_stats_recorder->ShouldAccountFor(output_defs[i]->Name())) { + total_dynamic_sizes += *maybe_val; + } } } @@ -541,8 +545,8 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, } // Record node allocation stats - const auto& node = p_kernel->Node(); - node_stats_recorder->ReportNodeStats(node.Name(), node_stats); + const auto& name = (node.Name().empty()) ? 
node.OpType() : node.Name(); + node_stats_recorder->ReportNodeStats(name, node_stats); } #endif #endif diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index e8fe5428612d4..059a722958118 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -9,7 +9,6 @@ // Public wrappers around internal ort interfaces (currently) #include "core/providers/shared_library/provider_host_api.h" - #include "core/common/inlined_containers_fwd.h" #include "core/framework/resource_accountant.h" #include "core/providers/shared/common.h" diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index d2a5b9339eab7..fb0fcd55ffc63 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2749,18 +2749,10 @@ Status InferenceSession::Run(const RunOptions& run_options, #endif #if !defined(ORT_MINIMAL_BUILD) - if (GetNodeStatsRecorder() != nullptr && retval.IsOK()) { + if (node_stats_recorder_.has_value() && retval.IsOK()) { // Dump node stats if the run was successful - const auto* node_stats_recorder = GetNodeStatsRecorder(); - auto node_stats_file = session_state_->GetGraphViewer().ModelPath(); - if (node_stats_file.has_filename()) { - node_stats_file = node_stats_file.parent_path(); - } - node_stats_file /= node_stats_recorder->GetNodeStatsFileName(); - std::ofstream ofs(node_stats_file, std::ofstream::out); - ORT_ENFORCE(ofs.is_open(), "Failed to open file: ", node_stats_file); - node_stats_recorder->DumpStats(ofs); - ofs.close(); + node_stats_recorder_->DumpStats(session_state_->GetGraphViewer().ModelPath()); + node_stats_recorder_->ResetPerRunNameDeduper(); } #endif diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 7ac0aaa291f67..1b06eb55afbd2 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -389,106 +389,6 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, } } -#if 0 -namespace { -// generate random inputs -template -InlinedVector GenerateRandomInput(size_t size) { - InlinedVector values(size); - std::random_device dev; - std::mt19937 rng(dev()); - std::uniform_int_distribution distribution(1, 100); - std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); - return values; -} - -template <> -InlinedVector GenerateRandomInput(size_t size) { - InlinedVector values(size); - std::random_device dev; - std::default_random_engine rng(dev()); - std::uniform_real_distribution distribution(-1.f, 1.f); - std::generate(values.begin(), values.end(), [&]() { return static_cast(distribution(rng)); }); - return values; -} - -template -void CreateMLValueFromRandom(const AllocatorPtr& alloc, gsl::span shape, - OrtValue& ort_value) { - const auto elements = narrow(std::accumulate(shape.begin(), shape.end(), - static_cast(1), - std::multiplies())); - const auto values = GenerateRandomInput(elements); - CreateMLValue(alloc, shape, values, &ort_value); -} - -} // namespace - -TEST(InferenceSessionTests, GenerateNodeStatsWithRandomInput) { - static constexpr const ORTCHAR_T* STAT_MODEL = - ORT_TSTR("D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_mask2d_fp32.onnx"); - - SessionOptions so; - 
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, - "attention_mask2d_fp32_node_stats.txt")); - InferenceSession session_object{so, GetEnvironment()}; - ASSERT_STATUS_OK(session_object.Load(STAT_MODEL)); - ASSERT_STATUS_OK(session_object.Initialize()); - - auto allocators = TestCPUExecutionProvider()->CreatePreferredAllocators(); - auto inputs_defs = session_object.GetModelInputs(); - ASSERT_STATUS_OK(inputs_defs.first); - NameMLValMap feeds; - for (const auto* def : *inputs_defs.second) { - if (!def->Exists()) { - continue; - } - - OrtValue ml_value; - const auto* type_proto = def->TypeAsProto(); - ASSERT_TRUE(utils::HasTensorType(*type_proto)); - const auto elem_type = type_proto->tensor_type().elem_type(); - ASSERT_TRUE(utils::HasShape(*type_proto)); - const auto& tensor_shape_proto = type_proto->tensor_type().shape(); - - TensorShapeVector input_dims; - for (const auto& dim : tensor_shape_proto.dim()) { - input_dims.push_back(dim.dim_value()); - } - - switch (elem_type) { - case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - CreateMLValueFromRandom(allocators[0], input_dims, ml_value); - break; - } - case ONNX_NAMESPACE::TensorProto_DataType_INT32: { - CreateMLValueFromRandom(allocators[0], input_dims, ml_value); - break; - } - case ONNX_NAMESPACE::TensorProto_DataType_INT64: { - CreateMLValueFromRandom(allocators[0], input_dims, ml_value); - break; - } - - default: - ASSERT_TRUE(false) << "Unsupported type: " << elem_type; - } - feeds.insert_or_assign(def->Name(), std::move(ml_value)); - } - - InlinedVector output_names; - auto outputs = session_object.GetModelOutputs(); - ASSERT_STATUS_OK(outputs.first); - for (const auto& output : *outputs.second) { - output_names.push_back(output->Name()); - } - - RunOptions run_options; - std::vector fetches; - ASSERT_STATUS_OK(session_object.Run(run_options, feeds, output_names, &fetches)); -} -#endif - TEST(InferenceSessionTests, NoTimeout) { SessionOptions so; diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index c34b9ac84b259..b6b915f90d99a 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -443,83 +443,21 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { } #ifdef USE_CUDA -namespace { -void BuildTestModel(Graph& graph, const std::vector& input_shape, - size_t approx_init_a_size, - size_t approx_init_b_size) { - ASSERT_EQ(2, input_shape.size()); - - // Create two MatMul nodes each with the initializers, that are going to - // dictate the cost of the nodes - const auto init_a_dim_0 = input_shape[1]; - const int64_t init_a_dim_1 = approx_init_a_size / input_shape[1]; - const std::vector init_a_shape = {init_a_dim_0, init_a_dim_1}; - - // This is also an A input to mm_2 - const std::vector mm_1_output_shape = {input_shape[0], init_a_shape[1]}; - - const int64_t init_b_dim_0 = mm_1_output_shape[1]; - const int64_t init_b_dim_1 = approx_init_b_size / mm_1_output_shape[1]; - const std::vector init_b_shape = {init_b_dim_0, init_b_dim_1}; - const std::vector output_shape = {mm_1_output_shape[0], init_b_dim_1}; - - ModelTestBuilder builder(graph); - - std::optional> in_shape = input_shape; - NodeArg* model_input = builder.MakeInput(in_shape, "input"); - NodeArg* init_a = builder.MakeInitializer(init_a_shape, 1.f, 10.f); - NodeArg* mm_1_output = builder.MakeIntermediate(mm_1_output_shape); - NodeArg* init_b = 
builder.MakeIntermediate(init_b_shape); - NodeArg* mm_2_output = builder.MakeOutput(output_shape); +namespace { - builder.AddNode("MatMul", {model_input, init_a}, {mm_1_output}); - builder.AddNode("MatMul", {mm_1_output, init_b}, {mm_2_output}); -} -} // namespace +using ParitionVerifierFn = std::function; -// Produces node stats for the model. This requires running the model. -// TEST(SessionStateTest, TestResourceAwareParitioningSaveNodeStats) { -// -// const auto& log_manager = DefaultLoggingManager(); -// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); -// const auto& default_logger = log_manager.DefaultLogger(); -// std::unordered_map domain_to_version; -// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter -// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), -// domain_to_version, {}, default_logger); -// -// const std::vector input_shape = {1024, 1024}; -// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb -// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb -// -// auto& graph = model.MainGraph(); -// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); -// ASSERT_STATUS_OK(graph.Resolve()); -// -// auto model_proto = model.ToProto(); -// const auto model_string = model_proto.SerializeAsString(); -// std::ofstream model_file("model.onnx", std::ios::binary); -//} - -/// XXX: Optionally add resource aware parameters -/// This test can only run with CUDA present currently. -TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { +void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, + const SessionOptions& sess_options, + const ParitionVerifierFn& verifier_fn) { const auto& log_manager = DefaultLoggingManager(); log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); const auto& default_logger = log_manager.DefaultLogger(); - std::unordered_map domain_to_version; - domain_to_version[kOnnxDomain] = 16; // We can make it a parameter - Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), - domain_to_version, {}, default_logger); + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, default_logger)); - // Input Shape - const std::vector input_shape = {1024, 1024}; - constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb - constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb - - auto& graph = model.MainGraph(); - BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); + Graph& graph = model->MainGraph(); ASSERT_STATUS_OK(graph.Resolve()); OrtThreadPoolParams to; @@ -537,15 +475,8 @@ TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { DataTransferManager dtm; ExternalDataLoaderManager edlm; profiling::Profiler profiler; - // Try to load the model without restrictions - // and verify nodes have been placed to CUDA - SessionOptions sess_options; - sess_options.enable_mem_pattern = false; - sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; - sess_options.use_deterministic_compute = false; - sess_options.enable_mem_reuse = false; - SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, + SessionState session_state(model->MainGraph(), execution_providers, tp.get(), nullptr, dtm, edlm, default_logger, profiler, sess_options); GraphPartitioner partitioner(krm, execution_providers); @@ -556,140 +487,75 @@ TEST(SessionStateTest, 
TestResourceAwarePartitioning_NoLimit) { sess_options.config_options, default_logger, GraphPartitioner::Mode::kNormal, debug_graph_fn)); - // All nodes have been placed to CUDA - const auto& graph_nodes = graph.Nodes(); - for (const auto& node : graph_nodes) { - EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); - } + verifier_fn(graph); +} +} // namespace + +TEST(SessionStateTest, TestResourceAwarePartitioning_NoLimit) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); + + // Try to load the model without restrictions + // and verify nodes have been placed to CUDA + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); + } + }); +} + +TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); + constexpr const char* limit_setting = "10000,tiny_gpt2_beamsearch_node_stats.txt"; + + // Large limit, all nodes are still assigned + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( + kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting)); + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + for (const auto& node : graph_nodes) { + EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); + } + }); } -// TEST(SessionStateTest, TestResourceAwarePartitioning_LargeLimit) { -// const auto& log_manager = DefaultLoggingManager(); -// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); -// const auto& default_logger = log_manager.DefaultLogger(); -// std::unordered_map domain_to_version; -// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter -// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), -// domain_to_version, {}, default_logger); -// -// // Input Shape -// const std::vector input_shape = {1024, 1024}; -// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb -// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb -// -// auto& graph = model.MainGraph(); -// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); -// ASSERT_STATUS_OK(graph.Resolve()); -// -// OrtThreadPoolParams to; -// to.thread_pool_size = 1; -// auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); -// -// ExecutionProviders execution_providers; -// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); -// tmp_cpu_execution_provider->SetLogger(&default_logger); -// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); -// -// KernelRegistryManager krm; -// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); -// -// DataTransferManager dtm; 
-// ExternalDataLoaderManager edlm; -// profiling::Profiler profiler; -// // Try to load the model without restrictions -// // and verify nodes have been placed to CUDA -// SessionOptions sess_options; -// sess_options.enable_mem_pattern = false; -// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; -// sess_options.use_deterministic_compute = false; -// sess_options.enable_mem_reuse = false; -// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, -// "4206592")); -// -// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, -// default_logger, profiler, sess_options); -// -// GraphPartitioner partitioner(krm, execution_providers); -// layout_transformation::TransformLayoutFunction transform_layout_fn; -// layout_transformation::DebugGraphFn debug_graph_fn; -// ASSERT_STATUS_OK( -// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, -// sess_options.config_options, default_logger, -// GraphPartitioner::Mode::kNormal, debug_graph_fn)); -// -// // All nodes have been placed to CUDA -// const auto& graph_nodes = graph.Nodes(); -// for (const auto& node : graph_nodes) { -// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); -// } -// } - -// TEST(SessionStateTest, TestResourceAwarePartitioning_SecondNodeCutOff) { -// const auto& log_manager = DefaultLoggingManager(); -// log_manager.SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE); -// const auto& default_logger = log_manager.DefaultLogger(); -// std::unordered_map domain_to_version; -// domain_to_version[kOnnxDomain] = 16; // We can make it a parameter -// Model model("LargeModel", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), -// domain_to_version, {}, default_logger); -// -// // Input Shape -// const std::vector input_shape = {1024, 1024}; -// constexpr const size_t approx_init_a_size = 1024 * 1024; // 1Mb -// constexpr const size_t approx_init_b_size = 1024 * 1024; // 1Mb -// -// auto& graph = model.MainGraph(); -// BuildTestModel(graph, input_shape, approx_init_a_size, approx_init_b_size); -// ASSERT_STATUS_OK(graph.Resolve()); -// -// OrtThreadPoolParams to; -// to.thread_pool_size = 1; -// auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); -// -// ExecutionProviders execution_providers; -// auto tmp_cpu_execution_provider = DefaultCudaExecutionProvider(); -// tmp_cpu_execution_provider->SetLogger(&default_logger); -// ASSERT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cpu_execution_provider))); -// -// KernelRegistryManager krm; -// ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); -// -// DataTransferManager dtm; -// ExternalDataLoaderManager edlm; -// profiling::Profiler profiler; -// // Try to load the model without restrictions -// // and verify nodes have been placed to CUDA -// SessionOptions sess_options; -// sess_options.enable_mem_pattern = false; -// sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; -// sess_options.use_deterministic_compute = false; -// sess_options.enable_mem_reuse = false; -// ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsResourceCudaPartitioningSettings, -// "16383")); -// -// SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, -// default_logger, profiler, sess_options); -// -// GraphPartitioner partitioner(krm, execution_providers); 
-// layout_transformation::TransformLayoutFunction transform_layout_fn; -// layout_transformation::DebugGraphFn debug_graph_fn; -// ASSERT_STATUS_OK( -// partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, -// sess_options.config_options, default_logger, -// GraphPartitioner::Mode::kNormal, debug_graph_fn)); -// -// // Second node did not make it to CUDA -// const auto& graph_nodes = graph.Nodes(); -// size_t count = 0; -// for (const auto& node : graph_nodes) { -// if (count == 0) { -// EXPECT_EQ(node.GetExecutionProviderType(), kCudaExecutionProvider); -// } else { -// EXPECT_TRUE(node.GetExecutionProviderType().empty()); -// } -// count++; -// } -// } +TEST(SessionStateTest, TestResourceAwarePartitioning_CPUOffloaded) { + constexpr const ORTCHAR_T* model_path = ORT_TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); + constexpr const char* limit_setting = "5000,tiny_gpt2_beamsearch_node_stats.txt"; + + // Large limit, all nodes are still assigned + SessionOptions sess_options; + sess_options.enable_mem_pattern = false; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = false; + ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry( + kOrtSessionOptionsResourceCudaPartitioningSettings, limit_setting)); + + LoadWithResourceAwarePartitioning(model_path, sess_options, [](const Graph& graph) { + const auto& graph_nodes = graph.Nodes(); + bool cpu_node_found = false; + for (const auto& node : graph_nodes) { + if (node.GetExecutionProviderType() != kCudaExecutionProvider) { + cpu_node_found = true; + break; + } + } + EXPECT_TRUE(cpu_node_found); + }); +} #endif // USE_CUDA diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 2eee6399960dd..59920487a7248 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -40,6 +40,7 @@ #endif #ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" #include #endif @@ -4778,15 +4779,81 @@ TEST(CApiTest, OrtCustomOp_GetInPlace) { mock_gqa.ReleaseAliasMap(input_index, output_index); } -/*TEST(CApiTest, RunWithNodeStats) { +#if !defined(ORT_MINIMAL_BUILD) && defined(USE_CUDA) + +TEST(CApiTest, GenerateNodeStatsFile) { Ort::Env env(ORT_LOGGING_LEVEL_INFO); - constexpr const ORTCHAR_T* model_path = TSTR("testdata/attention_mask2d_fp32.onnx"); + constexpr const ORTCHAR_T* model_path = TSTR("testdata/transformers/tiny_gpt2_beamsearch.onnx"); Ort::SessionOptions session_options; - session_options.DisableCpuMemArena(); - session_options.DisableMemPattern(); session_options.AddConfigEntry(kOrtSessionOptionsCollectNodeMemoryStatsToFile, - "D:/dev/data/FunctionsConverterProfling/HF_Mobile_Bert/attention_memory.txt"); + "tiny_gpt2_beamsearch_node_stats.txt"); + + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); + + std::vector input_ids_shape{3, 12}; + std::vector input_ids{ + 0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, + 41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, + 0, 0, 0, 52, 328, 219, 328, 206, 288, 227, 896, 328}; + + std::vector parameter_shape{1}; + std::vector max_length{20}; + std::vector min_length{1}; + std::vector num_beams{4}; + std::vector num_return_sequences{1}; + std::vector length_penalty{1.0f}; + std::vector repetition_penalty{1.0f}; + + std::vector 
expected_output_shape{input_ids_shape[0], num_return_sequences[0], max_length[0]}; + std::vector expected_output{ + 0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 131, 131, 131, 181, 638, 638, 638, 638, + 41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, 292, 292, 292, 292, 292, 292, 292, 292, + 0, 0, 0, 52, 328, 219, 328, 206, 288, 227, 896, 328, 328, 669, 669, 669, 669, 669, 669, 669}; + + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + auto input_ids_tensor = Ort::Value::CreateTensor( + info, input_ids.data(), input_ids.size(), input_ids_shape.data(), input_ids_shape.size()); + + auto max_length_tensor = Ort::Value::CreateTensor( + info, max_length.data(), max_length.size(), parameter_shape.data(), parameter_shape.size()); + + auto min_length_tensor = Ort::Value::CreateTensor( + info, min_length.data(), min_length.size(), parameter_shape.data(), parameter_shape.size()); + + auto num_beams_tensor = Ort::Value::CreateTensor( + info, num_beams.data(), num_beams.size(), parameter_shape.data(), parameter_shape.size()); + auto num_return_sequences_tensor = Ort::Value::CreateTensor( + info, num_return_sequences.data(), num_return_sequences.size(), parameter_shape.data(), parameter_shape.size()); + + auto length_penalty_tensor = Ort::Value::CreateTensor( + info, length_penalty.data(), length_penalty.size(), parameter_shape.data(), parameter_shape.size()); + + auto repetition_penalty_tensor = Ort::Value::CreateTensor( + info, repetition_penalty.data(), repetition_penalty.size(), parameter_shape.data(), parameter_shape.size()); + + std::vector ort_inputs; + ort_inputs.push_back(std::move(input_ids_tensor)); + ort_inputs.push_back(std::move(max_length_tensor)); + ort_inputs.push_back(std::move(min_length_tensor)); + ort_inputs.push_back(std::move(num_beams_tensor)); + ort_inputs.push_back(std::move(num_return_sequences_tensor)); + ort_inputs.push_back(std::move(length_penalty_tensor)); + ort_inputs.push_back(std::move(repetition_penalty_tensor)); + const char* input_names[] = {"input_ids", "max_length", "min_length", "num_beams", "num_return_sequences", + "length_penalty", "repetition_penalty"}; + const char* const output_names[] = {"sequences"}; + + // The ONNX model is generated like the following: + // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 + // --output tiny_gpt2_beamsearch_fp16.onnx --use_gpu --max_length 20 + // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) Ort::Session session(env, model_path, session_options); -}*/ \ No newline at end of file + session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(), + output_names, 1); +} + +#endif \ No newline at end of file diff --git a/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt new file mode 100644 index 0000000000000..d9150cf6768f5 --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt @@ -0,0 +1,56 @@ +GptAttention_1_add,18432,0,0,0 +GptAttention_0_matmul,4096,0,0,0 +GptAttention_2_matmul,22528,0,0,0 +FullyConnect_MatMul_5,90112,0,0,0 +GptAttention_3,30720,0,36864,165888 +LayerNorm_4,18432,0,0,0 +GptAttention_1_matmul,22528,0,0,0 +FullyConnect_Add_5,18432,0,0,0 +GptAttention_2_add,18432,0,0,0 +FullyConnect_Add_3,18432,0,0,0 +GptAttention_3_add,18432,0,0,0 +Add_689,18432,0,0,0 +Add_886,18432,0,0,0 +LayerNorm_7,18432,0,0,0 +FullyConnect_MatMul_6,34816,0,0,0 
+GptAttention_4,30720,0,36864,165888 +GptAttention_4_add,18432,0,0,0 +SkipLayerNormalization,18432,0,0,0 +LayerNorm_1,18432,0,0,0 +GptAttention_3_matmul,22528,0,0,0 +LayerNorm_8,18432,0,0,0 +FullyConnect_MatMul_8,34816,0,0,0 +FullyConnect_Add_7,18432,0,0,0 +LayerNorm_9,18432,0,0,0 +FastGelu_AddBias_3,73728,0,0,0 +FullyConnect_Add_1,18432,0,0,0 +GptAttention_4_matmul,22528,0,0,0 +GptAttention_0,13248,0,55296,165888 +FullyConnect_MatMul_2,34816,0,0,0 +FullyConnect_MatMul_9,90112,0,0,0 +MatMul_1165,146432,0,576000,0 +GptAttention_2,30720,0,36864,165888 +LayerNorm_6,18432,0,0,0 +BeamSearch_gpt2,24,0,256,1823244 +FastGelu_AddBias_4,73728,0,0,0 +Add_951,18432,0,0,0 +GptAttention_1,30720,0,36864,165888 +LayerNorm_3,18432,0,0,0 +Add_295,18432,0,0,0 +Add_1083,18432,0,0,0 +EmbedLayerNormalization_0,194944,0,37120,0 +GptAttention_0_add,18432,0,0,0 +FullyConnect_MatMul_7,90112,0,0,0 +FastGelu_AddBias_1,73728,0,0,0 +LayerNorm_2,18432,0,0,0 +FastGelu_AddBias_2,73728,0,0,0 +Add_360,18432,0,0,0 +Add_754,18432,0,0,0 +FullyConnect_MatMul_3,90112,0,0,0 +FullyConnect_MatMul_4,34816,0,0,0 +Add_557,18432,0,0,0 +FullyConnect_MatMul_0,34816,0,73728,0 +FastGelu_AddBias_0,512,0,73728,0 +FullyConnect_MatMul_1,16384,0,0,0 +Add_492,18432,0,0,0 +LayerNorm_5,18432,0,0,0 From 08bfa77c63f903f3930deb6f1e2df062ff8118ed Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 6 Feb 2025 10:27:56 -0800 Subject: [PATCH 4/7] Address Dml build problem --- .../providers/dml/DmlExecutionProvider/src/ExecutionProvider.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 3002177db13f4..7f420f8850001 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -93,7 +93,8 @@ namespace Dml GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant* resource_accountant) const; + onnxruntime::IResourceAccountant* resource_accountant, + const onnxruntime::logging::Logger& logger) const; uint32_t GetSupportedDeviceDataTypeMask() const; From 6d57fa4e71c545f9c81374059b0d15055d1cd5a2 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 6 Feb 2025 14:24:10 -0800 Subject: [PATCH 5/7] Address comments --- include/onnxruntime/core/framework/op_kernel_context.h | 2 -- onnxruntime/core/framework/execution_frame.cc | 2 -- onnxruntime/core/framework/op_kernel.cc | 5 ----- onnxruntime/core/framework/op_kernel_context_internal.h | 8 -------- .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 2 +- onnxruntime/core/session/inference_session.cc | 2 +- onnxruntime/core/session/onnxruntime_c_api.cc | 1 - 7 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index e9a1490dedc34..3fd9ee0d8b292 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -204,8 +204,6 @@ class OpKernelContext { virtual OrtValue* GetOrCreateOutputMLValue(int index); - virtual int GetOrtValueIndexForInput(int input_index) const; - virtual int GetOrtValueIndexForOutput(int output_index) const; private: diff --git 
a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index bc13c30294875..c5046353ba528 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -23,8 +23,6 @@ #include "core/framework/bfc_arena.h" -#include "core/session/onnxruntime_session_options_config_keys.h" - using namespace onnxruntime::common; namespace onnxruntime { diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc index 1d05cb4e5e818..212ce9c5069ea 100644 --- a/onnxruntime/core/framework/op_kernel.cc +++ b/onnxruntime/core/framework/op_kernel.cc @@ -130,11 +130,6 @@ OrtValue* OpKernelContext::GetOrCreateOutputMLValue(int index) { return value; } -int OpKernelContext::GetOrtValueIndexForInput(int input_index) const { - int input_arg_index = GetInputArgIndex(input_index); - return execution_frame_->GetNodeIdxToMLValueIdx(input_arg_index); -} - int OpKernelContext::GetOrtValueIndexForOutput(int output_index) const { int output_arg_index = GetOutputArgIndex(output_index); return execution_frame_->GetNodeIdxToMLValueIdx(output_arg_index); diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index 64932dce50917..4c7ee10a07691 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -59,10 +59,6 @@ class OpKernelContextInternal : public OpKernelContext { return OpKernelContext::GetInputMLValue(index); } - const OrtValue* GetImplicitInputMLValue(int index) const override { - return OpKernelContext::GetImplicitInputMLValue(index); - } - OrtValue* GetOutputMLValue(int index) { return OpKernelContext::GetOutputMLValue(index); } @@ -82,10 +78,6 @@ class OpKernelContextInternal : public OpKernelContext { return implicit_input_values_; } - int GetOrtValueIndexForInput(int input_index) const override { - return OpKernelContext::GetOrtValueIndexForInput(input_index); - } - int GetOrtValueIndexForOutput(int output_index) const override { return OpKernelContext::GetOrtValueIndexForOutput(output_index); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index dd868ddd8307a..9d23b8b950272 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -878,7 +878,7 @@ namespace Dml ExecutionProviderImpl::GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, - const onnxruntime::logging::Logger& logger, onnxruntime::IResourceAccountant*) const { + onnxruntime::IResourceAccountant*, const onnxruntime::logging::Logger& logger) const { uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask(); // Each bit corresponds to each DML_TENSOR_DATA_TYPE. 
std::vector> result; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index fb0fcd55ffc63..71b1cad06f3f5 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2749,7 +2749,7 @@ Status InferenceSession::Run(const RunOptions& run_options, #endif #if !defined(ORT_MINIMAL_BUILD) - if (node_stats_recorder_.has_value() && retval.IsOK()) { + if (IsNodeStatsCollectionEnabled() && retval.IsOK()) { // Dump node stats if the run was successful node_stats_recorder_->DumpStats(session_state_->GetGraphViewer().ModelPath()); node_stats_recorder_->ResetPerRunNameDeduper(); diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 3761b4ca0ec41..ca6950af0227a 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -5,7 +5,6 @@ #include "core/session/allocator_adapters.h" #include "core/session/inference_session_utils.h" #include "core/session/IOBinding.h" -#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/framework/allocator.h" #include "core/framework/error_code_helper.h" #include "core/framework/execution_provider.h" From d7504060eb1df87b7b157f6e15b457b2a40775b1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 10 Feb 2025 12:46:06 -0800 Subject: [PATCH 6/7] Make threshold optional --- .../core/framework/resource_accountant.cc | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 786da13e69458..391f010c79f37 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -21,12 +21,16 @@ class SizeTAccountant : public IResourceAccountant { SizeTAccountant() = default; ~SizeTAccountant() = default; - explicit SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) + SizeTAccountant(size_t threshold, InlinedHashMap&& node_stats) : IResourceAccountant(threshold), node_stats_(std::move(node_stats)) {} + explicit SizeTAccountant(InlinedHashMap&& node_stats) + : IResourceAccountant(), node_stats_(std::move(node_stats)) {} + ResourceCount GetConsumedAmount() const noexcept override { return consumed_amount_; } + void AddConsumedAmount(const ResourceCount& amount) noexcept override { if (std::holds_alternative(amount)) { consumed_amount_ += std::get(amount); @@ -151,10 +155,11 @@ Status NodeStatsRecorder::CreateAccountants( kOrtSessionOptionsResourceCudaPartitioningSettings, ""); if (!resource_partitioning_settings.empty()) { - auto splits = utils::SplitString(resource_partitioning_settings, ",", false); + auto splits = utils::SplitString(resource_partitioning_settings, ",", true); if (splits.size() == 2) { - SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); - cuda_memory_limit *= 1024; // to bytes + if (splits[1].empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid resource partitioning settings"); + } InlinedHashMap loaded_stats; ORT_RETURN_IF_ERROR(LoadNodeAllocationStats(model_path, splits[1], loaded_stats)); @@ -162,9 +167,17 @@ Status NodeStatsRecorder::CreateAccountants( std::optional result; auto& map = result.emplace(); - map.insert_or_assign(kCudaExecutionProvider, - std::make_unique(cuda_memory_limit, - std::move(loaded_stats))); + if (!splits[0].empty()) { + SafeInt cuda_memory_limit = 
std::stoul(std::string{splits[0]}); + cuda_memory_limit *= 1024; // to bytes + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(cuda_memory_limit, + std::move(loaded_stats))); + } else { + map.insert_or_assign(kCudaExecutionProvider, + std::make_unique(std::move(loaded_stats))); + } + acc_map = std::move(result); } } From 128d2fbdff47d559e190048a502aa9ce6ad1c48e Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 11 Feb 2025 17:53:22 -0800 Subject: [PATCH 7/7] Address review comments --- .../core/framework/resource_accountant.h | 11 +- include/onnxruntime/core/graph/graph.h | 7 -- .../core/graph/indexed_sub_graph.h | 19 +-- .../onnxruntime_session_options_config_keys.h | 7 +- .../core/framework/graph_partitioner.cc | 6 +- .../core/framework/resource_accountant.cc | 45 ++++++- .../core/framework/sequential_executor.cc | 2 +- onnxruntime/core/graph/graph.cc | 36 ------ .../providers/cuda/cuda_execution_provider.cc | 2 +- .../shared_library/provider_interfaces.h | 1 - .../shared_library/provider_wrappedtypes.h | 3 - .../core/session/provider_bridge_ort.cc | 1 - .../tiny_gpt2_beamsearch_node_stats.txt | 113 +++++++++--------- 13 files changed, 120 insertions(+), 133 deletions(-) diff --git a/include/onnxruntime/core/framework/resource_accountant.h b/include/onnxruntime/core/framework/resource_accountant.h index 1f2e9ea5ccfb0..274750a505fbd 100644 --- a/include/onnxruntime/core/framework/resource_accountant.h +++ b/include/onnxruntime/core/framework/resource_accountant.h @@ -16,6 +16,11 @@ namespace onnxruntime { struct ConfigOptions; +#ifndef SHARED_PROVIDER +class Node; +#else +struct Node; +#endif // Common holder for potentially different resource accounting // for different EPs @@ -40,7 +45,7 @@ class IResourceAccountant { virtual ResourceCount GetConsumedAmount() const = 0; virtual void AddConsumedAmount(const ResourceCount& amount) = 0; virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0; - virtual ResourceCount ComputeResourceCount(const std::string& node_name) const = 0; + virtual ResourceCount ComputeResourceCount(const Node& node) const = 0; std::optional GetThreshold() const { return threshold_; @@ -52,6 +57,8 @@ class IResourceAccountant { bool IsStopIssued() const noexcept { return stop_assignment_; } + static std::string MakeUniqueNodeName(const Node& node); + private: bool stop_assignment_ = false; std::optional threshold_; @@ -101,7 +108,7 @@ class NodeStatsRecorder { void DumpStats(const std::filesystem::path& model_path) const; - static Status CreateAccountants( + [[nodiscard]] static Status CreateAccountants( const ConfigOptions& config_options, const std::filesystem::path& model_path, std::optional& acc_map); diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 1eaf2119f34fe..7798394b045dc 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -883,13 +883,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return ConstGraphNodes(nodes_, std::move(filter_func)); } - /** Compute node memory requirements, which is mostly initializers - and large attributes that are copied on device (special cases for some nodes) - - Returns no value if the node was not found. - */ - size_t ComputeNodeMemoryUsage(NodeIndex) const; - /** Gets the maximum NodeIndex value used in the Graph. WARNING: This actually returns the max index value used + 1. 
*/ diff --git a/include/onnxruntime/core/graph/indexed_sub_graph.h b/include/onnxruntime/core/graph/indexed_sub_graph.h index 959b183e272ea..e457d3dcad1f1 100644 --- a/include/onnxruntime/core/graph/indexed_sub_graph.h +++ b/include/onnxruntime/core/graph/indexed_sub_graph.h @@ -84,16 +84,14 @@ struct IndexedSubGraph { // if present and adds it to the consumed amount void AccountForNode(size_t cost_index) const { assert(cost_index < nodes_costs.size()); - if (nodes_costs[cost_index].has_value()) { - resource_accountant->AddConsumedAmount(*nodes_costs[cost_index]); - } + resource_accountant->AddConsumedAmount(nodes_costs[cost_index]); } // This computes and accounts for the resource cost for the node that just // been fused from other nodes, and the EP did not had a chance to compute the costs. - void ComputeAndAccountForNode(const std::string& node_name) const { + void ComputeAndAccountForNode(const Node& node) const { assert(resource_accountant != nullptr); - resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node_name)); + resource_accountant->AddConsumedAmount(resource_accountant->ComputeResourceCount(node)); } void SetAccountant(IResourceAccountant* res_accountant) { @@ -106,22 +104,13 @@ struct IndexedSubGraph { nodes_costs.emplace_back(cost); } - // Append an absent cost for the node that was already accounted for. - void AppendNodeEmptyCost() { - assert(resource_accountant != nullptr); - nodes_costs.emplace_back(); - } - private: // subgraph meta definition. std::unique_ptr meta_def_; // Optional resource accountant for this subgraph. IResourceAccountant* resource_accountant = nullptr; // Vector with resource costs for nodes above. Should have the same size - // Some nodes that were previously accounted for because they already been assigned to an EP - // for example during multiple calls to GetCapabiility() will not have resource count present. - // may not have a resource count present, we skip it. - InlinedVector> nodes_costs; + InlinedVector nodes_costs; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index a50b19e4a8a56..f97964f49b582 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -286,8 +286,11 @@ static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "session.collect_node_memory_stats_to_file"; /// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats" -/// "limit > 0": enables Capacity Aware Partitioning for Cuda EP. The EP will place nodes on device -/// "file name" : this file is expected to be found at the same folder with the model. The file contains +/// "limit > 0": enables Capacity Aware Partitioning for Cuda EP. `limit` is optional and when absent +/// the provider may attempt to figure out the memory available automatically. +/// The setting with no limit is expected to look like: ",file name for collected stats" +/// The EP will place nodes on the device until the limit is reached. +/// "file name": this file is expected to be found in the same folder as the model.
The file contains /// pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enforce (see above) static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings = "session.resource_cuda_partitioning_settings"; diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 08ddfd872ca78..1ec99f3dc8625 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -351,7 +351,7 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability, // that the fused node would use no more memory when the nodes we are fusing. // and potentially less than that, and therefore, no threshold check is needed here. // All threshold checks are done within the EP. - capability.ComputeAndAccountForNode(fused_node->Name()); + capability.ComputeAndAccountForNode(*fused_node); } result = fused_node; @@ -885,7 +885,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param Node& fused_node = graph.BeginFuseSubGraph(indexed_sub_graph, node_name); fused_node.SetExecutionProviderType(type); if (indexed_sub_graph.IsAccountingEnabled()) { - indexed_sub_graph.ComputeAndAccountForNode(fused_node.Name()); + indexed_sub_graph.ComputeAndAccountForNode(fused_node); } // create filtered graph viewer for this set of nodes @@ -932,7 +932,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, node); if (acc_enabled) { - compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node.Name()); + compilation_entry.capability.get().sub_graph->ComputeAndAccountForNode(node); } } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index 391f010c79f37..4d537219ec714 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -4,11 +4,15 @@ #include "core/framework/resource_accountant.h" #include "core/common/inlined_containers.h" +#include "core/common/narrow.h" +#include "core/common/parse_string.h" #include "core/common/safeint.h" #include "core/common/string_utils.h" #include "core/framework/config_options.h" +#include "core/framework/murmurhash3.h" #include "core/graph/constants.h" +#include "core/graph/graph.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include @@ -42,7 +46,8 @@ class SizeTAccountant : public IResourceAccountant { } } - ResourceCount ComputeResourceCount(const std::string& node_name) const override { + ResourceCount ComputeResourceCount(const Node& node) const override { + const auto node_name = MakeUniqueNodeName(node); auto hit = node_stats_.find(node_name); if (hit != node_stats_.end()) { const auto& stats = hit->second; @@ -88,11 +93,13 @@ void NodeStatsRecorder::ReportNodeStats(const std::string& node_name, const Node auto result = impl_->node_stats.emplace(node_name, stats); if (!result.second) { // Node already exists, update the stats + // This may happen when the user collects stats from multiple Runs() result.first->second.UpdateIfGreater(stats); } } void NodeStatsRecorder::DumpStats(std::ostream& os) const { + os << 
"#name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations\n"; for (const auto& [name, stats] : impl_->node_stats) { os << name << "," << stats.input_sizes << "," << stats.initializers_sizes << "," << stats.total_dynamic_sizes << "," @@ -128,6 +135,8 @@ static Status LoadNodeAllocationStats( std::string line; // Read and load a CSV file line by line while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + auto splits = utils::SplitString(line, ",", true); ORT_ENFORCE(splits.size() == 5, "Invalid line in the file ", file_path, ": ", line); if (splits[0].empty()) { @@ -138,8 +147,8 @@ static Status LoadNodeAllocationStats( size_t initializers_sizes = SafeInt(std::stoull(std::string{splits[2]})); size_t total_dynamic_sizes = SafeInt(std::stoull(std::string{splits[3]})); size_t total_temp_allocations = SafeInt(std::stoull(std::string{splits[4]})); - node_stats.insert_or_assign(node_name, {input_sizes, initializers_sizes, - total_dynamic_sizes, total_temp_allocations}); + node_stats.insert_or_assign(std::move(node_name), {input_sizes, initializers_sizes, + total_dynamic_sizes, total_temp_allocations}); } result.swap(node_stats); @@ -168,8 +177,9 @@ Status NodeStatsRecorder::CreateAccountants( auto& map = result.emplace(); if (!splits[0].empty()) { - SafeInt cuda_memory_limit = std::stoul(std::string{splits[0]}); - cuda_memory_limit *= 1024; // to bytes + size_t cuda_memory_limit = 0; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(std::string{splits[0]}, cuda_memory_limit)); + cuda_memory_limit = SafeInt(cuda_memory_limit) * 1024; // to bytes map.insert_or_assign(kCudaExecutionProvider, std::make_unique(cuda_memory_limit, std::move(loaded_stats))); @@ -179,10 +189,35 @@ Status NodeStatsRecorder::CreateAccountants( } acc_map = std::move(result); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid format for: ", + kOrtSessionOptionsResourceCudaPartitioningSettings, + " : expecting comma separated fields"); } } return Status::OK(); } +std::string IResourceAccountant::MakeUniqueNodeName(const Node& node) { + std::string result; + + uint32_t hash[4] = {0, 0, 0, 0}; + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), narrow(str.size()), hash[0], &hash); + }; + + const auto& node_name = (node.Name().empty()) ? node.OpType() : node.Name(); + + for (const auto& def : node.InputDefs()) { + hash_str(def->Name()); + } + + HashValue node_hash = hash[0] | (uint64_t(hash[1]) << 32); + result.reserve(node_name.size() + 1 + 16); + result.append(node_name).append("_").append(std::to_string(node_hash)); + + return result; +} + } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 35ae33328837c..26a57ec3ea02f 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -545,7 +545,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, } // Record node allocation stats - const auto& name = (node.Name().empty()) ? 
node.OpType() : node.Name(); + const std::string name = IResourceAccountant::MakeUniqueNodeName(node); node_stats_recorder->ReportNodeStats(name, node_stats); } #endif diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index d0a1280ce9895..e4915616b7b7c 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -5537,42 +5537,6 @@ Graph::Graph(const Model& owning_model, is_loaded_from_model_file_(true) { // true as the Graph isn't manually constructed from scratch } -size_t Graph::ComputeNodeMemoryUsage(NodeIndex node_idx) const { - /// XXX: In some cases some kernels can copy its attributes to a device - // those are edge cases which we currently do not account for. - const Node* node = GetNode(node_idx); - if (node != nullptr) { - SafeInt result = 0; - for (const auto* input : node->InputDefs()) { - if (input->Exists()) { - // Let's see if this is an initializer - constexpr const bool check_outer_scope_true = true; - const ONNX_NAMESPACE::TensorProto* initializer = - GetConstantInitializer(input->Name(), check_outer_scope_true); - if (initializer != nullptr) { - size_t out; - if (utils::GetSizeInBytesFromTensorProto<0>(*initializer, &out).IsOK()) { - result += out; - } - } else { - const auto* proto = input->TypeAsProto(); - if (proto != nullptr && utils::HasTensorType(*proto)) { - const auto& tensor_type = proto->tensor_type(); - if (utils::HasElemType(tensor_type) && utils::HasShape(tensor_type)) { - size_t size; - if (utils::GetSizeInBytesFromTensorTypeProto<0>(tensor_type, &size).IsOK()) { - result += size; - } - } - } - } - } - } - return static_cast(result); - } - return 0; -} - common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, const OrtFormatLoadOptions& load_options) { // We deserialize the graph from ORT format in the following order: diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 86909fd272be3..b675c08e5f804 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -2771,7 +2771,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, result.push_back(ComputeCapability::Create(std::move(sub_graph))); } else { auto* node = graph.GetNode(node_index); - auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(node->Name())); + auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(*node)); const auto would_be_consumed = resource_count + consumed_memory; LOGS(logger, INFO) << "CUDA_EP Node: " << node_index << " Memory usage : " << resource_count << " would be consumed " << static_cast(would_be_consumed) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 059a722958118..0dd771f522336 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -670,7 +670,6 @@ struct ProviderHost { virtual IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) = 0; virtual void IndexedSubGraph__SetAccountant(IndexedSubGraph* p, IResourceAccountant*) = 0; virtual void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& count) = 0; - virtual void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) = 0; // KernelDef virtual void 
KernelDef__operator_delete(KernelDef* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 6441547cab914..a502ce9c66f69 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -595,9 +595,6 @@ struct IndexedSubGraph final { void AppendNodeCost(const ResourceCount& resource_count) { g_host->IndexedSubGraph__AppendNodeCost(this, resource_count); } - void AppendNodeEmptyCost() { - g_host->IndexedSubGraph__AppendNodeEmptyCost(this); - } IndexedSubGraph() = delete; IndexedSubGraph(const IndexedSubGraph&) = delete; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 83ba757886ff5..a1cd9af3b5091 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -844,7 +844,6 @@ struct ProviderHostImpl : ProviderHost { void IndexedSubGraph__AppendNodeCost(IndexedSubGraph* p, const ResourceCount& resource_count) override { p->AppendNodeCost(resource_count); } - void IndexedSubGraph__AppendNodeEmptyCost(IndexedSubGraph* p) override { p->AppendNodeEmptyCost(); } // KernelDef (wrapped) void KernelDef__operator_delete(KernelDef* p) override { delete p; } diff --git a/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt index d9150cf6768f5..df1e0c48825a0 100644 --- a/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt +++ b/onnxruntime/test/testdata/transformers/tiny_gpt2_beamsearch_node_stats.txt @@ -1,56 +1,57 @@ -GptAttention_1_add,18432,0,0,0 -GptAttention_0_matmul,4096,0,0,0 -GptAttention_2_matmul,22528,0,0,0 -FullyConnect_MatMul_5,90112,0,0,0 -GptAttention_3,30720,0,36864,165888 -LayerNorm_4,18432,0,0,0 -GptAttention_1_matmul,22528,0,0,0 -FullyConnect_Add_5,18432,0,0,0 -GptAttention_2_add,18432,0,0,0 -FullyConnect_Add_3,18432,0,0,0 -GptAttention_3_add,18432,0,0,0 -Add_689,18432,0,0,0 -Add_886,18432,0,0,0 -LayerNorm_7,18432,0,0,0 -FullyConnect_MatMul_6,34816,0,0,0 -GptAttention_4,30720,0,36864,165888 -GptAttention_4_add,18432,0,0,0 -SkipLayerNormalization,18432,0,0,0 -LayerNorm_1,18432,0,0,0 -GptAttention_3_matmul,22528,0,0,0 -LayerNorm_8,18432,0,0,0 -FullyConnect_MatMul_8,34816,0,0,0 -FullyConnect_Add_7,18432,0,0,0 -LayerNorm_9,18432,0,0,0 -FastGelu_AddBias_3,73728,0,0,0 -FullyConnect_Add_1,18432,0,0,0 -GptAttention_4_matmul,22528,0,0,0 -GptAttention_0,13248,0,55296,165888 -FullyConnect_MatMul_2,34816,0,0,0 -FullyConnect_MatMul_9,90112,0,0,0 -MatMul_1165,146432,0,576000,0 -GptAttention_2,30720,0,36864,165888 -LayerNorm_6,18432,0,0,0 -BeamSearch_gpt2,24,0,256,1823244 -FastGelu_AddBias_4,73728,0,0,0 -Add_951,18432,0,0,0 -GptAttention_1,30720,0,36864,165888 -LayerNorm_3,18432,0,0,0 -Add_295,18432,0,0,0 -Add_1083,18432,0,0,0 -EmbedLayerNormalization_0,194944,0,37120,0 -GptAttention_0_add,18432,0,0,0 -FullyConnect_MatMul_7,90112,0,0,0 -FastGelu_AddBias_1,73728,0,0,0 -LayerNorm_2,18432,0,0,0 -FastGelu_AddBias_2,73728,0,0,0 -Add_360,18432,0,0,0 -Add_754,18432,0,0,0 -FullyConnect_MatMul_3,90112,0,0,0 -FullyConnect_MatMul_4,34816,0,0,0 -Add_557,18432,0,0,0 -FullyConnect_MatMul_0,34816,0,73728,0 -FastGelu_AddBias_0,512,0,73728,0 -FullyConnect_MatMul_1,16384,0,0,0 -Add_492,18432,0,0,0 -LayerNorm_5,18432,0,0,0 
+#name,input_sizes,initializers_sizes,total_dynamic_sizes,total_temp_allocations +GptAttention_1_matmul_3390928670334833856,22528,0,0,0 +LayerNorm_8_16340230589392852003,18432,0,0,0 +LayerNorm_6_9539917679182944001,18432,0,0,0 +LayerNorm_4_3998281518089755446,18432,0,0,0 +Add_295_12458934867448263403,18432,0,0,0 +GptAttention_1_5945223373512700064,30720,0,36864,165888 +FastGelu_AddBias_0_8293496556664011978,512,0,73728,0 +FullyConnect_MatMul_7_9121431797220490115,90112,0,0,0 +GptAttention_0_7799922821510396356,13248,0,55296,165888 +GptAttention_2_13772881973491265914,30720,0,36864,165888 +LayerNorm_1_10060807585253518719,18432,0,0,0 +LayerNorm_5_12297409543002935527,18432,0,0,0 +Add_492_15870509848159592443,18432,0,0,0 +FullyConnect_MatMul_5_12754193998971094488,90112,0,0,0 +LayerNorm_7_11450735811828114024,18432,0,0,0 +FullyConnect_Add_5_4749853671277160818,18432,0,0,0 +GptAttention_3_add_5419272690383812111,18432,0,0,0 +FullyConnect_MatMul_8_14154070846330210236,34816,0,0,0 +FullyConnect_MatMul_9_9215108924175066058,90112,0,0,0 +GptAttention_2_add_7251589488810842639,18432,0,0,0 +FullyConnect_Add_7_2612800351421913827,18432,0,0,0 +GptAttention_1_add_3894862726029568115,18432,0,0,0 +FullyConnect_MatMul_2_4814122527985171273,34816,0,0,0 +LayerNorm_3_3589946186712403351,18432,0,0,0 +GptAttention_3_8921810316598002134,30720,0,36864,165888 +LayerNorm_9_9113032450990548295,18432,0,0,0 +Add_886_7198133075029541336,18432,0,0,0 +Add_689_16588197583517413999,18432,0,0,0 +GptAttention_3_matmul_14740826065423798917,22528,0,0,0 +FastGelu_AddBias_4_17289691003819959460,73728,0,0,0 +Add_754_3697562882104452642,18432,0,0,0 +FullyConnect_MatMul_4_3508821612885617837,34816,0,0,0 +FastGelu_AddBias_1_17699324882619485158,73728,0,0,0 +FullyConnect_MatMul_3_17781936527365066348,90112,0,0,0 +GptAttention_2_matmul_7328860221231123895,22528,0,0,0 +SkipLayerNormalization_6957325406340516852,18432,0,0,0 +BeamSearch_gpt2_3957842931497654942,24,0,256,1823244 +GptAttention_4_matmul_90143216136586800,22528,0,0,0 +FullyConnect_MatMul_6_11858231833228352542,34816,0,0,0 +GptAttention_0_matmul_16767551145055538728,4096,0,0,0 +FullyConnect_Add_3_17196504264676187520,18432,0,0,0 +GptAttention_0_add_9807374014361508564,18432,0,0,0 +FullyConnect_MatMul_1_17322107022932292417,16384,0,0,0 +GptAttention_4_14364416985904266109,30720,0,36864,165888 +FullyConnect_MatMul_0_3724322618026197588,34816,0,73728,0 +Add_557_10312911821132522354,18432,0,0,0 +Add_360_12940403527838064497,18432,0,0,0 +FastGelu_AddBias_3_13817144420946871274,73728,0,0,0 +EmbedLayerNormalization_0_7260843146120485633,194944,0,37120,0 +FastGelu_AddBias_2_7906787140370676932,73728,0,0,0 +MatMul_1165_4290064500958888402,146432,0,576000,0 +GptAttention_4_add_15131081400494402711,18432,0,0,0 +Add_1083_4580993573699232732,18432,0,0,0 +Add_951_2303460452509012571,18432,0,0,0 +LayerNorm_2_2575702077895349965,18432,0,0,0 +FullyConnect_Add_1_7648227151832366839,18432,0,0,0
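For reference, the two session options introduced in this series are intended to be used as a pair: one run with kOrtSessionOptionsCollectNodeMemoryStatsToFile dumps per-node memory stats, and a later session feeds that file (optionally prefixed with a per-device budget in KB) through kOrtSessionOptionsResourceCudaPartitioningSettings so the CUDA EP stops taking nodes once the budget is exhausted. The sketch below is illustrative only, not part of the patches: it assumes a CUDA-enabled build, the model path, stats file name, and 5000 KB budget are example values, and the config keys are spelled out as the string literals that back the constants defined in onnxruntime_session_options_config_keys.h.

// Illustrative sketch only. Uses the public C++ API; paths and the budget are placeholders.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING);

  // Pass 1: run once on CUDA and record per-node memory stats.
  // The stats file is expected to end up next to the model for the second pass.
  {
    Ort::SessionOptions so;
    so.AddConfigEntry("session.collect_node_memory_stats_to_file", "model_node_stats.txt");
    OrtCUDAProviderOptions cuda_options{};
    so.AppendExecutionProvider_CUDA(cuda_options);
    Ort::Session session(env, ORT_TSTR("model.onnx"), so);
    // ... session.Run(...) with representative inputs; stats are dumped after a successful run ...
  }

  // Pass 2: feed the recorded stats back with a 5000 KB device budget.
  // Nodes whose estimated consumption would exceed the budget are left for the CPU EP.
  {
    Ort::SessionOptions so;
    so.AddConfigEntry("session.resource_cuda_partitioning_settings", "5000,model_node_stats.txt");
    OrtCUDAProviderOptions cuda_options{};
    so.AppendExecutionProvider_CUDA(cuda_options);
    Ort::Session session(env, ORT_TSTR("model.onnx"), so);
    // ... session.Run(...) as usual ...
  }
  return 0;
}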