Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ static const char* const kOrtSessionOptionsDisableQuantQDQ = "session.disable_qu
static const char* const kOrtSessionOptionsDisableQDQConstantFolding =
"session.disable_qdq_constant_folding";

// Constant folding produces new initializers (the folded outputs) that get added to the graph,
// which can increase the optimized model's memory footprint relative to the original model.
// This option caps the size, in bytes, of any single tensor produced by constant folding:
// a node whose folded output would exceed the cap is left unfolded.
// The value must be a non-negative integer written as a decimal string.
// The default, "0", disables the check entirely (outputs of any size are accepted).
static const char* const kOrtSessionOptionsConfigConstantFoldingNodeWeightSizeThreshold =
    "session.constant_folding_node_weight_size_threshold";

// It controls whether to enable Double QDQ remover and Identical Children Consolidation
// "0": not to disable. ORT does remove the middle 2 Nodes from a Q->(QD->Q)->QD pairs
// "1": disable. ORT doesn't remove the middle 2 Nodes from a Q->(QD->Q)->QD pairs
Expand Down
32 changes: 32 additions & 0 deletions onnxruntime/core/optimizer/constant_folding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include "core/optimizer/utils.h"
#include "core/framework/op_kernel.h"
#include "core/framework/tensorprotoutils.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/common/parse_string.h"

using namespace onnxruntime::common;

Expand Down Expand Up @@ -145,6 +147,18 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
GraphViewer graph_viewer(graph);
auto& order = graph_viewer.GetNodesInTopologicalOrder();

// Read the optional size threshold for constant folding. A value of 0 (the default) means no limit.
size_t output_size_threshold = 0;
{
const std::string threshold_str = config_options_.GetConfigOrDefault(
kOrtSessionOptionsConfigConstantFoldingNodeWeightSizeThreshold, "0");
if (!TryParseStringWithClassicLocale(threshold_str, output_size_threshold)) {
LOGS(logger, WARNING) << "Failed to parse constant folding size threshold from config value '"
<< threshold_str << "'. Using no threshold.";
output_size_threshold = 0;
}
}

#if !defined(DISABLE_SPARSE_TENSORS)
std::function<bool(const std::string&)> is_sparse_initializer_check = [&graph](const std::string& name) -> bool {
return graph.IsSparseInitializer(name);
Expand Down Expand Up @@ -336,6 +350,24 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
}
}

// If a size threshold was configured, check whether any output tensor exceeds it.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot you need to move the test before the computation happens and use the estimated size before computation

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 38c9da7. The check now runs before the computation: for each output NodeArg, the elem type and all concrete dim values are read from the TypeAsProto, and estimated_size = num_elements * elem_size is compared against the threshold before OptimizerExecutionFrame is created or the kernel is run. If any dim is symbolic/unknown the pre-check is skipped for that output (safe fallback).

// Skipping large outputs prevents the optimized model from having a much larger
// memory footprint than the original model.
if (converted_to_constant && output_size_threshold > 0) {
for (const OrtValue& ort_value : fetches) {
if (ort_value.IsTensor()) {
const size_t tensor_size = ort_value.Get<Tensor>().SizeInBytes();
if (tensor_size > output_size_threshold) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot if inputs are deleted, you need to take that into account and only consider the increase

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in dea8bad. The threshold check now computes the net memory increase: total_output_size - freed_input_size, where freed_input_size sums the sizes of input initializers that are exclusively consumed by this node (consumer count == 1). If the net increase ≤ threshold the node is folded; if > threshold it is skipped. For the Unsqueeze model, inputs are exclusively consumed (same size as outputs), so net = 0 and any non-zero threshold still allows folding. The updated test reflects this: Case 2 (1-byte threshold) now asserts 0 Unsqueeze remaining. Cases 3a/3b add a Tile graph with a genuine 788-byte net increase and verify the threshold blocks (100 bytes) or allows (1000 bytes) folding.

LOGS(logger, INFO) << "Skipping constant folding for " << node->OpType()
<< " node '" << node->Name() << "': output size " << tensor_size
<< " bytes exceeds the threshold of " << output_size_threshold << " bytes.";
converted_to_constant = false;
break;
}
}
}
}

if (converted_to_constant) {
for (size_t fetch_idx = 0; fetch_idx < fetches.size(); ++fetch_idx) {
OrtValue& ort_value = fetches[fetch_idx];
Expand Down
44 changes: 44 additions & 0 deletions onnxruntime/test/optimizer/graph_transform_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,50 @@ TEST_F(GraphTransformationTests, ConstantFolding) {
ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
}

// Verify that the constant folding size-threshold session config option is honored.
// With the default ("0", i.e. no threshold) every Unsqueeze node is folded away;
// with a 1-byte threshold the folded outputs are too large, so folding is skipped
// and both Unsqueeze nodes survive.
TEST_F(GraphTransformationTests, ConstantFoldingWithSizeThreshold) {
  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";

  // Case 1: default config (no threshold) — both Unsqueeze nodes should be folded.
  {
    std::shared_ptr<Model> loaded_model;
    ASSERT_STATUS_OK(Model::Load(model_uri, loaded_model, nullptr, *logger_));
    Graph& main_graph = loaded_model->MainGraph();
    ASSERT_EQ(CountOpsInGraph(main_graph)["Unsqueeze"], 2);

    auto cpu_ep = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
    onnxruntime::GraphTransformerManager transformer_mgr{5};
    const ConfigOptions default_config;
    ASSERT_STATUS_OK(transformer_mgr.Register(
        std::make_unique<ConstantFolding>(*cpu_ep, false /*skip_dequantize_linear*/, default_config),
        TransformerLevel::Level1));
    ASSERT_STATUS_OK(transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level1, *logger_));
    ASSERT_EQ(CountOpsInGraph(main_graph)["Unsqueeze"], 0);
  }

  // Case 2: 1-byte threshold — every folded output exceeds it, so no node is folded.
  {
    std::shared_ptr<Model> loaded_model;
    ASSERT_STATUS_OK(Model::Load(model_uri, loaded_model, nullptr, *logger_));
    Graph& main_graph = loaded_model->MainGraph();
    ASSERT_EQ(CountOpsInGraph(main_graph)["Unsqueeze"], 2);

    auto cpu_ep = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
    onnxruntime::GraphTransformerManager transformer_mgr{5};
    ConfigOptions limited_config;
    ASSERT_STATUS_OK(limited_config.AddConfigEntry(
        kOrtSessionOptionsConfigConstantFoldingNodeWeightSizeThreshold, "1"));
    ASSERT_STATUS_OK(transformer_mgr.Register(
        std::make_unique<ConstantFolding>(*cpu_ep, false /*skip_dequantize_linear*/, limited_config),
        TransformerLevel::Level1));
    ASSERT_STATUS_OK(transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level1, *logger_));
    // Outputs are larger than 1 byte, so the transformer must leave the graph unchanged.
    ASSERT_EQ(CountOpsInGraph(main_graph)["Unsqueeze"], 2);
  }
}

TEST_F(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
std::shared_ptr<Model> model;
Expand Down