Skip to content

Commit 2a115da

Browse files
Shixiaowei02, DreamGenX, Ace-RR, bprus, janpetrov
authored
Update TensorRT-LLM (NVIDIA#1793)
Co-authored-by: DreamGenX <[email protected]>
Co-authored-by: Ace-RR <[email protected]>
Co-authored-by: bprus <[email protected]>
Co-authored-by: janpetrov <[email protected]>
1 parent db4edea commit 2a115da

File tree

318 files changed

+8621
-4763
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

318 files changed

+8621
-4763
lines changed

benchmarks/cpp/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \
232232
--output_dir ${LORA_ENGINE} \
233233
--max_batch_size ${MAX_BATCH} \
234234
--max_input_len $MAX_LEN \
235-
--max_output_len $MAX_LEN \
235+
--max_seq_len $((2*${MAX_LEN})) \
236236
--gemm_plugin float16 \
237237
--lora_plugin float16 \
238238
--use_paged_context_fmha enable \

benchmarks/cpp/bertBenchmark.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "tensorrt_llm/common/memoryUtils.h"
1818
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
1919
#include "tensorrt_llm/runtime/iTensor.h"
20+
#include "tensorrt_llm/runtime/rawEngine.h"
2021
#include "tensorrt_llm/runtime/tllmLogger.h"
2122
#include "tensorrt_llm/runtime/tllmRuntime.h"
2223
#include "tensorrt_llm/runtime/worldConfig.h"
@@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
7879
{
7980
auto const worldConfig = WorldConfig::mpi();
8081
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
81-
auto engineBlob = loadEngine(enginePath.string());
8282

8383
for (float gpuWeightsPercent : gpuWeightsPercents)
8484
{
85-
auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);
85+
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
8686
rt->addContext(0);
8787
for (auto inLen : inLens)
8888
{

benchmarks/cpp/gptManagerBenchmark.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ struct BenchmarkParams
150150
bool streaming{false};
151151
bool enableExpDelays{false};
152152
std::optional<float> requestRate{std::nullopt};
153+
std::optional<SizeType32> maxBatchSize{std::nullopt};
153154
int randomSeed = 430;
154155
std::optional<int> maxAttentionWindow{std::nullopt};
155156

@@ -785,6 +786,10 @@ class ExecutorServer
785786
executorConfig.setPeftCacheConfig(peftCacheConfig);
786787
executorConfig.setBatchingType(
787788
modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
789+
if (benchmarkParams.maxBatchSize)
790+
{
791+
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
792+
}
788793

789794
mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
790795

@@ -1339,6 +1344,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
13391344
optionalParams.kvCacheConfig.onboardBlocks = benchmarkParams.kvOnboardBlocks;
13401345
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
13411346
optionalParams.maxBeamWidth = beamWidth;
1347+
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
13421348
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
13431349

13441350
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
@@ -1628,6 +1634,7 @@ int main(int argc, char* argv[])
16281634
options.add_options()("request_rate",
16291635
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
16301636
cxxopts::value<float>());
1637+
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
16311638
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
16321639
cxxopts::value<bool>()->default_value("false"));
16331640
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
@@ -1777,6 +1784,12 @@ int main(int argc, char* argv[])
17771784
benchmarkParams.requestRate = result["request_rate"].as<float>();
17781785
}
17791786

1787+
// Argument: max batch size
1788+
if (result.count("max_batch_size"))
1789+
{
1790+
benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
1791+
}
1792+
17801793
benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();
17811794

17821795
// Argument: Enable batch stats output

0 commit comments

Comments
 (0)