Skip to content

Commit 2a115da

Browse files
Shixiaowei02, DreamGenX, Ace-RR, bprus, janpetrov
authored
Update TensorRT-LLM (NVIDIA#1793)
Co-authored-by: DreamGenX <[email protected]>
Co-authored-by: Ace-RR <[email protected]>
Co-authored-by: bprus <[email protected]>
Co-authored-by: janpetrov <[email protected]>
1 parent db4edea commit 2a115da

File tree

318 files changed

+8621
-4763
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

318 files changed

+8621
-4763
lines changed

benchmarks/cpp/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \
232232
--output_dir ${LORA_ENGINE} \
233233
--max_batch_size ${MAX_BATCH} \
234234
--max_input_len $MAX_LEN \
235-
--max_output_len $MAX_LEN \
235+
--max_seq_len $((2*${MAX_LEN})) \
236236
--gemm_plugin float16 \
237237
--lora_plugin float16 \
238238
--use_paged_context_fmha enable \

benchmarks/cpp/bertBenchmark.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "tensorrt_llm/common/memoryUtils.h"
1818
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
1919
#include "tensorrt_llm/runtime/iTensor.h"
20+
#include "tensorrt_llm/runtime/rawEngine.h"
2021
#include "tensorrt_llm/runtime/tllmLogger.h"
2122
#include "tensorrt_llm/runtime/tllmRuntime.h"
2223
#include "tensorrt_llm/runtime/worldConfig.h"
@@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
7879
{
7980
auto const worldConfig = WorldConfig::mpi();
8081
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
81-
auto engineBlob = loadEngine(enginePath.string());
8282

8383
for (float gpuWeightsPercent : gpuWeightsPercents)
8484
{
85-
auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);
85+
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
8686
rt->addContext(0);
8787
for (auto inLen : inLens)
8888
{

benchmarks/cpp/gptManagerBenchmark.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ struct BenchmarkParams
150150
bool streaming{false};
151151
bool enableExpDelays{false};
152152
std::optional<float> requestRate{std::nullopt};
153+
std::optional<SizeType32> maxBatchSize{std::nullopt};
153154
int randomSeed = 430;
154155
std::optional<int> maxAttentionWindow{std::nullopt};
155156

@@ -785,6 +786,10 @@ class ExecutorServer
785786
executorConfig.setPeftCacheConfig(peftCacheConfig);
786787
executorConfig.setBatchingType(
787788
modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
789+
if (benchmarkParams.maxBatchSize)
790+
{
791+
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
792+
}
788793

789794
mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
790795

@@ -1339,6 +1344,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
13391344
optionalParams.kvCacheConfig.onboardBlocks = benchmarkParams.kvOnboardBlocks;
13401345
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
13411346
optionalParams.maxBeamWidth = beamWidth;
1347+
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
13421348
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
13431349

13441350
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
@@ -1628,6 +1634,7 @@ int main(int argc, char* argv[])
16281634
options.add_options()("request_rate",
16291635
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
16301636
cxxopts::value<float>());
1637+
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
16311638
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
16321639
cxxopts::value<bool>()->default_value("false"));
16331640
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
@@ -1777,6 +1784,12 @@ int main(int argc, char* argv[])
17771784
benchmarkParams.requestRate = result["request_rate"].as<float>();
17781785
}
17791786

1787+
// Argument: max batch size
1788+
if (result.count("max_batch_size"))
1789+
{
1790+
benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
1791+
}
1792+
17801793
benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();
17811794

17821795
// Argument: Enable batch stats output

0 commit comments

Comments
 (0)