Skip to content

Commit 9691e12

Browse files
kaiyux and mfuntowicz authored
Update TensorRT-LLM (NVIDIA#1835)
* Update TensorRT-LLM --------- Co-authored-by: Morgan Funtowicz <[email protected]>
1 parent 2a115da commit 9691e12

File tree

94 files changed

+1129
-988
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+1129
-988
lines changed

benchmarks/cpp/gptManagerBenchmark.cpp

+44
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ struct BenchmarkParams
165165

166166
// Weights offloading
167167
float gpuWeightsPercent{1.0};
168+
169+
// Decoding params
170+
std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
168171
};
169172

170173
class InferenceRequestsSyncSend
@@ -791,6 +794,10 @@ class ExecutorServer
791794
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
792795
}
793796

797+
executorConfig.setDecodingConfig(texec::DecodingConfig(
798+
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
799+
std::nullopt, benchmarkParams.medusaChoices));
800+
794801
mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
795802

796803
if (logIterationData)
@@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
13461353
optionalParams.maxBeamWidth = beamWidth;
13471354
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
13481355
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
1356+
optionalParams.decodingConfig = texec::DecodingConfig(
1357+
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
1358+
std::nullopt, benchmarkParams.medusaChoices);
13491359

13501360
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
13511361
SizeType32 deviceCount{0};
@@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
16001610
}
16011611
}
16021612

1613+
/// @brief Parse a textual list of integer lists, e.g. "[[0], [0, 1], [0, 0, 1]]".
///
/// Every span that starts at a '[' and ends at the nearest following ']' yields one
/// row in the result; the row holds each run of decimal digits found inside that
/// span, in order of appearance. Any other characters (commas, spaces, the extra
/// opening bracket of the outer list) are ignored. A span without digits yields an
/// empty row, and an input without a "[...]" span yields an empty result.
///
/// @param input Text to parse.
/// @return One vector of parsed values per bracketed span.
/// @throws std::out_of_range (from std::stoi) if a digit run does not fit in an int.
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
{
    std::vector<std::vector<SizeType32>> result;
    // Lazy match: each hit ends at the first ']' after its opening '['.
    std::regex const outer_regex(R"(\[(.*?)\])");
    std::regex const inner_regex(R"(\d+)");
    std::sregex_iterator const end;

    for (auto outer = std::sregex_iterator(input.begin(), input.end(), outer_regex); outer != end; ++outer)
    {
        // Keep the captured text alive in a named string: the inner iterator refers to its storage.
        std::string const inner_str = outer->str(1);

        // Build the row in place with the element type of the return value
        // (the row used to be a std::vector<int> that was copied into the result).
        result.emplace_back();
        auto& row = result.back();
        for (auto inner = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex); inner != end;
             ++inner)
        {
            row.push_back(static_cast<SizeType32>(std::stoi(inner->str())));
        }
    }
    return result;
}
1638+
16031639
} // namespace
16041640

16051641
int main(int argc, char* argv[])
@@ -1692,6 +1728,8 @@ int main(int argc, char* argv[])
16921728
options.add_options()("gpu_weights_percent",
16931729
"Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).",
16941730
cxxopts::value<float>()->default_value("1.0"));
1731+
options.add_options()(
1732+
"medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>());
16951733

16961734
auto result = options.parse(argc, argv);
16971735

@@ -1823,6 +1861,12 @@ int main(int argc, char* argv[])
18231861
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
18241862
benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>();
18251863

1864+
// Argument: Medusa choices for the Medusa speculative decoding.
1865+
if (result.count("medusa_choices"))
1866+
{
1867+
benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
1868+
}
1869+
18261870
std::optional<TokenIdType> padId;
18271871
// Argument: Padding token id
18281872
if (result.count("pad_id"))

benchmarks/python/build.py

+1
Original file line numberDiff line numberDiff line change
@@ -944,6 +944,7 @@ def build_gpt(args):
944944
network = builder.create_network()
945945
network.trt_network.name = engine_name
946946
network.plugin_config.to_legacy_setting()
947+
network.plugin_config.dtype = args.dtype
947948

948949
# Plugins
949950
if args.mode in ['plugin', 'plugin-ifb']:
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
3-
size 3931604
2+
oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d
3+
size 3931504
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
3-
size 3931604
2+
oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca
3+
size 3989220
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a
2-
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a
3-
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
1+
f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a
2+
dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
3+
265b039443334094026fbd8f396d52fe29c2d9d1 commit
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4
3-
size 3812158
2+
oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542
3+
size 3812960
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439
3-
size 3773896
2+
oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84
3+
size 3772832
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03
3-
size 22381766
2+
oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78
3+
size 22387798
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
3-
size 1396404
2+
oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20
3+
size 1397814
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
3-
size 1396404
2+
oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855
3+
size 1423172
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a
2-
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a
3-
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
1+
e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a
2+
f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a
3+
265b039443334094026fbd8f396d52fe29c2d9d1 commit
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218
3-
size 1448772
2+
oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed
3+
size 1450288
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50
3-
size 1375430
2+
oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02
3+
size 1377018
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532
3-
size 14031364
2+
oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86
3+
size 14039826

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:
424424
template <typename T, typename WeightType>
425425
bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const
426426
{
427-
bool config_is_sm90 = best_config_ && best_config_->is_sm90;
427+
TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");
428+
bool config_is_sm90 = best_config_->is_sm90;
428429
return supportsHopperSpecialisation() && config_is_sm90;
429430
}
430431

@@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const
440441
return this->sm_;
441442
}
442443

443-
// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction
444+
// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
444445
template <typename T, typename WeightType>
445446
bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const
446447
{
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so
2-
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
2+
265b039443334094026fbd8f396d52fe29c2d9d1 commit
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3
2+
oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7
33
size 1080832

cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu

+53-13
Original file line numberDiff line numberDiff line change
@@ -1072,10 +1072,38 @@ std::vector<size_t> CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWo
10721072
size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0;
10731073
size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node);
10741074

1075-
std::vector<size_t> workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size,
1076-
total_rows_before_expert_size, softmax_out_size, glu_inter_size,
1075+
// We do some overlapping of the large workspace buffers. Although we could overlap some of the other buffers, they
1076+
// are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother
1077+
// in the case of fused activation we overlap permuted_data and fc2_result
1078+
// in the case of unfused activation we overlap permuted_data and fc1_result
1079+
// we need to calculate the max possible size, so use the max of all three
1080+
size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size);
1081+
// When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case
1082+
if (glu_inter_elems > 0)
1083+
{
1084+
overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size);
1085+
}
1086+
1087+
// if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself
1088+
size_t overlapped_gemm1_gemm2_outputs = fc1_result_size;
1089+
if (glu_inter_elems > 0)
1090+
{
1091+
overlapped_gemm1_gemm2_outputs
1092+
= std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs);
1093+
}
1094+
1095+
std::vector<size_t> workspace{ //
1096+
source_rows_size, //
1097+
permuted_rows_size, //
1098+
permuted_experts_size, //
1099+
total_rows_before_expert_size, //
1100+
softmax_out_size, //
1101+
sorter_size, //
10771102
// These pointers reuse the same memory
1078-
std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size};
1103+
overlapped_gemm1_gemm2_inputs, //
1104+
overlapped_gemm1_gemm2_outputs, //
1105+
hopper_size, //
1106+
gemm_workspace_size};
10791107
return workspace;
10801108
}
10811109

@@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWorkspaceSize(i
10881116
TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size");
10891117
auto workspace = getWorkspaceBufferSizes(
10901118
num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type);
1091-
return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
1119+
auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
1120+
TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f);
1121+
return ws_size;
10921122
}
10931123

10941124
template <class T, class WeightType, class OutputType, class Enable>
@@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::configureWsPtrs(char
11091139
source_rows_ = (int*) ws_sliced[0];
11101140
permuted_rows_ = (int*) ws_sliced[1];
11111141
permuted_experts_ = (int*) ws_sliced[2];
1112-
permuted_data_ = (T*) ws_sliced[3];
11131142

1114-
total_rows_before_expert_ = (int64_t*) ws_sliced[4];
1143+
total_rows_before_expert_ = (int64_t*) ws_sliced[3];
11151144

11161145
softmax_out_ = nullptr;
11171146
bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
11181147
if (!is_pow_2 || num_experts > 256)
11191148
{
1120-
softmax_out_ = (float*) ws_sliced[5];
1149+
softmax_out_ = (float*) ws_sliced[4];
11211150
}
11221151

1123-
glu_inter_result_ = (T*) ws_sliced[6];
1152+
sorter_ws_ = (char*) ws_sliced[5];
11241153

1125-
// These pointers are aliased. Since the sort ws can be overwritten after it is finished
1126-
sorter_ws_ = (char*) ws_sliced[7];
1127-
fc1_result_ = (T*) ws_sliced[7];
1154+
// Always 6, but overlapped with either fc1_result_ or fc2_result_
1155+
permuted_data_ = (T*) ws_sliced[6];
11281156

1129-
fc2_result_ = (T*) ws_sliced[8];
1157+
bool const is_gated_activation = isGatedActivation(activation_type);
1158+
bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size);
1159+
bool const using_hopper = moe_gemm_runner_.isHopperSpecialised();
1160+
bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
1161+
bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation;
1162+
bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu;
1163+
// Always 7, ignored if not needed
1164+
glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr;
1165+
1166+
// fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped
1167+
fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7];
1168+
fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6];
11301169

11311170
hopper_grouped_gemm_input_ = {};
11321171
if (moe_gemm_runner_.isHopperSpecialised())
11331172
{
1134-
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]);
1173+
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]);
11351174
}
11361175
}
11371176

@@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::runMoe(void const* i
12931332
}
12941333
else
12951334
{
1335+
12961336
// Run the GEMM with activation function overridden with `Identity`, we do the activation separately
12971337
ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity;
12981338
T* gemm_result = (use_fused_moe) ? fc1_result_ : static_cast<T*>(glu_inter_result_);

cpp/tensorrt_llm/pybind/executor/bindings.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)
431431
&tle::DecodingConfig::setLookaheadDecoding)
432432
.def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);
433433

434-
auto executorConfigGetState = [&](tle::ExecutorConfig const& self)
434+
auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,
435+
&parallelConfigGetstate](tle::ExecutorConfig const& self)
435436
{
436437
py::object peftCacheConfigState = py::none();
437438

@@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)
453454
peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),
454455
self.getDecodingConfig(), self.getGpuWeightsPercent());
455456
};
456-
auto executorConfigSetState = [&](py::tuple state)
457+
auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,
458+
&parallelConfigSetstate](py::tuple state)
457459
{
458460
if (state.size() != 15)
459461
{

cpp/tensorrt_llm/runtime/medusaModule.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
9696
if (curDepth != depth)
9797
{
9898
TLLM_CHECK(depth + 1 == curDepth);
99+
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
100+
"Medusa choices require more Medusa heads than the engine was built with.");
99101
// Save TopK
100102
topKs[depth - 1] = maxTopK;
103+
101104
// Accumulate TopK for global indexing in tree
102105
globalNodeInTreeIdx += maxTopK;
103106

0 commit comments

Comments
 (0)