
Commit 77d7fe1

kaiyux and aotman authored
Update TensorRT-LLM (NVIDIA#2849)
* Update TensorRT-LLM

---------

Co-authored-by: aotman <[email protected]>
1 parent 0bcfdca commit 77d7fe1

621 files changed: +1912852 -1889046 lines changed


.github/workflows/blossom-ci.yml

+3-3
@@ -25,7 +25,7 @@ on:
         required: true
       test_result:
         description: 'test result'
-        required: false
+        required: true
       test_results_url:
         description: 'test results url'
         required: true
@@ -38,7 +38,7 @@ jobs:
 
     # This job only runs for pull request comments
     if: |
-      startsWith( github.event.comment.body, '/bot' ) && contains('["niukuo", "tburt-nv"]', github.actor)
+      startsWith( github.event.comment.body, '/bot' ) && contains('["chzblych", "tburt-nv", "niukuo"]', github.actor)
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
@@ -81,7 +81,7 @@ jobs:
       CI_SERVER: ${{ secrets.CI_SERVER }}
       REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-  Upload-Test:
+  Upload-results:
     name: Upload test results
     runs-on: linux-amd64-cpu4
     if: github.event_name == 'workflow_dispatch'

.gitmodules

+7-7
@@ -1,22 +1,22 @@
 [submodule "3rdparty/cutlass"]
     path = 3rdparty/cutlass
-    url = https://github.com/NVIDIA/cutlass.git
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/cutlass-mirror.git
 [submodule "3rdparty/json"]
     path = 3rdparty/json
-    url = https://github.com/nlohmann/json.git
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/json-mirror.git
 [submodule "3rdparty/cxxopts"]
     path = 3rdparty/cxxopts
-    url = https://github.com/jarro2783/cxxopts
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/cxxopts-mirror.git
     branch = v3.1.1
 [submodule "3rdparty/NVTX"]
     path = 3rdparty/NVTX
-    url = https://github.com/NVIDIA/NVTX.git
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/NVTX-mirror.git
 [submodule "3rdparty/ucxx"]
     path = 3rdparty/ucxx
-    url = https://github.com/rapidsai/ucxx.git
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/ucxx.git
 [submodule "3rdparty/pybind11"]
     path = 3rdparty/pybind11
-    url = https://github.com/pybind/pybind11.git
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/pybind11.git
 [submodule "3rdparty/xgrammar"]
     path = 3rdparty/xgrammar
-    url = https://github.com/mlc-ai/xgrammar.git
+    url = https://gitlab-master.nvidia.com/ftp/GitHubSync/xgrammar.git

README.md

+1-1
@@ -21,7 +21,7 @@ TensorRT-LLM
 * [2025/02/25] 🌟 DeepSeek-R1 performance now optimized for Blackwell [➡️ link](https://huggingface.co/nvidia/DeepSeek-R1-FP4)
 <div align="center">
 <img src="docs/source/media/r1-perf.jpeg" width="75%">
-
+
 <sub><sup>HGX B200 (8 GPUs) vs HGX H200 (8 GPUs) vs 2 x HGX H100 (normalized to 8 GPUs for comparison). Input tokens not included in TPS calculations. TensorRT-LLM Version: 0.18.0.dev2025021800 (pre-release) used for Feb measurements, SGLang used for Jan measurements. Hopper numbers in FP8. B200 numbers in FP4. Max concurrency use case. ISL/OSL: 1K/1K.</sub></sup>
 <div align="left">

benchmarks/cpp/disaggServerBenchmark.cpp

+1
@@ -527,6 +527,7 @@ texec::Request makeExecutorContextRequest(Sample const& sample, SizeType32 const
         lookaheadConfig, // lookaheadConfig
         std::nullopt, // kvCacheRetentionConfig
         std::nullopt, // logitsPostProcessorName
+        std::nullopt, // logitsPostProcessor
         encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
     request.setRequestType(tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_ONLY);
     return request;

benchmarks/cpp/gptManagerBenchmark.cpp

+1
@@ -833,6 +833,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
         lookaheadConfig, // lookaheadConfig
         std::nullopt, // kvCacheRetentionConfig
         std::nullopt, // logitsPostProcessorName
+        std::nullopt, // logitsPostProcessor
         encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
 }
 
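The identical one-line change in both benchmarks follows from how these requests are built: a long list of positional optional arguments, each annotated with a trailing comment naming its slot. When a new optional parameter (here logitsPostProcessor) is inserted mid-list, every call site must add an explicit std::nullopt placeholder so the later arguments stay aligned. A minimal self-contained sketch of the pattern; makeRequest and its parameters are hypothetical stand-ins, not the actual texec::Request API:

#include <iostream>
#include <optional>
#include <string>

// Hypothetical stand-in for an API with many positional optional parameters.
void makeRequest(std::optional<int> lookaheadConfig = std::nullopt,
    std::optional<int> kvCacheRetentionConfig = std::nullopt,
    std::optional<std::string> logitsPostProcessorName = std::nullopt,
    std::optional<std::string> logitsPostProcessor = std::nullopt, // newly inserted slot
    std::optional<std::string> encoderInputTokenIds = std::nullopt)
{
    std::cout << encoderInputTokenIds.value_or("<no encoder input>") << '\n';
}

int main()
{
    // Without the extra std::nullopt, "tokens" would silently bind to the new
    // logitsPostProcessor slot instead of encoderInputTokenIds.
    makeRequest(std::nullopt, // lookaheadConfig
        std::nullopt,         // kvCacheRetentionConfig
        std::nullopt,         // logitsPostProcessorName
        std::nullopt,         // logitsPostProcessor (new)
        "tokens");            // encoderInputTokenIds
}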

cpp/include/tensorrt_llm/batch_manager/GptManager.h

+1
@@ -94,6 +94,7 @@ class [[deprecated("Use the executor API instead.")]] GptManager
     [[nodiscard]] SizeType32 getMaxSequenceLen() const;
     [[nodiscard]] SizeType32 getMaxNumSequences() const;
     [[nodiscard]] SizeType32 getMaxDraftLen() const;
+    [[nodiscard]] SizeType32 getVocabSizePadded() const;
 
     void validateLlmRequest(
         LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;

cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h

+12-5
@@ -41,7 +41,9 @@ class CacheTransceiverFactory
 public:
     static std::unique_ptr<BaseCacheTransceiver> createCacheTransceiver(
         kv_cache_manager::BaseKVCacheManager* cacheManager, runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig);
+        runtime::WorldConfig const& worldConfig,
+        executor::kv_cache::CacheState::AttentionType attentionType
+        = executor::kv_cache::CacheState::AttentionType::kDEFAULT);
 };
 
 class BaseCacheTransceiver
@@ -75,14 +77,18 @@ class CacheTransceiver : public BaseCacheTransceiver
 
     CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
         executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
-        nvinfer1::DataType dataType);
+        nvinfer1::DataType dataType,
+        executor::kv_cache::CacheState::AttentionType attentionType
+        = executor::kv_cache::CacheState::AttentionType::kDEFAULT);
 
     CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
         std::vector<SizeType32> numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType)
+        runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType,
+        executor::kv_cache::CacheState::AttentionType attentionType
+        = executor::kv_cache::CacheState::AttentionType::kDEFAULT)
         : CacheTransceiver(cacheManager, commType,
             executor::kv_cache::CacheState::ModelConfig{numKvHeadsPerLayer, sizePerHead, tokensPerBlock}, worldConfig,
-            dataType)
+            dataType, attentionType)
     {
     }
 
@@ -113,7 +119,8 @@ class CacheTransceiver : public BaseCacheTransceiver
     std::map<LlmRequest*, std::future<void>> mResponderFutures;
     std::vector<std::pair<LlmRequest*, std::future<void>>> mRequesterFutures;
     mpi::MpiComm const *mMpiGroupComm{}, *mMpiWorldComm{};
-    std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm;
+    std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm, mMpiGroupDataComm,
+        mMpiGroupTPInDPComm;
     executor::kv_cache::CommState const* mCommState;
     std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
     std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
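Because the new attentionType parameter is defaulted to kDEFAULT on every overload above, existing call sites keep compiling unchanged; only callers that need a non-default attention layout pass it explicitly. A hedged sketch against the factory declaration above; the exact namespaces and the argument objects are assumptions, not code from this commit:

#include "tensorrt_llm/batch_manager/cacheTransceiver.h"

namespace tb = tensorrt_llm::batch_manager;
namespace tek = tensorrt_llm::executor::kv_cache;

std::unique_ptr<tb::BaseCacheTransceiver> makeTransceiver(
    tb::kv_cache_manager::BaseKVCacheManager* cacheManager,
    tensorrt_llm::runtime::ModelConfig const& modelConfig,
    tensorrt_llm::runtime::WorldConfig const& worldConfig)
{
    // Equivalent to the pre-change behavior: passing the default explicitly.
    // Omitting the last argument gives the same result.
    return tb::CacheTransceiverFactory::createCacheTransceiver(
        cacheManager, modelConfig, worldConfig, tek::CacheState::AttentionType::kDEFAULT);
}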

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h

+8-1
@@ -41,11 +41,18 @@ class DecoderInputBuffers
     using SizeType32 = runtime::SizeType32;
     using TensorPtr = runtime::ITensor::SharedPtr;
 
-    explicit DecoderInputBuffers(SizeType32 maxBatchSize, SizeType32 maxTokensPerEngineStep);
+    explicit DecoderInputBuffers(
+        SizeType32 maxBatchSize, SizeType32 maxTokensPerEngineStep, runtime::BufferManager const& manager);
 
+    // buffers for setup
     TensorPtr setupBatchSlots;
     TensorPtr inputsIds;
 
+    // buffers for forward
+    TensorPtr forwardBatchSlotsRequestOrder;
+    TensorPtr forwardBatchSlotsRequestOrderDevice;
+    TensorPtr fillValues;
+    TensorPtr fillValuesDevice;
     TensorPtr forwardBatchSlots;
 };
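The constructor now takes a BufferManager, presumably so the four new forward-pass tensors can be allocated eagerly alongside the setup buffers rather than lazily. A hedged construction sketch; the stream setup and the sizes are illustrative assumptions, not code from this commit:

#include <memory>

#include "tensorrt_llm/batch_manager/decoderBuffers.h"
#include "tensorrt_llm/runtime/bufferManager.h"

using namespace tensorrt_llm;

batch_manager::DecoderInputBuffers makeDecoderInputBuffers()
{
    // A BufferManager is bound to a CUDA stream elsewhere in the runtime;
    // a fresh stream stands in for it here.
    auto stream = std::make_shared<runtime::CudaStream>();
    runtime::BufferManager manager{stream};
    return batch_manager::DecoderInputBuffers(
        /*maxBatchSize=*/64, /*maxTokensPerEngineStep=*/8, manager);
}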

cpp/include/tensorrt_llm/batch_manager/handleGenerationLogits.h

+3-3
@@ -41,9 +41,9 @@ class HandleGenerationLogits : Algorithm
 
     HandleGenerationLogits() = default;
 
-    void operator()(tr::SizeType32 logitsIndex, RequestVector const& contextRequests,
-        RequestVector const& generationRequests, RuntimeBuffers const& genRuntimeBuffers,
-        DecoderBuffers& decoderBuffers, tr::ModelConfig const& modelConfig, runtime::TllmRuntime const& runtime) const;
+    void operator()(tr::SizeType32 logitsIndex, RequestVector const& generationRequests,
+        RuntimeBuffers& genRuntimeBuffers, DecoderBuffers& decoderBuffers, tr::ModelConfig const& modelConfig,
+        runtime::TllmRuntime const& runtime) const;
 };
 
 } // namespace tensorrt_llm::batch_manager
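Compared with the old signature, the context requests are no longer passed and genRuntimeBuffers becomes a mutable reference, so the algorithm can write back into the runtime buffers. A hedged sketch of a call site under the new operator(); the parameter objects and the tr alias are assumed to match the header's surroundings:

#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"

namespace tb = tensorrt_llm::batch_manager;
namespace tr = tensorrt_llm::runtime; // mirrors the tr alias used in the header

void dispatchGenerationLogits(tr::SizeType32 logitsIndex,
    tb::RequestVector const& generationRequests, tb::RuntimeBuffers& genRuntimeBuffers,
    tb::DecoderBuffers& decoderBuffers, tr::ModelConfig const& modelConfig,
    tr::TllmRuntime const& runtime)
{
    tb::HandleGenerationLogits handleGenerationLogits{};
    // contextRequests is no longer an argument under the new signature.
    handleGenerationLogits(logitsIndex, generationRequests, genRuntimeBuffers,
        decoderBuffers, modelConfig, runtime);
}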

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

+27-2
@@ -491,11 +491,14 @@ class BlockManager
 
     void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);
 
+    //! \brief Get the ids of all newly allocated (not reused) blocks for the sequence.
+    std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const;
+
     //! \brief Release blocks of the sequence. Store blocks for reuse if llmReqeust is provided.
     void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt);
 
     //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
-    void schedulingReleaseBlocks(GenerationRequest& sequence);
+    void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
 
     //! \brief Release last block in the sequence
     void releaseLastBlock(GenerationRequest& sequence);
@@ -658,6 +661,11 @@ class BlockManager
 
     [[nodiscard]] static bool blockInRadixTree(BlockPtr const& block);
 
+    [[nodiscard]] bool isEnableHashKey() const
+    {
+        return mEnableHashKey;
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -849,6 +857,7 @@ class BaseKVCacheManager
     virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
 
     [[nodiscard]] virtual GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const = 0;
+    [[nodiscard]] virtual GenerationRequest& getSequence(LlmRequest::RequestIdType requestId) = 0;
 
     [[nodiscard]] virtual bool isCrossKv() const = 0;
 
@@ -872,6 +881,10 @@ class BaseKVCacheManager
         std::vector<LlmRequest::RequestIdType> const& requestIds) const
         = 0;
 
+    [[nodiscard]] virtual std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(
+        LlmRequest::RequestIdType requestId) const
+        = 0;
+
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
     [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
 
@@ -904,6 +917,8 @@ class BaseKVCacheManager
     /// @param outputLength The number of output tokens in each sequence in the batch.
     /// @return SizeType32 A number of sequences per batch.
     [[nodiscard]] virtual SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const = 0;
+
+    [[nodiscard]] virtual CacheType getCacheType() const = 0;
 };
 
 class KVCacheManager : public BaseKVCacheManager
@@ -935,7 +950,7 @@ class KVCacheManager : public BaseKVCacheManager
         SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
         bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
@@ -1100,12 +1115,18 @@ class KVCacheManager : public BaseKVCacheManager
     void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;
 
     [[nodiscard]] GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const override;
+    [[nodiscard]] GenerationRequest& getSequence(LlmRequest::RequestIdType requestId) override;
 
     [[nodiscard]] bool isCrossKv() const override
     {
         return mBlockManager.getCacheType() == CacheType::kCROSS;
     }
 
+    [[nodiscard]] CacheType getCacheType() const override
+    {
+        return mBlockManager.getCacheType();
+    }
+
     //! \brief Find first new block that must be allocated for context phase and return it's concatenated token vector.
     //! \details Only full blocks are considered.
     [[nodiscard]] std::optional<BlockKey> findNewContextBlock(
@@ -1148,6 +1169,8 @@ class KVCacheManager : public BaseKVCacheManager
     std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
         std::vector<LlmRequest::RequestIdType> const& requestIds) const override;
 
+    std::vector<SizeType32> getNewlyAllocatedBlockIds(LlmRequest::RequestIdType requestId) const override;
+
     runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override;
 
     SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override
@@ -1219,6 +1242,8 @@ class KVCacheManager : public BaseKVCacheManager
     bool mEnableHashKey;
     // Whether use one more block for each sequence
     bool mUseOneMoreBlock;
+    // Mutex to protect access to mSequences
+    mutable std::mutex mSequencesMtx;
     // buffers for static tensors, will be created after allocating pools
     runtime::ITensor::SharedPtr mBlockPoolPointers;
     runtime::ITensor::SharedPtr mLayerToPoolMapping;
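The additions to BaseKVCacheManager surface two things through the abstract interface that previously required the concrete KVCacheManager: which blocks of a sequence were freshly allocated rather than satisfied by reuse, and what kind of cache this is. A hedged sketch against the interface above; the request id source and the use made of the results are assumptions, not code from this commit:

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;

void inspectSequence(tbk::BaseKVCacheManager& cacheManager,
    tensorrt_llm::batch_manager::LlmRequest::RequestIdType requestId)
{
    // Ids of blocks allocated for this sequence that were not reused;
    // e.g. only these would need transferring in disaggregated serving.
    auto newBlockIds = cacheManager.getNewlyAllocatedBlockIds(requestId);

    // The cache type is now queryable without downcasting to KVCacheManager.
    if (cacheManager.getCacheType() == tbk::CacheType::kSELF)
    {
        // self-attention cache (kCROSS marks an encoder-decoder cross cache)
    }
    (void) newBlockIds;
}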
