Commit aaacc9b

kaiyux and StarrickLiu authored

Update TensorRT-LLM (NVIDIA#2562)

* Update TensorRT-LLM

---------

Co-authored-by: Starrick Liu <[email protected]>

1 parent 340a1b6 commit aaacc9b

File tree: 459 files changed (+858777, -364419 lines)


.github/workflows/auto-assign.yml (-43)
This file was deleted.

@@ -1,29 +1,25 @@
 # Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues
 name: Close inactive issues
 on:
-  workflow_dispatch:
   schedule:
-    - cron: "0 * * * *"
+    - cron: "30 1 * * *"
 
 jobs:
   stale:
     runs-on: ubuntu-latest
     permissions:
       issues: write
       pull-requests: write
-
     steps:
       - uses: actions/stale@v9
         with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 15
+          stale-issue-label: "stale"
+          exempt-issue-labels: ""
+          stale-issue-message: This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days."
+          close-issue-message: "This issue was closed because it has been stalled for 15 days with no activity."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
           repo-token: ${{ secrets.GITHUB_TOKEN }}
-          stale-issue-message: 'Issue has not received an update in over 14 days. Adding stale label.'
-          stale-pr-message: 'PR has not received an update in over 14 days. Adding stale label.'
-          close-issue-message: 'This issue was closed because it has been 14 days without activity since it has been marked as stale.'
-          close-pr-message: 'This PR was closed because it has been 14 days without activity since it has been marked as stale.'
-          days-before-issue-stale: 14
-          days-before-close: 14
-          only-labels: 'waiting for feedback'
-          labels-to-add-when-unstale: 'investigating'
-          labels-to-remove-when-unstale: 'stale,waiting for feedback'
-          stale-issue-label: 'stale'
-          stale-pr-label: 'stale'
+          debug-only: false

.github/workflows/module-owners.json (-13)
This file was deleted.

README.md (+5, -4)

@@ -5,13 +5,13 @@ TensorRT-LLM
 <h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>
 
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.6.2-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.6.0-green)](https://developer.nvidia.com/tensorrt)
+[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
+[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
 [![version](https://img.shields.io/badge/release-0.16.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
-[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
+[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
 
 ---
 <div align="left">

@@ -151,6 +151,7 @@ To get started with TensorRT-LLM, visit our documentation:
 - [Release Notes](https://nvidia.github.io/TensorRT-LLM/release-notes.html)
 - [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
 - [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
+- [Installation Guide for Grace Hopper](https://nvidia.github.io/TensorRT-LLM/installation/grace-hopper.html)
 - [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)
 
 ## Community

benchmarks/cpp/gptManagerBenchmark.cpp (+37, -14)

@@ -259,6 +259,9 @@ class Recorder
         const std::lock_guard<std::mutex> lock(mRequestBenchInfosMutex);
         mRequestBenchInfos[requestId].outputLength = outSeqLen;
         mRequestBenchInfos[requestId].decodingIter = response.getResult().decodingIter;
+
+        // We record the first beam for the response file
+        mResponseTensors[requestId] = outputTokenIds[0];
     }
     else
     {

@@ -492,14 +495,19 @@ class Recorder
         nlohmann::json jsonResponses = nlohmann::json::array();
         for (auto const& [respId, respTokensTensor] : mResponseTensors)
         {
-            int inputLength = mRequestBenchInfos[respId].inputLength;
-            int outputLength = mRequestBenchInfos[respId].outputLength;
-            std::vector<int32_t> outputTokens(outputLength);
+            auto respTokens = mResponseTensors[respId];
+            int respLength = respTokens.size();
+            int* respBufferPtr = respTokens.data();
 
-            int32_t* outputToksBufferPtr = bufferCast<int32_t>(*respTokensTensor);
             if (mOutputHasInput)
-                outputToksBufferPtr += inputLength;
-            std::copy(outputToksBufferPtr, outputToksBufferPtr + outputLength, outputTokens.begin());
+            {
+                int inputSeqLen = mRequestBenchInfos[respId].inputLength;
+                respBufferPtr += inputSeqLen;
+                respLength -= inputSeqLen;
+            }
+
+            std::vector<int32_t> outputTokens(respLength);
+            std::copy(respBufferPtr, respBufferPtr + respLength, outputTokens.begin());
 
             nlohmann::json currResp;
             currResp["response_id"] = respId;

@@ -552,7 +560,7 @@ class Recorder
     bool mStreaming;
     int mBeamWidth;
    std::string mRespJsonFile;
-    std::unordered_map<uint64_t, TensorPtr> mResponseTensors;
+    std::unordered_map<uint64_t, texec::VecTokens> mResponseTensors;
     bool mOutputHasInput;
     std::mutex mRequestBenchInfosMutex;
 

@@ -792,7 +800,8 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     std::optional<int32_t> const& eosId, std::optional<int32_t> const& padId, BenchmarkParams const& benchmarkParams,
     texec::CapacitySchedulerPolicy capacitySchedulerPolicy, std::chrono::milliseconds waitSleep,
     bool returnContextLogits, bool returnGenerationLogits, std::optional<int> const staticEmulatedBatchSize,
-    bool logIterationData, std::optional<SizeType32> const maxPromptLen, texec::ModelType executorModelType)
+    bool logIterationData, std::optional<SizeType32> const maxPromptLen, texec::ModelType executorModelType,
+    std::string const& responsesJsonFile)
 {
     auto const& world = tensorrt_llm::mpi::MpiComm::world();
     auto worldRank = world.getRank();

@@ -801,7 +810,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     auto const samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
     auto const numSamples = samples.size();
 
-    auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth);
+    auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth, responsesJsonFile);
     int32_t decoderStartTokenId = 0;
     std::shared_ptr<ExecutorServer> executorServer;
 

@@ -989,6 +998,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
         recorder->calculateMetrics();
         recorder->report();
         recorder->writeOpMetricsToCsv();
+        recorder->dumpResponseSeqs();
         // Send terminateReqId to terminate servers on all ranks
         // Sever on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases
         // gptServer->enqueue(std::make_shared<InferenceRequest>(terminateReqId));

@@ -1047,11 +1057,13 @@ int main(int argc, char* argv[])
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
         cxxopts::value<bool>()->default_value("false"));
-    options.add_options()("streaming", "Operate in streaming mode", cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("streaming",
+        "Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency",
+        cxxopts::value<bool>()->default_value("false"));
     options.add_options()(
         "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("false"));
-    options.add_options()("enable_chunked_context", "Whether to enable context chunking.",
-        cxxopts::value<bool>()->default_value("false"));
+    options.add_options()(
+        "enable_chunked_context", "Whether to enable context chunking.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "return_context_logits", "Whether to return context logits.", cxxopts::value<bool>()->default_value("false"));
     options.add_options()("return_generation_logits", "Whether to return generation logits.",

@@ -1064,7 +1076,7 @@ int main(int argc, char* argv[])
     options.add_options()("static_emulated_batch_size",
         "Emulate static batching performance with the provided batch size.", cxxopts::value<SizeType32>());
     options.add_options()("log_level", "Choose log level between verbose/info/warning/error/internal_error.",
-        cxxopts::value<std::string>()->default_value("error"));
+        cxxopts::value<std::string>()->default_value("warning"));
     options.add_options()("log_iteration_data", "On each decoder iteration, print batch state metadata.",
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()("wait_sleep", "Specify how many milliseconds to sleep each iteration of waitForEmpty loop.",

@@ -1111,6 +1123,8 @@ int main(int argc, char* argv[])
         "lookahead config in the format of [max_window_size, max_ngram_size, max_verification_set_size], and each <= "
         "executor lookahead config",
         cxxopts::value<std::string>());
+    options.add_options()("responses_json", "Write output response sequences to a json file",
+        cxxopts::value<std::string>()->default_value(""));
 
     auto result = options.parse(argc, argv);
 

@@ -1137,6 +1151,12 @@ int main(int argc, char* argv[])
         {
             TLLM_LOG_WARNING("type option \"V1\" is going to be renamed to \"static\".");
         }
+        bool streaming = result["streaming"].as<bool>();
+        if (streaming)
+        {
+            TLLM_LOG_ERROR("Streaming is not supported in static batching.\n");
+            return 1;
+        }
         batchingType = texec::BatchingType::kSTATIC;
     }
     else if (type == "IFB" || type == "inflight")

@@ -1419,6 +1439,9 @@ int main(int argc, char* argv[])
 
     initTrtLlmPlugins(logger.get());
 
+    // Argument: output sequences JSON
+    auto const responsesJsonFile = result["responses_json"].as<std::string>();
+
     // Argument: API
     auto const api = result["api"].as<std::string>();
     if (api == "executor")

@@ -1449,7 +1472,7 @@ int main(int argc, char* argv[])
             benchmarkExecutor(decoderEngineDir, encoderEngineDir, batchingType, datasetPath, opCsvFile, maxNumSamples,
                 beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, capacitySchedulerPolicy,
                 waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, logIterationData,
-                maxPromptLen, executorModelType);
+                maxPromptLen, executorModelType, responsesJsonFile);
         }
         catch (std::exception const& e)
         {
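A note on the Recorder change above: the benchmark now keeps the first beam's token ids per request and, when --responses_json is given, writes them out after stripping the echoed input prefix whenever the engine output still contains the prompt (mOutputHasInput). Below is a minimal, self-contained sketch of that trim-and-dump pattern using nlohmann::json; the trimResponse helper, the sample token values, and the "tokens" field name are illustrative assumptions, not taken from the benchmark source.

// Sketch only: mirrors the trimming logic added to the Recorder above,
// but with a free function and std::vector so it compiles on its own.
#include <nlohmann/json.hpp>

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper: drop the echoed input prefix when the output contains it.
std::vector<int32_t> trimResponse(std::vector<int32_t> const& respTokens, int inputSeqLen, bool outputHasInput)
{
    auto const* ptr = respTokens.data();
    auto length = static_cast<int>(respTokens.size());
    if (outputHasInput)
    {
        ptr += inputSeqLen;
        length -= inputSeqLen;
    }
    return std::vector<int32_t>(ptr, ptr + length);
}

int main()
{
    // Pretend the first 3 tokens are the echoed prompt.
    std::vector<int32_t> respTokens{1, 2, 3, 100, 101, 102};

    nlohmann::json jsonResponses = nlohmann::json::array();
    nlohmann::json currResp;
    currResp["response_id"] = 42;
    // The "tokens" key is illustrative; the benchmark's actual key is not shown in this hunk.
    currResp["tokens"] = trimResponse(respTokens, /*inputSeqLen=*/3, /*outputHasInput=*/true);
    jsonResponses.push_back(currResp);

    // Prints a JSON array with response_id 42 and tokens [100, 101, 102].
    std::cout << jsonResponses.dump(2) << std::endl;
    return 0;
}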

benchmarks/cpp/utils/utils.cpp (+7)

@@ -94,6 +94,13 @@ Samples parseWorkloadJson(
         }
         samples.emplace_back(Sample{std::move(input_ids), sample["output_len"], taskId});
     }
+
+    if (samples.size() < maxNumSamples)
+    {
+        TLLM_LOG_WARNING(
+            "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
+            samples.size(), maxNumSamples);
+    }
     return samples;
 }
 

benchmarks/cpp/utils/utils.h (+1, -1)

@@ -52,7 +52,7 @@ struct BenchmarkParams
     bool enableBatchSizeTuning{false};
     bool enableMaxNumTokensTuning{false};
     bool enableBlockReuse{false};
-    bool enableChunkedContext{false};
+    bool enableChunkedContext{true};
     bool streaming{false};
     bool enableExpDelays{false};
     std::optional<float> requestRate{std::nullopt};

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h (+4, -7)

@@ -35,6 +35,7 @@
 #include <list>
 #include <memory>
 #include <optional>
+#include <set>
 #include <unordered_map>
 #include <vector>
 

@@ -52,6 +53,7 @@ static constexpr SizeType32 kSecondaryLevel = 1;
 
 class KVCacheBlock;
 class KVCacheManager;
+class KVCacheTransferManager;
 
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
 using TokenIdType = tensorrt_llm::runtime::TokenIdType;

@@ -622,13 +624,6 @@ class BlockManager
     void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt,
         std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
 
-    //! \brief Compute pointer to raw KV block (K & V, all layers).
-    [[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
-        std::shared_ptr<KVCacheBlock> block, SizeType32 poolIdx) const;
-
-    //! \brief Copy content of src block to dst.
-    void copyBlock(BlockPtr src, BlockPtr dst);
-
 private:
     // Number of blocks in pools
     SizeType32 mNumPrimaryBlocks;

@@ -667,6 +662,8 @@ class BlockManager
     std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
     // Event manager
     std::shared_ptr<KVCacheEventManager> mEventManager;
+    // Transfer manager
+    std::shared_ptr<KVCacheTransferManager> mTransferManager;
 
     // Statistics for block allocations/reuse
     // Total number of blocks allocated by all requests
New file (+62)

@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/runtime/cudaEvent.h"
+
+namespace tr = tensorrt_llm::runtime;
+
+#pragma once
+
+namespace tensorrt_llm::batch_manager::kv_cache_manager
+{
+
+// The TransferManager accelerates transfers to/from the GPU by overlapping HtoD and DtoH transfers, and tracks ongoing
+// transfers in order to avoid race conditions. It is functionally equivalent to the prior approach of putting all
+// transfers into the forward pass stream. This is only ever used as a component of a KVCacheManager.
+class KVCacheTransferManager
+{
+public:
+    explicit KVCacheTransferManager(tr::BufferManager const& bufferManager);
+
+    //! \brief Onboard a block to gpu memory.
+    void onboard(BlockPtr const& offloadBlock, BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools);
+
+    //! \brief Offload a block to cpu memory.
+    void offload(BlockPtr const& block, BlockPtr const& offloadBlock, std::vector<KVCacheBlockPool> const& pools);
+
+    //! \brief Synchronize the offload/onboard streams with the bufferManager stream.
+    void syncTransfers();
+
+private:
+    //! \brief Get pointer to pool specified by cache block.
+    static tr::ITensor::SharedPtr computeBlockPointer(
+        BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools, size_t poolIdx);
+
+    //! \brief Copy content of src block to dst.
+    void copyBlock(
+        BlockPtr const& src, BlockPtr const& dst, std::vector<KVCacheBlockPool> const& pools, bool isOffload);
+
+    runtime::BufferManager mBufferManager;
+    runtime::BufferManager mOnboardManager;
+    runtime::BufferManager mOffloadManager;
+
+    // Track the block ids offloaded in this iteration.
+    std::unordered_map<int32_t, tr::CudaEvent> mPendingOffloads;
+};
+
+} // namespace tensorrt_llm::batch_manager::kv_cache_manager
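The class comment above summarizes the design: DtoH offloads and HtoD onboards run on dedicated streams so copies can overlap, and each offload records an event keyed by block id so a later onboard of the same block never reads a partially written host copy. Below is a rough, self-contained sketch of that pattern using the raw CUDA runtime API rather than the project's BufferManager; the block size, single buffer pair, and error-check macro are illustrative assumptions, not the actual KVCacheTransferManager implementation.

// Sketch only: overlapping DtoH offload and HtoD onboard on separate streams,
// with an event recorded per offload so a later onboard of the same block waits for it.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <unordered_map>

#define CUDA_CHECK(call)                                                              \
    do                                                                                \
    {                                                                                 \
        cudaError_t status = (call);                                                  \
        if (status != cudaSuccess)                                                    \
        {                                                                             \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));     \
            std::exit(1);                                                             \
        }                                                                             \
    } while (0)

int main()
{
    constexpr size_t blockBytes = 1 << 20; // arbitrary block size for the sketch
    void* devBlock = nullptr;
    void* hostBlock = nullptr;
    CUDA_CHECK(cudaMalloc(&devBlock, blockBytes));
    CUDA_CHECK(cudaMallocHost(&hostBlock, blockBytes)); // pinned memory, required for async copies

    cudaStream_t offloadStream;
    cudaStream_t onboardStream;
    CUDA_CHECK(cudaStreamCreate(&offloadStream));
    CUDA_CHECK(cudaStreamCreate(&onboardStream));

    std::unordered_map<int32_t, cudaEvent_t> pendingOffloads; // analogous role to mPendingOffloads

    // Offload block 0 to host memory on the offload stream and record an event for it.
    int32_t const blockId = 0;
    cudaEvent_t offloadDone;
    CUDA_CHECK(cudaEventCreate(&offloadDone));
    CUDA_CHECK(cudaMemcpyAsync(hostBlock, devBlock, blockBytes, cudaMemcpyDeviceToHost, offloadStream));
    CUDA_CHECK(cudaEventRecord(offloadDone, offloadStream));
    pendingOffloads[blockId] = offloadDone;

    // Onboard the same block later: wait on the recorded event so the HtoD copy
    // never reads a half-written host buffer, then copy on the onboard stream.
    auto it = pendingOffloads.find(blockId);
    if (it != pendingOffloads.end())
    {
        CUDA_CHECK(cudaStreamWaitEvent(onboardStream, it->second, 0));
    }
    CUDA_CHECK(cudaMemcpyAsync(devBlock, hostBlock, blockBytes, cudaMemcpyHostToDevice, onboardStream));

    // Simplified stand-in for syncTransfers(): drain both transfer streams before the blocks are reused.
    CUDA_CHECK(cudaStreamSynchronize(offloadStream));
    CUDA_CHECK(cudaStreamSynchronize(onboardStream));

    CUDA_CHECK(cudaStreamDestroy(offloadStream));
    CUDA_CHECK(cudaStreamDestroy(onboardStream));
    CUDA_CHECK(cudaEventDestroy(offloadDone));
    CUDA_CHECK(cudaFreeHost(hostBlock));
    CUDA_CHECK(cudaFree(devBlock));
    return 0;
}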
