Update TensorRT-LLM (#2333)
* Update TensorRT-LLM

---------

Co-authored-by: Puneesh Khanna <[email protected]>
Co-authored-by: Ethan Zhang <[email protected]>
3 people authored Oct 15, 2024
1 parent 8681b3a commit 75057cd
Showing 251 changed files with 8,125 additions and 1,534 deletions.
11 changes: 7 additions & 4 deletions README.md
@@ -8,7 +8,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.14.0.dev-green)](./tensorrt_llm/version.py)
[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)
@@ -17,12 +17,15 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)
* [2024/10/07] 🚀🚀🚀Optimizing Microsoft Bing Visual Search with NVIDIA Accelerated Libraries
[➡️ link](https://developer.nvidia.com/blog/optimizing-microsoft-bing-visual-search-with-nvidia-accelerated-libraries/)
<div align="center">
<img src="docs/source/media/image-09-29-2024.png" width="50%">
<img src="docs/source/media/image-10-07-2024.png" width="50%">
<div align="left">

* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)

* [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup
[➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link)

49 changes: 47 additions & 2 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -426,13 +426,19 @@ class Recorder
void initialize()
{
mStart = std::chrono::steady_clock::now();
mRequestsQueueingLatencies.clear();
}

void finalize()
{
mEnd = std::chrono::steady_clock::now();
}

void recordQueueLatency(std::vector<float> const& latencies)
{
mRequestsQueueingLatencies.insert(mRequestsQueueingLatencies.end(), latencies.begin(), latencies.end());
}

void recordStart(std::shared_ptr<InferenceRequest> request, uint64_t requestId)
{
auto const inputLength = request->getInputIds()->getSize();
@@ -677,6 +683,16 @@ class Recorder
mMaxGenT2TLatency = genT2TLatencies.back();
mMinGenT2TLatency = genT2TLatencies.front();
}

mAvgReqQueueingLatency
= std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F)
/ mRequestsQueueingLatencies.size();
std::sort(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end());
mP99ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 99);
mP90ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 90);
mP50ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 50);
mMaxReqQueueingLatency = mRequestsQueueingLatencies.back();
mMinReqQueueingLatency = mRequestsQueueingLatencies.front();
}
}

@@ -713,6 +729,13 @@ class Recorder
printf("[BENCHMARK] p99_inter_token_latency(ms) %.2f\n", mP99GenT2TLatency);
printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency);
printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency);

printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency);
printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency);
printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency);
printf("[BENCHMARK] p99_request_queueing_latency(ms) %.2f\n", mP99ReqQueueingLatency);
printf("[BENCHMARK] p90_request_queueing_latency(ms) %.2f\n", mP90ReqQueueingLatency);
printf("[BENCHMARK] p50_request_queueing_latency(ms) %.2f\n\n", mP50ReqQueueingLatency);
}
}

@@ -820,6 +843,13 @@ class Recorder
float mP50GenT2TLatency{};
float mMaxGenT2TLatency{};
float mMinGenT2TLatency{};
float mAvgReqQueueingLatency{};
float mP99ReqQueueingLatency{};
float mP90ReqQueueingLatency{};
float mP50ReqQueueingLatency{};
float mMaxReqQueueingLatency{};
float mMinReqQueueingLatency{};
std::vector<float> mRequestsQueueingLatencies{};

std::string mOpCsvFile;
bool mStreaming;
@@ -846,6 +876,7 @@ class ExecutorServer
, mActiveCount(0)
, mNumFinished(0)
, mShutdown(false)
, mLogIterationData(logIterationData)
{

texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
@@ -899,7 +930,9 @@ class ExecutorServer
TLLM_LOG_ERROR("not a supported executor model type in executor server.");
}

if (logIterationData)
auto const& world = tensorrt_llm::mpi::MpiComm::world();
auto worldRank = world.getRank();
if (worldRank == 0)
{
mCollectStatsThread = std::thread(&ExecutorServer::collectStats, this);
}
@@ -988,7 +1021,18 @@ class ExecutorServer
auto iterStats = mExecutor->getLatestIterationStats();
for (auto const& iterStat : iterStats)
{
TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat));
SizeType32 numNewActiveRequests = iterStat.numNewActiveRequests;
if (numNewActiveRequests > 0)
{
float avgQueueingTime
= static_cast<float>(iterStat.newActiveRequestsQueueLatencyMS / numNewActiveRequests);
std::vector<float> requestsQueueLatencyMS(numNewActiveRequests, avgQueueingTime);
mRecorder->recordQueueLatency(requestsQueueLatencyMS);
}
if (mLogIterationData)
{
TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat));
}
}
auto const waitSleep = std::chrono::milliseconds(50);
std::this_thread::sleep_for(waitSleep);
@@ -1005,6 +1049,7 @@
std::atomic<uint64_t> mActiveCount;
std::atomic<uint64_t> mNumFinished;
std::atomic<bool> mShutdown;
bool mLogIterationData;
}; // class ExecutorServer

class GptServer
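For readers following the new queueing-latency metrics above: a minimal, self-contained sketch of the sorted-vector percentile lookup the Recorder relies on. The real `calcPercentile` helper is defined elsewhere in the benchmark and may round differently; the function name and nearest-rank indexing below are assumptions for illustration only.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for calcPercentile: expects an ascending-sorted vector,
// matching how the Recorder sorts mRequestsQueueingLatencies before querying percentiles.
static float percentileOfSorted(std::vector<float> const& sortedValues, int percentile)
{
    assert(!sortedValues.empty() && percentile >= 0 && percentile <= 100);
    // Nearest-rank style index into the sorted samples (truncating).
    auto const index = static_cast<std::size_t>(
        (static_cast<double>(percentile) / 100.0) * static_cast<double>(sortedValues.size() - 1));
    return sortedValues[index];
}

// Example: with sorted latencies {1, 2, 5, 9}, percentileOfSorted(v, 50) returns 2
// and percentileOfSorted(v, 100) returns 9.
```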
38 changes: 37 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -201,6 +201,7 @@ class GenericLlmRequest
, mDecodingIter(0)
, mPriority(req.getPriority())
, mFinishReasons(mSamplingConfig.beamWidth)
, mEncoderInputFeatures(std::nullopt)
, mEncoderOutputLength(req.getEncoderOutputLength())
, mContextPhaseParams(req.getContextPhaseParams())
, mInputTokenExtraIds(std::nullopt)
@@ -263,7 +264,8 @@
auto pTuningConfig = req.getPromptTuningConfig();
if (pTuningConfig)
{
mPromptEmbeddingTable = executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable());
mPromptEmbeddingTable = tensorrt_llm::runtime::ITensor::view(
executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable()));
TLLM_CHECK(mPromptEmbeddingTable.value()->getShape().nbDims == 2);
mPromptVocabSize = mPromptEmbeddingTable.value()->getShape().d[0];
mPromptEmbeddingTable.value()->unsqueeze(0);
@@ -1438,6 +1440,36 @@ class GenericLlmRequest
0.0, std::chrono::duration<double, std::milli>(mKvCacheTransferEnd - mKvCacheTransferStart).count());
}

void updateAllocTotalBlocksPerRequest(SizeType32 allocTotalBlocksPerRequest)
{
mAllocTotalBlocksPerRequest += allocTotalBlocksPerRequest;
}

[[nodiscard]] SizeType32 getAllocTotalBlocksPerRequest() const
{
return mAllocTotalBlocksPerRequest;
}

void updateAllocNewBlocksPerRequest(SizeType32 allocNewBlocksPerRequest)
{
mAllocNewBlocksPerRequest += allocNewBlocksPerRequest;
}

[[nodiscard]] SizeType32 getAllocNewBlocksPerRequest() const
{
return mAllocNewBlocksPerRequest;
}

void updateReusedBlocksPerRequest(SizeType32 reusedBlocksPerRequest)
{
mReusedBlocksPerRequest += reusedBlocksPerRequest;
}

[[nodiscard]] SizeType32 getReusedBlocksPerRequest() const
{
return mReusedBlocksPerRequest;
}

RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
@@ -1545,6 +1577,10 @@ class GenericLlmRequest
std::chrono::time_point<std::chrono::steady_clock> mKvCacheTransferStart;
std::chrono::time_point<std::chrono::steady_clock> mKvCacheTransferEnd;

SizeType32 mAllocTotalBlocksPerRequest{0};
SizeType32 mAllocNewBlocksPerRequest{0};
SizeType32 mReusedBlocksPerRequest{0};

private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
{
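The three counters added to GenericLlmRequest above follow a simple accumulate-and-read pattern across scheduling iterations. A self-contained sketch of that pattern is below; the relationship total = new + reused and the caller shown in main() are assumptions for illustration, not taken from this commit.

```cpp
#include <cstdint>
#include <iostream>

using SizeType32 = std::int32_t;

// Stand-in mirroring the per-request KV-cache block counters added in llmRequest.h:
// each update adds to a running total that stats reporting later reads back.
struct RequestBlockStats
{
    SizeType32 allocTotalBlocks{0};
    SizeType32 allocNewBlocks{0};
    SizeType32 reusedBlocks{0};

    void update(SizeType32 newBlocks, SizeType32 reused)
    {
        allocNewBlocks += newBlocks;
        reusedBlocks += reused;
        allocTotalBlocks += newBlocks + reused; // assumption: total counts new + reused
    }
};

int main()
{
    RequestBlockStats stats;
    stats.update(/*newBlocks=*/8, /*reused=*/2); // context phase
    stats.update(/*newBlocks=*/1, /*reused=*/0); // a later generation step
    std::cout << "total=" << stats.allocTotalBlocks << " new=" << stats.allocNewBlocks
              << " reused=" << stats.reusedBlocks << '\n'; // total=11 new=9 reused=2
    return 0;
}
```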
8 changes: 8 additions & 0 deletions cpp/include/tensorrt_llm/executor/types.h
@@ -297,6 +297,8 @@ struct IterationStats
double iterLatencyMS;
/// @brief The total time spent in queue by the requests that became active in this iteration (ms)
double newActiveRequestsQueueLatencyMS;
/// @brief Number of new fetched active requests
SizeType32 numNewActiveRequests;
/// @brief Number of active requests
SizeType32 numActiveRequests;
/// @brief Number of queued requests
@@ -364,6 +366,12 @@ struct RequestStats
bool paused;
/// @brief Stats specific to disaggregated serving
std::optional<DisServingRequestStats> disServingStats;
/// @brief Number of total allocated blocks per request
SizeType32 allocTotalBlocksPerRequest;
/// @brief Number of newly allocated blocks per request
SizeType32 allocNewBlocksPerRequest;
/// @brief Number of reused blocks per request
SizeType32 reusedBlocksPerRequest;
};

/// @brief Struct that holds the stats of all requests in an iteration
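The new `numNewActiveRequests` field pairs with the existing `newActiveRequestsQueueLatencyMS` to give a per-request average queueing time, which is how the benchmark change above uses it. A condensed sketch, with the struct trimmed to only the fields relevant here:

```cpp
#include <cstdint>
#include <optional>

using SizeType32 = std::int32_t;

// Trimmed view of executor::IterationStats containing only the fields used below.
struct IterationStatsView
{
    double newActiveRequestsQueueLatencyMS; // total queue time of newly active requests
    SizeType32 numNewActiveRequests;        // new field added in this commit
};

// Average queueing latency per newly active request, if any became active this iteration.
std::optional<float> avgQueueLatencyMs(IterationStatsView const& stats)
{
    if (stats.numNewActiveRequests <= 0)
    {
        return std::nullopt;
    }
    return static_cast<float>(
        stats.newActiveRequestsQueueLatencyMS / static_cast<double>(stats.numNewActiveRequests));
}
```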
1 change: 0 additions & 1 deletion cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -115,7 +115,6 @@ class [[deprecated("Use the executor API instead.")]] GptSession
std::optional<SizeType32> genMicroBatchSize = std::nullopt;
std::optional<executor::DecodingMode> decodingMode = std::nullopt;
bool normalizeLogProbs = true;
std::optional<std::filesystem::path> enginePath;
};

//! @brief Optional profiler class to profile the generation phase of an inference request
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -127,6 +127,7 @@ class ModelConfig
, mContextFMHA(false)
, mPagedContextFMHA(false)
, mUseXQA{false}
, mPpReduceScatter{false}
, mUseLoraPlugin(false)
, mMlpHiddenSize(0)
, mUseCrossAttention(false)
@@ -468,6 +469,16 @@ class ModelConfig
return mUseXQA;
}

void constexpr setPpReduceScatter(bool ppReduceScatter) noexcept
{
mPpReduceScatter = ppReduceScatter;
}

[[nodiscard]] bool constexpr getPpReduceScatter() const noexcept
{
return mPpReduceScatter;
}

[[nodiscard]] bool constexpr useLoraPlugin() const noexcept
{
return mUseLoraPlugin;
@@ -759,6 +770,7 @@ class ModelConfig
bool mContextFMHA;
bool mPagedContextFMHA;
bool mUseXQA;
bool mPpReduceScatter;

bool mUseLoraPlugin;
std::vector<LoraModule> mLoraModules;
30 changes: 24 additions & 6 deletions cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h
@@ -50,6 +50,11 @@ class SpeculativeDecodingMode
return SpeculativeDecodingMode{kExplicitDraftTokens};
}

static auto constexpr Eagle()
{
return SpeculativeDecodingMode{kEagle};
}

[[nodiscard]] bool constexpr isNone() const
{
return anyBitSet(kNone);
@@ -75,29 +80,34 @@
return anyBitSet(kExplicitDraftTokens);
}

[[nodiscard]] bool constexpr isEagle() const
{
return anyBitSet(kEagle);
}

[[nodiscard]] bool constexpr updatesPositionIds() const
{
return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr requiresAttentionMask() const
{
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr predictsDraftTokens() const
{
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr needsKVCacheRewind() const
{
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr variableDraftLength() const
{
return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding);
return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding | kEagle);
}

[[nodiscard]] bool constexpr hasDraftLogits() const
@@ -107,7 +117,7 @@

[[nodiscard]] bool constexpr needsDecoderPrologue() const
{
return anyBitSet(kExplicitDraftTokens | kLookaheadDecoding);
return anyBitSet(kExplicitDraftTokens | kLookaheadDecoding | kEagle);
}

using UnderlyingType = std::uint8_t;
@@ -129,6 +139,7 @@
static UnderlyingType constexpr kMedusa{1U << 2U};
static UnderlyingType constexpr kLookaheadDecoding{1U << 3U};
static UnderlyingType constexpr kExplicitDraftTokens{1U << 4U};
static UnderlyingType constexpr kEagle{1U << 5U};

[[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const
{
@@ -173,4 +184,11 @@ static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isDraftTokensExter
static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isMedusa());
static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isLookaheadDecoding());

static_assert(SpeculativeDecodingMode::Eagle().isEagle());
static_assert(!SpeculativeDecodingMode::Eagle().isNone());
static_assert(!SpeculativeDecodingMode::Eagle().isDraftTokensExternal());
static_assert(!SpeculativeDecodingMode::Eagle().isMedusa());
static_assert(!SpeculativeDecodingMode::Eagle().isExplicitDraftTokens());
static_assert(!SpeculativeDecodingMode::Eagle().isLookaheadDecoding());

} // namespace tensorrt_llm::runtime
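Based only on the members visible in this header, the new Eagle mode participates in the same capability queries as the other draft-token modes. The checks below mirror the static_asserts above and would hold at compile time; the include directive assumes `cpp/include` is on the compiler's include path.

```cpp
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"

using tensorrt_llm::runtime::SpeculativeDecodingMode;

// Eagle now lights up the same capability bits as the other draft-token modes.
static_assert(SpeculativeDecodingMode::Eagle().updatesPositionIds());
static_assert(SpeculativeDecodingMode::Eagle().requiresAttentionMask());
static_assert(SpeculativeDecodingMode::Eagle().predictsDraftTokens());
static_assert(SpeculativeDecodingMode::Eagle().needsKVCacheRewind());
static_assert(SpeculativeDecodingMode::Eagle().variableDraftLength());
static_assert(SpeculativeDecodingMode::Eagle().needsDecoderPrologue());
```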
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
954182e0c057f71f858a84f746201044 libtensorrt_llm_batch_manager_static.a
dfe6ca360cf1d24a3dcae0a2bf8589c0 libtensorrt_llm_batch_manager_static.pre_cxx11.a
4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit
d7508bec7b6f112a2eac04cbeaf8b5da libtensorrt_llm_batch_manager_static.a
d8969624b327af844d9ffba910084b93 libtensorrt_llm_batch_manager_static.pre_cxx11.a
3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
61fd34e765788884d42f4ba27f085520 libtensorrt_llm_batch_manager_static.a
e8a64dd19a234304483ef6756e67fd40 libtensorrt_llm_batch_manager_static.pre_cxx11.a
4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit
7029ee9cb0a921a3603e98815da18985 libtensorrt_llm_batch_manager_static.a
0e7fe69b6621fe6dabcc0b372c3440f4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit