Update TensorRT-LLM (#2333)
* Update TensorRT-LLM

---------

Co-authored-by: Puneesh Khanna <[email protected]>
Co-authored-by: Ethan Zhang <[email protected]>
3 people authored Oct 15, 2024
1 parent 8681b3a commit 75057cd
Showing 251 changed files with 8,125 additions and 1,534 deletions.
11 changes: 7 additions & 4 deletions README.md
@@ -8,7 +8,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.14.0.dev-green)](./tensorrt_llm/version.py)
[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)
@@ -17,12 +17,15 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)
* [2024/10/07] 🚀🚀🚀Optimizing Microsoft Bing Visual Search with NVIDIA Accelerated Libraries
[➡️ link](https://developer.nvidia.com/blog/optimizing-microsoft-bing-visual-search-with-nvidia-accelerated-libraries/)
<div align="center">
<img src="docs/source/media/image-09-29-2024.png" width="50%">
<img src="docs/source/media/image-10-07-2024.png" width="50%">
<div align="left">

* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)

* [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup
[➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link)

49 changes: 47 additions & 2 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -426,13 +426,19 @@ class Recorder
void initialize()
{
mStart = std::chrono::steady_clock::now();
mRequestsQueueingLatencies.clear();
}

void finalize()
{
mEnd = std::chrono::steady_clock::now();
}

void recordQueueLatency(std::vector<float> const& latencies)
{
mRequestsQueueingLatencies.insert(mRequestsQueueingLatencies.end(), latencies.begin(), latencies.end());
}

void recordStart(std::shared_ptr<InferenceRequest> request, uint64_t requestId)
{
auto const inputLength = request->getInputIds()->getSize();
@@ -677,6 +683,16 @@ class Recorder
mMaxGenT2TLatency = genT2TLatencies.back();
mMinGenT2TLatency = genT2TLatencies.front();
}

mAvgReqQueueingLatency
= std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F)
/ mRequestsQueueingLatencies.size();
std::sort(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end());
mP99ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 99);
mP90ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 90);
mP50ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 50);
mMaxReqQueueingLatency = mRequestsQueueingLatencies.back();
mMinReqQueueingLatency = mRequestsQueueingLatencies.front();
}
}

@@ -713,6 +729,13 @@ class Recorder
printf("[BENCHMARK] p99_inter_token_latency(ms) %.2f\n", mP99GenT2TLatency);
printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency);
printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency);

printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency);
printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency);
printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency);
printf("[BENCHMARK] p99_request_queueing_latency(ms) %.2f\n", mP99ReqQueueingLatency);
printf("[BENCHMARK] p90_request_queueing_latency(ms) %.2f\n", mP90ReqQueueingLatency);
printf("[BENCHMARK] p50_request_queueing_latency(ms) %.2f\n\n", mP50ReqQueueingLatency);
}
}

@@ -820,6 +843,13 @@ class Recorder
float mP50GenT2TLatency{};
float mMaxGenT2TLatency{};
float mMinGenT2TLatency{};
float mAvgReqQueueingLatency{};
float mP99ReqQueueingLatency{};
float mP90ReqQueueingLatency{};
float mP50ReqQueueingLatency{};
float mMaxReqQueueingLatency{};
float mMinReqQueueingLatency{};
std::vector<float> mRequestsQueueingLatencies{};

std::string mOpCsvFile;
bool mStreaming;
@@ -846,6 +876,7 @@ class ExecutorServer
, mActiveCount(0)
, mNumFinished(0)
, mShutdown(false)
, mLogIterationData(logIterationData)
{

texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
@@ -899,7 +930,9 @@ class ExecutorServer
TLLM_LOG_ERROR("not a supported executor model type in executor server.");
}

if (logIterationData)
auto const& world = tensorrt_llm::mpi::MpiComm::world();
auto worldRank = world.getRank();
if (worldRank == 0)
{
mCollectStatsThread = std::thread(&ExecutorServer::collectStats, this);
}
@@ -988,7 +1021,18 @@ class ExecutorServer
auto iterStats = mExecutor->getLatestIterationStats();
for (auto const& iterStat : iterStats)
{
TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat));
SizeType32 numNewActiveRequests = iterStat.numNewActiveRequests;
if (numNewActiveRequests > 0)
{
float avgQueueingTime
= static_cast<float>(iterStat.newActiveRequestsQueueLatencyMS / numNewActiveRequests);
std::vector<float> requestsQueueLatencyMS(numNewActiveRequests, avgQueueingTime);
mRecorder->recordQueueLatency(requestsQueueLatencyMS);
}
if (mLogIterationData)
{
TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat));
}
}
auto const waitSleep = std::chrono::milliseconds(50);
std::this_thread::sleep_for(waitSleep);
@@ -1005,6 +1049,7 @@
std::atomic<uint64_t> mActiveCount;
std::atomic<uint64_t> mNumFinished;
std::atomic<bool> mShutdown;
bool mLogIterationData;
}; // class ExecutorServer

class GptServer
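For readers following the new queueing-latency metrics above: a minimal, self-contained sketch of the sorted-vector percentile lookup the Recorder relies on. The real `calcPercentile` helper is defined elsewhere in the benchmark and may round differently; the function name and nearest-rank indexing below are assumptions for illustration only.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for calcPercentile: expects an ascending-sorted vector,
// matching how the Recorder sorts mRequestsQueueingLatencies before querying percentiles.
static float percentileOfSorted(std::vector<float> const& sortedValues, int percentile)
{
    assert(!sortedValues.empty() && percentile >= 0 && percentile <= 100);
    // Nearest-rank style index into the sorted samples (truncating).
    auto const index = static_cast<std::size_t>(
        (static_cast<double>(percentile) / 100.0) * static_cast<double>(sortedValues.size() - 1));
    return sortedValues[index];
}

// Example: with sorted latencies {1, 2, 5, 9}, percentileOfSorted(v, 50) returns 2
// and percentileOfSorted(v, 100) returns 9.
```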
38 changes: 37 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -201,6 +201,7 @@ class GenericLlmRequest
, mDecodingIter(0)
, mPriority(req.getPriority())
, mFinishReasons(mSamplingConfig.beamWidth)
, mEncoderInputFeatures(std::nullopt)
, mEncoderOutputLength(req.getEncoderOutputLength())
, mContextPhaseParams(req.getContextPhaseParams())
, mInputTokenExtraIds(std::nullopt)
@@ -263,7 +264,8 @@
auto pTuningConfig = req.getPromptTuningConfig();
if (pTuningConfig)
{
mPromptEmbeddingTable = executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable());
mPromptEmbeddingTable = tensorrt_llm::runtime::ITensor::view(
executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable()));
TLLM_CHECK(mPromptEmbeddingTable.value()->getShape().nbDims == 2);
mPromptVocabSize = mPromptEmbeddingTable.value()->getShape().d[0];
mPromptEmbeddingTable.value()->unsqueeze(0);
@@ -1438,6 +1440,36 @@ class GenericLlmRequest
0.0, std::chrono::duration<double, std::milli>(mKvCacheTransferEnd - mKvCacheTransferStart).count());
}

void updateAllocTotalBlocksPerRequest(SizeType32 allocTotalBlocksPerRequest)
{
mAllocTotalBlocksPerRequest += allocTotalBlocksPerRequest;
}

[[nodiscard]] SizeType32 getAllocTotalBlocksPerRequest() const
{
return mAllocTotalBlocksPerRequest;
}

void updateAllocNewBlocksPerRequest(SizeType32 allocNewBlocksPerRequest)
{
mAllocNewBlocksPerRequest += allocNewBlocksPerRequest;
}

[[nodiscard]] SizeType32 getAllocNewBlocksPerRequest() const
{
return mAllocNewBlocksPerRequest;
}

void updateReusedBlocksPerRequest(SizeType32 reusedBlocksPerRequest)
{
mReusedBlocksPerRequest += reusedBlocksPerRequest;
}

[[nodiscard]] SizeType32 getReusedBlocksPerRequest() const
{
return mReusedBlocksPerRequest;
}

RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
@@ -1545,6 +1577,10 @@ class GenericLlmRequest
std::chrono::time_point<std::chrono::steady_clock> mKvCacheTransferStart;
std::chrono::time_point<std::chrono::steady_clock> mKvCacheTransferEnd;

SizeType32 mAllocTotalBlocksPerRequest{0};
SizeType32 mAllocNewBlocksPerRequest{0};
SizeType32 mReusedBlocksPerRequest{0};

private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
{
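The three counters added to GenericLlmRequest above follow a simple accumulate-and-read pattern across scheduling iterations. A self-contained sketch of that pattern is below; the relationship total = new + reused and the caller shown in main() are assumptions for illustration, not taken from this commit.

```cpp
#include <cstdint>
#include <iostream>

using SizeType32 = std::int32_t;

// Stand-in mirroring the per-request KV-cache block counters added in llmRequest.h:
// each update adds to a running total that stats reporting later reads back.
struct RequestBlockStats
{
    SizeType32 allocTotalBlocks{0};
    SizeType32 allocNewBlocks{0};
    SizeType32 reusedBlocks{0};

    void update(SizeType32 newBlocks, SizeType32 reused)
    {
        allocNewBlocks += newBlocks;
        reusedBlocks += reused;
        allocTotalBlocks += newBlocks + reused; // assumption: total counts new + reused
    }
};

int main()
{
    RequestBlockStats stats;
    stats.update(/*newBlocks=*/8, /*reused=*/2); // context phase
    stats.update(/*newBlocks=*/1, /*reused=*/0); // a later generation step
    std::cout << "total=" << stats.allocTotalBlocks << " new=" << stats.allocNewBlocks
              << " reused=" << stats.reusedBlocks << '\n'; // total=11 new=9 reused=2
    return 0;
}
```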
8 changes: 8 additions & 0 deletions cpp/include/tensorrt_llm/executor/types.h
@@ -297,6 +297,8 @@ struct IterationStats
double iterLatencyMS;
/// @brief The total time spent in queue by the requests that became active in this iteration (ms)
double newActiveRequestsQueueLatencyMS;
/// @brief Number of new fetched active requests
SizeType32 numNewActiveRequests;
/// @brief Number of active requests
SizeType32 numActiveRequests;
/// @brief Number of queued requests
@@ -364,6 +366,12 @@ struct RequestStats
bool paused;
/// @brief Stats specific to disaggregated serving
std::optional<DisServingRequestStats> disServingStats;
/// @brief Number of total allocated blocks per request
SizeType32 allocTotalBlocksPerRequest;
/// @brief Number of newly allocated blocks per request
SizeType32 allocNewBlocksPerRequest;
/// @brief Number of reused blocks per request
SizeType32 reusedBlocksPerRequest;
};

/// @brief Struct that holds the stats of all requests in an iteration
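The new `numNewActiveRequests` field pairs with the existing `newActiveRequestsQueueLatencyMS` to give a per-request average queueing time, which is how the benchmark change above uses it. A condensed sketch, with the struct trimmed to only the fields relevant here:

```cpp
#include <cstdint>
#include <optional>

using SizeType32 = std::int32_t;

// Trimmed view of executor::IterationStats containing only the fields used below.
struct IterationStatsView
{
    double newActiveRequestsQueueLatencyMS; // total queue time of newly active requests
    SizeType32 numNewActiveRequests;        // new field added in this commit
};

// Average queueing latency per newly active request, if any became active this iteration.
std::optional<float> avgQueueLatencyMs(IterationStatsView const& stats)
{
    if (stats.numNewActiveRequests <= 0)
    {
        return std::nullopt;
    }
    return static_cast<float>(
        stats.newActiveRequestsQueueLatencyMS / static_cast<double>(stats.numNewActiveRequests));
}
```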
1 change: 0 additions & 1 deletion cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -115,7 +115,6 @@ class [[deprecated("Use the executor API instead.")]] GptSession
std::optional<SizeType32> genMicroBatchSize = std::nullopt;
std::optional<executor::DecodingMode> decodingMode = std::nullopt;
bool normalizeLogProbs = true;
std::optional<std::filesystem::path> enginePath;
};

//! @brief Optional profiler class to profile the generation phase of an inference request
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -127,6 +127,7 @@ class ModelConfig
, mContextFMHA(false)
, mPagedContextFMHA(false)
, mUseXQA{false}
, mPpReduceScatter{false}
, mUseLoraPlugin(false)
, mMlpHiddenSize(0)
, mUseCrossAttention(false)
@@ -468,6 +469,16 @@ class ModelConfig
return mUseXQA;
}

void constexpr setPpReduceScatter(bool ppReduceScatter) noexcept
{
mPpReduceScatter = ppReduceScatter;
}

[[nodiscard]] bool constexpr getPpReduceScatter() const noexcept
{
return mPpReduceScatter;
}

[[nodiscard]] bool constexpr useLoraPlugin() const noexcept
{
return mUseLoraPlugin;
@@ -759,6 +770,7 @@ class ModelConfig
bool mContextFMHA;
bool mPagedContextFMHA;
bool mUseXQA;
bool mPpReduceScatter;

bool mUseLoraPlugin;
std::vector<LoraModule> mLoraModules;
30 changes: 24 additions & 6 deletions cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h
@@ -50,6 +50,11 @@ class SpeculativeDecodingMode
return SpeculativeDecodingMode{kExplicitDraftTokens};
}

static auto constexpr Eagle()
{
return SpeculativeDecodingMode{kEagle};
}

[[nodiscard]] bool constexpr isNone() const
{
return anyBitSet(kNone);
@@ -75,29 +80,34 @@
return anyBitSet(kExplicitDraftTokens);
}

[[nodiscard]] bool constexpr isEagle() const
{
return anyBitSet(kEagle);
}

[[nodiscard]] bool constexpr updatesPositionIds() const
{
return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr requiresAttentionMask() const
{
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr predictsDraftTokens() const
{
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr needsKVCacheRewind() const
{
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens);
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle);
}

[[nodiscard]] bool constexpr variableDraftLength() const
{
return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding);
return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding | kEagle);
}

[[nodiscard]] bool constexpr hasDraftLogits() const
@@ -107,7 +117,7 @@

[[nodiscard]] bool constexpr needsDecoderPrologue() const
{
return anyBitSet(kExplicitDraftTokens | kLookaheadDecoding);
return anyBitSet(kExplicitDraftTokens | kLookaheadDecoding | kEagle);
}

using UnderlyingType = std::uint8_t;
@@ -129,6 +139,7 @@
static UnderlyingType constexpr kMedusa{1U << 2U};
static UnderlyingType constexpr kLookaheadDecoding{1U << 3U};
static UnderlyingType constexpr kExplicitDraftTokens{1U << 4U};
static UnderlyingType constexpr kEagle{1U << 5U};

[[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const
{
@@ -173,4 +184,11 @@ static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isDraftTokensExter
static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isMedusa());
static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isLookaheadDecoding());

static_assert(SpeculativeDecodingMode::Eagle().isEagle());
static_assert(!SpeculativeDecodingMode::Eagle().isNone());
static_assert(!SpeculativeDecodingMode::Eagle().isDraftTokensExternal());
static_assert(!SpeculativeDecodingMode::Eagle().isMedusa());
static_assert(!SpeculativeDecodingMode::Eagle().isExplicitDraftTokens());
static_assert(!SpeculativeDecodingMode::Eagle().isLookaheadDecoding());

} // namespace tensorrt_llm::runtime
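Based only on the members visible in this header, the new Eagle mode participates in the same capability queries as the other draft-token modes. The checks below mirror the static_asserts above and would hold at compile time; the include directive assumes `cpp/include` is on the compiler's include path.

```cpp
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"

using tensorrt_llm::runtime::SpeculativeDecodingMode;

// Eagle now lights up the same capability bits as the other draft-token modes.
static_assert(SpeculativeDecodingMode::Eagle().updatesPositionIds());
static_assert(SpeculativeDecodingMode::Eagle().requiresAttentionMask());
static_assert(SpeculativeDecodingMode::Eagle().predictsDraftTokens());
static_assert(SpeculativeDecodingMode::Eagle().needsKVCacheRewind());
static_assert(SpeculativeDecodingMode::Eagle().variableDraftLength());
static_assert(SpeculativeDecodingMode::Eagle().needsDecoderPrologue());
```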
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
954182e0c057f71f858a84f746201044 libtensorrt_llm_batch_manager_static.a
dfe6ca360cf1d24a3dcae0a2bf8589c0 libtensorrt_llm_batch_manager_static.pre_cxx11.a
4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit
d7508bec7b6f112a2eac04cbeaf8b5da libtensorrt_llm_batch_manager_static.a
d8969624b327af844d9ffba910084b93 libtensorrt_llm_batch_manager_static.pre_cxx11.a
3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
61fd34e765788884d42f4ba27f085520 libtensorrt_llm_batch_manager_static.a
e8a64dd19a234304483ef6756e67fd40 libtensorrt_llm_batch_manager_static.pre_cxx11.a
4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit
7029ee9cb0a921a3603e98815da18985 libtensorrt_llm_batch_manager_static.a
0e7fe69b6621fe6dabcc0b372c3440f4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit