diff --git a/README.md b/README.md index 8f581897b..c2ab3f3bc 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ TensorRT-LLM [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads) [![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.14.0.dev-green)](./tensorrt_llm/version.py) +[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/) @@ -17,12 +17,15 @@ TensorRT-LLM
## Latest News -* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12 -[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0) +* [2024/10/07] 🚀🚀🚀Optimizing Microsoft Bing Visual Search with NVIDIA Accelerated Libraries +[➡️ link](https://developer.nvidia.com/blog/optimizing-microsoft-bing-visual-search-with-nvidia-accelerated-libraries/)
+* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12 +[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0) + * [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup [➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link) diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index b901a17bc..585579755 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -426,6 +426,7 @@ class Recorder void initialize() { mStart = std::chrono::steady_clock::now(); + mRequestsQueueingLatencies.clear(); } void finalize() @@ -433,6 +434,11 @@ class Recorder mEnd = std::chrono::steady_clock::now(); } + void recordQueueLatency(std::vector const& latencies) + { + mRequestsQueueingLatencies.insert(mRequestsQueueingLatencies.end(), latencies.begin(), latencies.end()); + } + void recordStart(std::shared_ptr request, uint64_t requestId) { auto const inputLength = request->getInputIds()->getSize(); @@ -677,6 +683,16 @@ class Recorder mMaxGenT2TLatency = genT2TLatencies.back(); mMinGenT2TLatency = genT2TLatencies.front(); } + + mAvgReqQueueingLatency + = std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F) + / mRequestsQueueingLatencies.size(); + std::sort(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end()); + mP99ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 99); + mP90ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 90); + mP50ReqQueueingLatency = calcPercentile(mRequestsQueueingLatencies, 50); + mMaxReqQueueingLatency = mRequestsQueueingLatencies.back(); + mMinReqQueueingLatency = mRequestsQueueingLatencies.front(); } } @@ -713,6 +729,13 @@ class Recorder printf("[BENCHMARK] p99_inter_token_latency(ms) %.2f\n", mP99GenT2TLatency); printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency); printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency); + + printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency); + printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency); + printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency); + printf("[BENCHMARK] p99_request_queueing_latency(ms) %.2f\n", mP99ReqQueueingLatency); + printf("[BENCHMARK] p90_request_queueing_latency(ms) %.2f\n", mP90ReqQueueingLatency); + printf("[BENCHMARK] p50_request_queueing_latency(ms) %.2f\n\n", mP50ReqQueueingLatency); } } @@ -820,6 +843,13 @@ class Recorder float mP50GenT2TLatency{}; float mMaxGenT2TLatency{}; float mMinGenT2TLatency{}; + float mAvgReqQueueingLatency{}; + float mP99ReqQueueingLatency{}; + float mP90ReqQueueingLatency{}; + float mP50ReqQueueingLatency{}; + float mMaxReqQueueingLatency{}; + float mMinReqQueueingLatency{}; + std::vector mRequestsQueueingLatencies{}; std::string mOpCsvFile; bool mStreaming; @@ -846,6 +876,7 @@ class ExecutorServer , mActiveCount(0) , mNumFinished(0) , mShutdown(false) + , mLogIterationData(logIterationData) { texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy); @@ -899,7 +930,9 @@ class ExecutorServer TLLM_LOG_ERROR("not a supported executor model type in executor server."); } - if (logIterationData) + auto const& world = tensorrt_llm::mpi::MpiComm::world(); + auto worldRank = world.getRank(); + if (worldRank == 0) { mCollectStatsThread = std::thread(&ExecutorServer::collectStats, this); } @@ -988,7 
+1021,18 @@ class ExecutorServer auto iterStats = mExecutor->getLatestIterationStats(); for (auto const& iterStat : iterStats) { - TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat)); + SizeType32 numNewActiveRequests = iterStat.numNewActiveRequests; + if (numNewActiveRequests > 0) + { + float avgQueueingTime + = static_cast(iterStat.newActiveRequestsQueueLatencyMS / numNewActiveRequests); + std::vector requestsQueueLatencyMS(numNewActiveRequests, avgQueueingTime); + mRecorder->recordQueueLatency(requestsQueueLatencyMS); + } + if (mLogIterationData) + { + TLLM_LOG_INFO(texec::JsonSerialization::toJsonStr(iterStat)); + } } auto const waitSleep = std::chrono::milliseconds(50); std::this_thread::sleep_for(waitSleep); @@ -1005,6 +1049,7 @@ class ExecutorServer std::atomic mActiveCount; std::atomic mNumFinished; std::atomic mShutdown; + bool mLogIterationData; }; // class ExecutorServer class GptServer diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 475970b7b..a323a15f9 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -201,6 +201,7 @@ class GenericLlmRequest , mDecodingIter(0) , mPriority(req.getPriority()) , mFinishReasons(mSamplingConfig.beamWidth) + , mEncoderInputFeatures(std::nullopt) , mEncoderOutputLength(req.getEncoderOutputLength()) , mContextPhaseParams(req.getContextPhaseParams()) , mInputTokenExtraIds(std::nullopt) @@ -263,7 +264,8 @@ class GenericLlmRequest auto pTuningConfig = req.getPromptTuningConfig(); if (pTuningConfig) { - mPromptEmbeddingTable = executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable()); + mPromptEmbeddingTable = tensorrt_llm::runtime::ITensor::view( + executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable())); TLLM_CHECK(mPromptEmbeddingTable.value()->getShape().nbDims == 2); mPromptVocabSize = mPromptEmbeddingTable.value()->getShape().d[0]; mPromptEmbeddingTable.value()->unsqueeze(0); @@ -1438,6 +1440,36 @@ class GenericLlmRequest 0.0, std::chrono::duration(mKvCacheTransferEnd - mKvCacheTransferStart).count()); } + void updateAllocTotalBlocksPerRequest(SizeType32 allocTotalBlocksPerRequest) + { + mAllocTotalBlocksPerRequest += allocTotalBlocksPerRequest; + } + + [[nodiscard]] SizeType32 getAllocTotalBlocksPerRequest() const + { + return mAllocTotalBlocksPerRequest; + } + + void updateAllocNewBlocksPerRequest(SizeType32 allocNewBlocksPerRequest) + { + mAllocNewBlocksPerRequest += allocNewBlocksPerRequest; + } + + [[nodiscard]] SizeType32 getAllocNewBlocksPerRequest() const + { + return mAllocNewBlocksPerRequest; + } + + void updateReusedBlocksPerRequest(SizeType32 reusedBlocksPerRequest) + { + mReusedBlocksPerRequest += reusedBlocksPerRequest; + } + + [[nodiscard]] SizeType32 getReusedBlocksPerRequest() const + { + return mReusedBlocksPerRequest; + } + RequestIdType mRequestId; SizeType32 mPromptLen; SizeType32 mMaxNewTokens; @@ -1545,6 +1577,10 @@ class GenericLlmRequest std::chrono::time_point mKvCacheTransferStart; std::chrono::time_point mKvCacheTransferEnd; + SizeType32 mAllocTotalBlocksPerRequest{0}; + SizeType32 mAllocNewBlocksPerRequest{0}; + SizeType32 mReusedBlocksPerRequest{0}; + private: void initialize(VecTokens const& inputTokens, bool outputLogProbs) { diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h index 5a8525caf..c9ff1e099 100644 --- a/cpp/include/tensorrt_llm/executor/types.h +++ 
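Note on the benchmark changes above: `IterationStats` now reports `numNewActiveRequests` next to `newActiveRequestsQueueLatencyMS`, the stats thread is started only on MPI world rank 0, JSON logging is gated behind the `logIterationData` flag, and `collectStats` credits every request that became active in an iteration with that iteration's average queueing time before the recorder sorts the samples and prints avg/max/min/p99/p90/p50. A standalone sketch of that accounting (simplified types; `percentile` is a stand-in for the benchmark's `calcPercentile`):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

// Standalone sketch: fold per-iteration executor stats into per-request queueing
// latencies, then report the same aggregates the benchmark prints. IterStatsLite
// and percentile() are simplified stand-ins for IterationStats / calcPercentile.
struct IterStatsLite
{
    int numNewActiveRequests;
    double newActiveRequestsQueueLatencyMS;
};

static float percentile(std::vector<float> const& sorted, int p)
{
    // expects ascending order; nearest-rank style pick as a simple stand-in
    auto const idx = sorted.size() * static_cast<std::size_t>(p) / 100;
    return sorted[std::min(idx, sorted.size() - 1)];
}

int main()
{
    std::vector<IterStatsLite> iters{{2, 30.0}, {1, 8.0}, {3, 90.0}};
    std::vector<float> queueLatencies;

    // Each request that became active in an iteration is credited with that
    // iteration's average queueing time, like recordQueueLatency in collectStats.
    for (auto const& it : iters)
    {
        if (it.numNewActiveRequests > 0)
        {
            auto const avg = static_cast<float>(it.newActiveRequestsQueueLatencyMS / it.numNewActiveRequests);
            queueLatencies.insert(queueLatencies.end(), it.numNewActiveRequests, avg);
        }
    }

    float const avgAll
        = std::accumulate(queueLatencies.begin(), queueLatencies.end(), 0.F) / queueLatencies.size();
    std::sort(queueLatencies.begin(), queueLatencies.end());
    std::printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", avgAll);
    std::printf("[BENCHMARK] p99_request_queueing_latency(ms) %.2f\n", percentile(queueLatencies, 99));
    std::printf("[BENCHMARK] p50_request_queueing_latency(ms) %.2f\n", percentile(queueLatencies, 50));
    return 0;
}
```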
b/cpp/include/tensorrt_llm/executor/types.h @@ -297,6 +297,8 @@ struct IterationStats double iterLatencyMS; /// @brief The total time spent in queue by the requests that became active in this iteration (ms) double newActiveRequestsQueueLatencyMS; + /// @brief Number of new fetched active requests + SizeType32 numNewActiveRequests; /// @brief Number of active requests SizeType32 numActiveRequests; /// @brief Number of queued requests @@ -364,6 +366,12 @@ struct RequestStats bool paused; /// @brief Stats specific to disaggregated serving std::optional disServingStats; + /// @brief Number of total allocated blocks per request + SizeType32 allocTotalBlocksPerRequest; + /// @brief Number of newly allocated blocks per request + SizeType32 allocNewBlocksPerRequest; + /// @brief Number of reused blocks per request + SizeType32 reusedBlocksPerRequest; }; /// @brief Struct that holds the stats of all requests in an iteration diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index 46cd19902..a4b8e4cc3 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -115,7 +115,6 @@ class [[deprecated("Use the executor API instead.")]] GptSession std::optional genMicroBatchSize = std::nullopt; std::optional decodingMode = std::nullopt; bool normalizeLogProbs = true; - std::optional enginePath; }; //! @brief Optional profiler class to profile the generation phase of an inference request diff --git a/cpp/include/tensorrt_llm/runtime/modelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h index b1b495e75..ce8985b56 100644 --- a/cpp/include/tensorrt_llm/runtime/modelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h @@ -127,6 +127,7 @@ class ModelConfig , mContextFMHA(false) , mPagedContextFMHA(false) , mUseXQA{false} + , mPpReduceScatter{false} , mUseLoraPlugin(false) , mMlpHiddenSize(0) , mUseCrossAttention(false) @@ -468,6 +469,16 @@ class ModelConfig return mUseXQA; } + void constexpr setPpReduceScatter(bool ppReduceScatter) noexcept + { + mPpReduceScatter = ppReduceScatter; + } + + [[nodiscard]] bool constexpr getPpReduceScatter() const noexcept + { + return mPpReduceScatter; + } + [[nodiscard]] bool constexpr useLoraPlugin() const noexcept { return mUseLoraPlugin; @@ -759,6 +770,7 @@ class ModelConfig bool mContextFMHA; bool mPagedContextFMHA; bool mUseXQA; + bool mPpReduceScatter; bool mUseLoraPlugin; std::vector mLoraModules; diff --git a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h index e739e8188..3b396122b 100644 --- a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h +++ b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h @@ -50,6 +50,11 @@ class SpeculativeDecodingMode return SpeculativeDecodingMode{kExplicitDraftTokens}; } + static auto constexpr Eagle() + { + return SpeculativeDecodingMode{kEagle}; + } + [[nodiscard]] bool constexpr isNone() const { return anyBitSet(kNone); @@ -75,29 +80,34 @@ class SpeculativeDecodingMode return anyBitSet(kExplicitDraftTokens); } + [[nodiscard]] bool constexpr isEagle() const + { + return anyBitSet(kEagle); + } + [[nodiscard]] bool constexpr updatesPositionIds() const { - return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens); + return anyBitSet(kLookaheadDecoding | kExplicitDraftTokens | kEagle); } [[nodiscard]] bool constexpr requiresAttentionMask() const { - return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens); + 
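The `llmRequest.h` and `executor/types.h` hunks above add per-request KV cache block accounting: `GenericLlmRequest` accumulates total-allocated, newly-allocated, and reused block counts through `update*`/`get*` pairs, and `RequestStats` exposes matching `allocTotalBlocksPerRequest`, `allocNewBlocksPerRequest`, and `reusedBlocksPerRequest` fields. A minimal standalone sketch of that accumulate-then-report pattern (simplified class and made-up numbers, not the library's types):

```cpp
#include <cstdio>

// Minimal stand-in for the per-request block counters added to GenericLlmRequest:
// update* accumulates what the KV cache manager reports at each step, get* is what
// ends up in the new RequestStats fields. Not the library's class; numbers made up.
class BlockCounters
{
public:
    void updateAllocTotalBlocks(int n) { mAllocTotalBlocks += n; }
    void updateAllocNewBlocks(int n) { mAllocNewBlocks += n; }
    void updateReusedBlocks(int n) { mReusedBlocks += n; }

    [[nodiscard]] int getAllocTotalBlocks() const { return mAllocTotalBlocks; }
    [[nodiscard]] int getAllocNewBlocks() const { return mAllocNewBlocks; }
    [[nodiscard]] int getReusedBlocks() const { return mReusedBlocks; }

private:
    int mAllocTotalBlocks{0};
    int mAllocNewBlocks{0};
    int mReusedBlocks{0};
};

int main()
{
    BlockCounters c;
    c.updateAllocTotalBlocks(4); // hypothetical context phase: 4 blocks, 1 fresh, 3 reused
    c.updateAllocNewBlocks(1);
    c.updateReusedBlocks(3);
    c.updateAllocTotalBlocks(1); // a generation step allocates one more fresh block
    c.updateAllocNewBlocks(1);
    std::printf("total=%d new=%d reused=%d\n", c.getAllocTotalBlocks(), c.getAllocNewBlocks(), c.getReusedBlocks());
    return 0;
}
```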
return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle); } [[nodiscard]] bool constexpr predictsDraftTokens() const { - return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens); + return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle); } [[nodiscard]] bool constexpr needsKVCacheRewind() const { - return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens); + return anyBitSet(kLookaheadDecoding | kMedusa | kExplicitDraftTokens | kEagle); } [[nodiscard]] bool constexpr variableDraftLength() const { - return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding); + return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding | kEagle); } [[nodiscard]] bool constexpr hasDraftLogits() const @@ -107,7 +117,7 @@ class SpeculativeDecodingMode [[nodiscard]] bool constexpr needsDecoderPrologue() const { - return anyBitSet(kExplicitDraftTokens | kLookaheadDecoding); + return anyBitSet(kExplicitDraftTokens | kLookaheadDecoding | kEagle); } using UnderlyingType = std::uint8_t; @@ -129,6 +139,7 @@ class SpeculativeDecodingMode static UnderlyingType constexpr kMedusa{1U << 2U}; static UnderlyingType constexpr kLookaheadDecoding{1U << 3U}; static UnderlyingType constexpr kExplicitDraftTokens{1U << 4U}; + static UnderlyingType constexpr kEagle{1U << 5U}; [[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const { @@ -173,4 +184,11 @@ static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isDraftTokensExter static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isMedusa()); static_assert(!SpeculativeDecodingMode::ExplicitDraftTokens().isLookaheadDecoding()); +static_assert(SpeculativeDecodingMode::Eagle().isEagle()); +static_assert(!SpeculativeDecodingMode::Eagle().isNone()); +static_assert(!SpeculativeDecodingMode::Eagle().isDraftTokensExternal()); +static_assert(!SpeculativeDecodingMode::Eagle().isMedusa()); +static_assert(!SpeculativeDecodingMode::Eagle().isExplicitDraftTokens()); +static_assert(!SpeculativeDecodingMode::Eagle().isLookaheadDecoding()); + } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index d04677b80..ddf6e4806 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a292517d802f2297c5d12d5d14ab597f47f46ebd31412fac044ceb9ca51a482 -size 5160586 +oid sha256:a55035628e0035141b4ea79b946f49ad77893d6e5d1ab47c402e1a9b95fbbb6c +size 5160128 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 462c03949..850e53457 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8575fb58200701ae30feb4b8bd3f325f8018aac5505167fdba42e269adb3bd8c -size 5271836 +oid sha256:ed219fad83caf000a40f0688fdb20cb8593a5fe8096316d645229ee160c42514 +size 5271480 diff --git 
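The `SpeculativeDecodingMode` hunks above wire EAGLE in as a new bit (`kEagle = 1U << 5U`) and extend the capability masks (`updatesPositionIds`, `requiresAttentionMask`, `needsKVCacheRewind`, `variableDraftLength`, `needsDecoderPrologue`). A reduced standalone sketch of that bit-flag pattern (fewer modes and trimmed masks, not the real class):

```cpp
#include <cstdint>

// Reduced sketch of the SpeculativeDecodingMode bit-flag pattern: every mode is one
// bit, capability queries OR together the bits that share a behaviour, so adding
// EAGLE only means defining a new bit and extending the relevant masks. Fewer modes
// and trimmed masks here; the real class lives in speculativeDecodingMode.h.
class SpecDecMode
{
public:
    static constexpr SpecDecMode Medusa() { return SpecDecMode{kMedusa}; }
    static constexpr SpecDecMode Eagle() { return SpecDecMode{kEagle}; }

    [[nodiscard]] constexpr bool isEagle() const { return anyBitSet(kEagle); }

    [[nodiscard]] constexpr bool needsKVCacheRewind() const
    {
        // trimmed mask; the real one also includes lookahead and explicit draft tokens
        return anyBitSet(kMedusa | kEagle);
    }

private:
    using Underlying = std::uint8_t;
    static constexpr Underlying kMedusa{1U << 0U};
    static constexpr Underlying kEagle{1U << 1U};

    constexpr explicit SpecDecMode(Underlying state) : mState(state) {}

    [[nodiscard]] constexpr bool anyBitSet(Underlying bits) const { return (mState & bits) != 0U; }

    Underlying mState;
};

static_assert(SpecDecMode::Eagle().isEagle());
static_assert(SpecDecMode::Eagle().needsKVCacheRewind());
static_assert(!SpecDecMode::Medusa().isEagle());

int main()
{
    return 0;
}
```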
a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index aff5e53bd..1d38b0ca3 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -954182e0c057f71f858a84f746201044 libtensorrt_llm_batch_manager_static.a -dfe6ca360cf1d24a3dcae0a2bf8589c0 libtensorrt_llm_batch_manager_static.pre_cxx11.a -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +d7508bec7b6f112a2eac04cbeaf8b5da libtensorrt_llm_batch_manager_static.a +d8969624b327af844d9ffba910084b93 libtensorrt_llm_batch_manager_static.pre_cxx11.a +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 4e5be000e..d11b40f7c 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fe84073b7ccff8dc361fdee64c3ef30bc523909e0bf9c16547f76a05a53fb5c -size 5009886 +oid sha256:36479d1577d131e36ca03549467a6cfe4822868ca0f3dda3b5d254ee4680341f +size 5009646 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 46d8c1b5c..a1485d52a 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e565c2c3ce58656742772591d992aca91c7e46eb9fc711599d2d51928b88b48 -size 4970532 +oid sha256:b5caef410133f1552418978aa20cc1d3f7b6500b1dbc8b9f44232554b7cc8390 +size 4971234 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt index 2c9c2852f..89f9c2b21 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -61fd34e765788884d42f4ba27f085520 libtensorrt_llm_batch_manager_static.a -e8a64dd19a234304483ef6756e67fd40 libtensorrt_llm_batch_manager_static.pre_cxx11.a -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +7029ee9cb0a921a3603e98815da18985 libtensorrt_llm_batch_manager_static.a +0e7fe69b6621fe6dabcc0b372c3440f4 libtensorrt_llm_batch_manager_static.pre_cxx11.a +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index d1664c2e8..42a6fe97d 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:200a6721aa1d6e009c94866adab36ac686eb1beef02df267af7e18e31e11612b -size 32436708 +oid sha256:b86e215e86c7b0f8b0c9618fb655e6e4f31cc731f778cf0ca12fde93c7afbcab +size 
32389592 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt index 45482c43b..0679a9114 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -9485cfa635b17378f23d1624b3acfbaf tensorrt_llm_batch_manager_static.lib -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +afac175cfda36b14d76e17517bad8b24 tensorrt_llm_batch_manager_static.lib +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h index a4f80dc6f..f81961dee 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h @@ -92,7 +92,7 @@ template < typename Policy_, /// Number of stages, int Stages, - /// Converter for B matrix applited immediately after the LDS + /// Converter for B matrix applied immediately after the LDS typename TransformBAfterLDS_, /// The quantization operator being used WeightOnlyQuantOp QuantOp_, diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index 26b60736a..2fd74350e 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:809a1da76123ec4c640d63efc902209585223b66e23d887db9a198c5836986a2 -size 3349066 +oid sha256:414606be5b56f592fc7bd25f1e9fbf958c900dd2b01e01907029dfe19408ce59 +size 3349230 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 007fa3207..095132fac 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6846ecefa017d03ab7d853908794c884ab4e92a500e223278b1d64eab59ed061 -size 3376088 +oid sha256:682cf952def054fce6116983a3b5686994b71744fcc85a65e3c9a6e44549c82d +size 3377832 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index 4a30230b9..e73a6e86b 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -5a771664fdb75d99ba5fb90249ac26f0 libtensorrt_llm_executor_static.a -3b433ea93b7d1d6fa471b457980f2680 libtensorrt_llm_executor_static.pre_cxx11.a -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +dc9b4081af6357227886180a1b9a6d8d libtensorrt_llm_executor_static.a +8291552cf3e8da9dc368c8c37cd35abe libtensorrt_llm_executor_static.pre_cxx11.a +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git 
a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index 7584b1fe6..5d0776dfa 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:479e86f410763445357f5d879cc666d210352dda9709ab5ab56e73591a9e8af8 -size 7851266 +oid sha256:88810c1dac205a1111fc833c0fe0d38486152b4b878fd972585eec2ac27d5160 +size 7857242 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 0f764244d..425d255fc 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6473c77d18929fa75342d63ffc591df39e8aeba1dda0b920b0187d4888710559 -size 7767384 +oid sha256:c023d6bad569fb3b3c528f3e003afa6a5f11a045bdccb06ca875607a6c781ade +size 7769728 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt index 4baf60ba7..9ff444cfe 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -5424fb0f82076e03b5316f73aed04434 libtensorrt_llm_executor_static.a -d0b1236baf61fc5c43383bbc1cd50fa8 libtensorrt_llm_executor_static.pre_cxx11.a -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +fd9cb10c300350266f65957475404bff libtensorrt_llm_executor_static.a +b8b0ae2861ef66853330441752ab1e32 libtensorrt_llm_executor_static.pre_cxx11.a +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index efd7ecf87..f9e5e12f7 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dee57c9257a6678833e3c0d83e8df07aff25c185bc085db75938cec6652044c0 -size 24568210 +oid sha256:baf4dd1bacd75c4eae6d98fe411bbb5d478dc5905a298d4238db3db21121ebca +size 24630026 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt index 681dc3284..f46f09905 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -305fac5d046a574ded2d46d968f746b0 tensorrt_llm_executor_static.lib -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +30d62c80211e4a2dc38bbe9dc5257839 tensorrt_llm_executor_static.lib +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl index 1a0f6bc65..126e761ec 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl @@ -74,7 +74,6 @@ void sm80_generic_fused_moe_gemm_kernelLauncher(ElementType_ const* A, CutlassWe int occupancy = std::min(2, fused_moe::fused_gemm_maximum_active_blocks()); int const threadblock_count = multi_processor_count * occupancy; TLLM_CHECK_WITH_INFO(occupancy > 0, "GPU lacks the shared memory resources to run fused_moe kernel"); - GemmType gemm; using Arguments = typename GemmType::Arguments; Arguments args{{const_cast(A), const_cast(B), const_cast(biases), reinterpret_cast(C), total_tokens_including_expert, static_cast(gemm_n), diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index 92ae4d99b..c19ceafee 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt index e2ce46ae4..9fa1f5280 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll index 3f82a0827..ccb5cdd40 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db512d533ab4e4a4abd0047a65d891dfd6e1522f2d34c90f29296c3239fd3cc1 +oid sha256:3bc495e1e677616db2756eb7d56d1161c34ae723896db34487883a955e2b3442 size 1128448 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib index cfe4399d6..eb4782449 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e207a8f57b944529163c7ed2ab30639a5f2779c5118602c6ebd50a623d16f845 +oid sha256:1a6c03470aaa69378d4989971ab9dd00ee427f7e14a85ba5e114ea0594c4de5e size 3488 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt index 465df4be7..1f123d67b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt @@ -1,3 +1,3 @@ -b7e624ba775e9f5090ef4b67bcdbd7a2 tensorrt_llm_nvrtc_wrapper.lib -d89a0a140d2d427af13c3794a4b21e2c tensorrt_llm_nvrtc_wrapper.dll -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +c5f36e093e875c8ea84523fb1566d986 tensorrt_llm_nvrtc_wrapper.lib +de4b2f87f8eb1027f89c0f5cb05ca047 tensorrt_llm_nvrtc_wrapper.dll +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 70cc1d3d6..6b5ab2887 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0814af36fed752bbe70d953cefbb78dd306c42f3d9f6848b7043a865e48f9662 +oid sha256:80dbb6e3a34380bf4e375901ad9b71df24ec97cddcaa9f226bc0a278d11cbdd6 size 25364090 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index 84879c280..2910af2e3 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee46f2d1c9162f4302a1031f778fcb7c7110c84110427f97af6532ed9bd342fd +oid sha256:31e5cd6ef9e3599d55501ab0484b81f82ef1f22a79360a2699cd4a62c4928115 size 25768990 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index 736fddd4a..8c8438147 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -90740ead1def66f350e14c133278463d libtensorrt_llm_internal_cutlass_kernels_static.a -b0104227ffd1ce19fc1fdb45e349df36 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file 
+1febd9d1bf244163deb269e2bebcd1e3 libtensorrt_llm_internal_cutlass_kernels_static.a +8fdb39f871225dedd32ca6651f1944ba libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 573caf92e..3ac157472 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d9ba0f8b95cf64227cb0b17654fb7c9bc1741fe003889658b305750b388a4dc +oid sha256:3431f91bcb2cadb8a2641c4ea54d1f8f90c5aa7648591510e3a27865c94169ea size 44173632 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index daa8557bd..f9ab0f6e1 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f848d5beebbd69792047a96b16f7145f8e1e3e311d2a19789ce639ad8149b0e +oid sha256:1dedd4dd1df76a57576e749b4105a5d5f5070a6f7ee30d11944105742fea9b4b size 43561206 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index 0c0c38e19..69baedf76 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -2aaf05cb84f52b024e89d4fa634d6900 libtensorrt_llm_internal_cutlass_kernels_static.a -f17ce186e9105c594e39d252777ce4c7 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +8683b15e77bf62ee9f57a2507e21e6a7 libtensorrt_llm_internal_cutlass_kernels_static.a +a065a7b6a11b079ee544664dddcf59a6 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib index 5aa0009ca..00c671277 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c429687e335c75f08186bcd8f629b50467cb0f2e484d755834c5b1cdbb9ecaf3 -size 88140796 +oid sha256:c7afdf2c313685b0e31f4e5572e20cd11d94227177849784ce7405e15a3587f6 +size 88140804 diff --git 
a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt index e14aff7e8..889c9577f 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -4f663be2b768088805ccec6dc33545fc tensorrt_llm_internal_cutlass_kernels_static.lib -4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file +7eee845e969cfb8d589074d81288b700 tensorrt_llm_internal_cutlass_kernels_static.lib +3eeadd9a4a9ca2558b3a2f2089419f8d285744e5 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 04edc841a..8811c484e 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -48,7 +48,7 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr auto const tokenIdx = static_cast(blockIdx.y); auto const batchId = bid / BLOCKS_PER_BEAM_; // row id for logProbs - auto const batchSlot = batchSlots[batchId]; + auto const batchSlot = batchSlots == nullptr ? batchId : batchSlots[batchId]; if (tokensPerStep != nullptr && tokenIdx >= tokensPerStep[batchSlot]) { return; @@ -63,7 +63,6 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr auto const logBufIndex = batchId * maxTokensPerStep * vocabSize + tokenIdx * vocabSize; auto logProbsSlot = logProbsPtrs == nullptr ? logProbs + logBufIndex : logProbsPtrs[batchId * maxTokensPerStep + tokenIdx]; - auto const blockLane = bid % BLOCKS_PER_BEAM_; // block id for a beam auto const k = (topKs != nullptr) ? topKs[batchSlot] : maxTopK; // batchId = batch index @@ -77,7 +76,7 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr if (finished != nullptr && finishState.isFinished()) { - if (tid < k) + if (tid < k && endIds != nullptr) // if returnAllSelectedToken, endIds would not be an input { auto const index = tmpTopKBufIndex + tid; if (blockLane == 0 && tid == 0) @@ -134,7 +133,7 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* float const* topPs, curandState_t* curandState, TokenIdType const* endIds, SizeType32 vocabSize, bool const* skipDecode, SizeType32 const* batchSlots, SizeType32 maxBatchSize, bool normalizeLogProbs, bool logitHasProbs, SizeType32 const* tokensPerStep, SizeType32 maxTokensPerStep, SizeType32 maxSeqLen, - bool returnAllTopK) + bool returnAllSelectedTokens) { bool const IS_FP16 = std::is_same::value; T const MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; @@ -142,7 +141,7 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* auto const tid = static_cast(threadIdx.x); auto const batchIdx = static_cast(blockIdx.x); auto const tokenIdx = static_cast(blockIdx.y); - auto const batchSlot = batchSlots[batchIdx]; + auto const batchSlot = batchSlots == nullptr ? batchIdx : batchSlots[batchIdx]; FinishedState const finishState = finishedInput != nullptr ? 
finishedInput[batchSlot] : FinishedState::empty(); if ((skipDecode != nullptr && skipDecode[batchSlot]) || (finishState.isSkipDecoding())) { @@ -215,13 +214,16 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* if (tid == 0) { - auto randNum = static_cast(curand_uniform(curandState + batchSlot) * probThreshold * sSum); + // if we want to return all top k indices, we should not do random sampling for probThreshold + auto randNum = (returnAllSelectedTokens || curandState == nullptr) + ? static_cast(probThreshold * sSum) + : static_cast(curand_uniform(curandState + batchSlot) * probThreshold * sSum); auto* outputIdsRequestPtr = idsPtrs == nullptr ? ids + batchSlot * maxSeqLen : idsPtrs[batchSlot]; for (SizeType32 ki = 0; ki < k; ki++) { auto expLogit = sVal2[ki]; randNum = randNum - expLogit; - if (randNum <= 0.0f || ki == k - 1 || returnAllTopK) + if (randNum <= 0.0f || ki == k - 1 || returnAllSelectedTokens) { auto idx = sId[ki]; // If sId is -1 here we force output token to the last from vocabulary to get vivid indicator of smth @@ -230,10 +232,10 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* ? topKTmpIdBuf[(batchIdx * maxTokensPerStep + tokenIdx) * stride + idx] % vocabSize : vocabSize - 1; auto const curSeqLen = sequenceLengths == nullptr ? 0 : sequenceLengths[batchSlot]; - auto const outIdx = returnAllTopK ? tokenIdx * maxTopK + ki : curSeqLen + tokenIdx; + auto const outIdx = returnAllSelectedTokens ? tokenIdx * maxTopK + ki : curSeqLen + tokenIdx; outputIdsRequestPtr[outIdx] = outputId; - // cum log prob is not supported with returnAllTopK - if (!returnAllTopK) + // cum log prob is not supported with returnAllSelectedTokens + if (!returnAllSelectedTokens) { if (cumLogProbs != nullptr || outputLogProbs != nullptr) { @@ -255,9 +257,17 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* } break; } + if (returnAllSelectedTokens && randNum <= 0.0f) + { + if (ki < k - 1) + { // not the last k, write a -1 to to log top p tokens boundary for external draft token masking + outputIdsRequestPtr[outIdx + 1] = -1; + } + break; + } } } - if (maxTokensPerStep == 1 && !returnAllTopK && sequenceLengths != nullptr && finishedOutput != nullptr + if (maxTokensPerStep == 1 && !returnAllSelectedTokens && sequenceLengths != nullptr && finishedOutput != nullptr && endIds != nullptr) { auto const seqLen = sequenceLengths[batchSlot]; @@ -297,7 +307,7 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* params.maxTopK, params.topKs, params.maxTopP, params.topPs, params.curandState, params.endIds, \ params.vocabSizePadded, params.skipDecode, params.batchSlots, params.maxBatchSize, \ params.normalizeLogProbs, params.logitsHasProbs, params.tokensPerStep, params.maxTokensPerStep, \ - params.maxSeqLen, params.returnAllTopK); \ + params.maxSeqLen, params.returnAllSelectedTokens); \ } \ } while (0) diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h index 0330cad31..c14e73ab9 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h @@ -34,8 +34,8 @@ struct TopKSamplingKernelParams //! Log probabilities of each token in the vocab. If logitsHasProbs is true, //! logProbs must contain **just** probabilities instead of log probabilities. T const* logProbs{nullptr}; - //! input buffer [batchSize][vocabSizePadded] array of pointers to logits. - //! If nullptr, logProbs is used. 
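On the `samplingTopKKernels.cu` changes above: with the flag renamed to `returnAllSelectedTokens`, `topKStage2Sampling` skips the curand draw (using `probThreshold * sSum` directly), emits every selected id, and writes a `-1` right after the last id inside the top-P mass so the external-draft-token masking kernel can locate the boundary; `batchSlots`, `endIds`, and `curandState` also become optional. A CPU sketch of that selection loop (standalone helper, not the kernel's signature):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// CPU sketch of the selection loop in topKStage2Sampling after this change: with
// returnAllSelectedTokens set, no random number is drawn (the threshold p * sum is
// used directly), every selected id is emitted, and a -1 terminator is written once
// the cumulative mass crosses the top-P boundary. Standalone helper, simplified types.
std::vector<int> selectTopKTokens(std::vector<float> const& sortedProbs, std::vector<int> const& sortedIds,
    float topP, bool returnAllSelectedTokens, float randUniform /* in [0,1), unused when returning all */)
{
    float sum = 0.F;
    for (float p : sortedProbs)
    {
        sum += p;
    }

    std::vector<int> out;
    float randNum = returnAllSelectedTokens ? topP * sum : randUniform * topP * sum;
    for (std::size_t ki = 0; ki < sortedProbs.size(); ++ki)
    {
        randNum -= sortedProbs[ki];
        if (randNum <= 0.F || ki == sortedProbs.size() - 1 || returnAllSelectedTokens)
        {
            out.push_back(sortedIds[ki]);
            if (!returnAllSelectedTokens)
            {
                break; // sampling mode: exactly one token
            }
            if (randNum <= 0.F)
            {
                if (ki < sortedProbs.size() - 1)
                {
                    out.push_back(-1); // boundary marker: ids past the top-P mass follow
                }
                break;
            }
        }
    }
    return out;
}

int main()
{
    std::vector<float> probs{0.5F, 0.3F, 0.15F, 0.05F};
    std::vector<int> ids{7, 42, 3, 99};
    auto const all = selectTopKTokens(probs, ids, 0.7F, /*returnAllSelectedTokens=*/true, 0.F);
    for (int id : all)
    {
        std::printf("%d ", id); // prints: 7 42 -1
    }
    std::printf("\n");
    return 0;
}
```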
Only maxTokensPerStep == 1 is supported. + //! input buffer [batchSize][tokensPerStep, vocabSizePadded] array of pointers to logits. + //! If nullptr, logProbs is used. T const* const* logProbsPtrs{nullptr}; //! output buffer [maxBatchSize][maxSeqLen], optional. Contains pointers to rows @@ -82,7 +82,8 @@ struct TopKSamplingKernelParams //! Ignored if nullptr. float* outputLogProbs{nullptr}; - //! input buffer [maxBatchSize]. Initialized curand states + //! input buffer [maxBatchSize], optional. Initialized curand states. + //! If nullptr, 1 is always used. curandState_t* curandState{nullptr}; //! input buffer [maxBatchSize]. K for topK sampling per request. //! Supported K is in range [1; 1024]. Where K=1 is greedy search. @@ -106,8 +107,8 @@ struct TopKSamplingKernelParams bool normalizeLogProbs{false}; //! flag to highlight that logProbs contains probabilities bool logitsHasProbs{false}; - //! flag to return all selectedTopK results - bool returnAllTopK{false}; + //! flag to return all selected TopK results + bool returnAllSelectedTokens{false}; void checkParams() const { @@ -131,13 +132,12 @@ struct TopKSamplingKernelParams } TLLM_CHECK(workspace); - TLLM_CHECK(curandState); - TLLM_CHECK(maxTokensPerStep != 1 || returnAllTopK || sequenceLengths); - TLLM_CHECK(maxTokensPerStep != 1 || returnAllTopK || endIds); + TLLM_CHECK(maxTokensPerStep != 1 || returnAllSelectedTokens || sequenceLengths); + TLLM_CHECK(maxTokensPerStep != 1 || returnAllSelectedTokens || endIds); if (cumLogProbs != nullptr || outputLogProbs != nullptr) { - TLLM_CHECK(maxTokensPerStep == 1 && !returnAllTopK); + TLLM_CHECK(maxTokensPerStep == 1 && !returnAllSelectedTokens); } TLLM_CHECK(((finishedOutput == nullptr) ^ (endIds == nullptr)) == 0); diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu index 5605dbcea..23a8db7bb 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu @@ -200,7 +200,7 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId SizeType32* sequenceLength, FinishedState const* finishedInput, FinishedState* finishedOutput, float* cumLogProbs, float* outputLogProbs, SizeType32 const* beginOffsetBuf, SizeType32 const* offsetBuf, SizeType32 vocabSize, curandState_t* curandState, float const* topPs, TokenIdType const* endIds, SizeType32 maxBatchSize, - bool const* skipDecode, SizeType32 const* batchSlots, bool returnAllTopP, SizeType32 maxSeqLen) + bool const* skipDecode, SizeType32 const* batchSlots, bool returnAllSelectedTokens, SizeType32 maxSeqLen) { /** * Each block processes one request row sorted in descending order by probabilities. @@ -244,7 +244,7 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId if (threadIdx.x == 0) { // if we want to return all top p indices, we should not do random sampling for probThreshold - randNumS = returnAllTopP ? probThreshold : curand_uniform(curandState + blockIdx.x) * probThreshold; + randNumS = returnAllSelectedTokens ? 
probThreshold : curand_uniform(curandState + blockIdx.x) * probThreshold; } // if beginOffsetBuf and offsetBuf of sorting have same value, @@ -255,7 +255,7 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId if (tid == 0) { auto offset = batchId * vocabSize; - if (returnAllTopP) + if (returnAllSelectedTokens) { outputIdsRequestPtr[currentStep] = sortedIdVals[offset]; } @@ -294,7 +294,7 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId } } - if (returnAllTopP) + if (returnAllSelectedTokens) { __shared__ SizeType32 sharedSelectedTokenId; if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) @@ -403,7 +403,7 @@ void invokeBatchTopPSampling(TopPSamplingKernelParams const& params, cudaStre params.outputIds, params.outputIdsPtrs, params.sequenceLength, params.finishedInput, params.finishedOutput, params.cumLogProbs, params.outputLogProbs, beginOffsetBuf, offsetBuf + 1, params.vocabSizePadded, params.curandState, params.topPs, params.endIds, params.maxBatchSize, params.skipDecode, params.batchSlots, - params.returnAllTopP, params.maxSeqLen); + params.returnAllSelectedTokens, params.maxSeqLen); sync_check_cuda_error(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.h b/cpp/tensorrt_llm/kernels/samplingTopPKernels.h index 2ab025ba0..639d7d4d6 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.h @@ -80,7 +80,7 @@ struct TopPSamplingKernelParams runtime::SizeType32 vocabSizePadded{-1}; runtime::SizeType32 maxSeqLen{-1}; - bool returnAllTopP{false}; + bool returnAllSelectedTokens{false}; void checkParams() const { @@ -91,7 +91,7 @@ struct TopPSamplingKernelParams TLLM_CHECK(probs); TLLM_CHECK(outputIds || outputIdsPtrs); TLLM_CHECK(workspace); - TLLM_CHECK((sequenceLength != nullptr) || returnAllTopP); + TLLM_CHECK((sequenceLength != nullptr) || returnAllSelectedTokens); TLLM_CHECK(curandState); TLLM_CHECK(topPs); diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu new file mode 100644 index 000000000..b03b674fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaTypeUtils.cuh" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/common/reduceKernelUtils.cuh" +#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h" +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
+#elif (CUDART_VERSION >= 11050) +#include +#else +#include "3rdparty/cub/cub.cuh" +#endif + +using namespace tensorrt_llm::common; +using namespace tensorrt_llm::runtime; + +namespace tensorrt_llm::kernels::speculative_decoding +{ +namespace +{ +template +__global__ void assembleTargetLogitsOffsets(T const** logitsPtrs, SizeType32* decodingTokens, T const* logits, + SizeType32 const* draftDecodingTokens, SizeType32 batchSize, SizeType32 maxDecodingTokens, + SizeType32 vocabSizePadded) +{ + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage tempStorage; + + auto const tix = static_cast(threadIdx.x); + + SizeType32 numDecodingTokens{0}; + if (tix < batchSize) + { + numDecodingTokens = draftDecodingTokens[tix] + 1; + decodingTokens[tix] = numDecodingTokens; + } + + SizeType32 logitsOffset{0}; + BlockScan(tempStorage).ExclusiveSum(numDecodingTokens, logitsOffset); + + if (tix < batchSize) + { + for (SizeType32 ti = 0; ti < numDecodingTokens; ++ti) + { + logitsPtrs[tix * maxDecodingTokens + ti] = logits + (logitsOffset + ti) * vocabSizePadded; + } + } +} +} // namespace + +template +void invokeAssembleTargetLogitsOffsets(T const** logitsPtrs, SizeType32* decodingTokens, T const* logits, + SizeType32 const* draftDecodingTokens, SizeType32 batchSize, SizeType32 maxDecodingTokens, + SizeType32 vocabSizePadded, cudaStream_t stream) +{ + SizeType32 constexpr BLOCK_SIZE = 512; + TLLM_CHECK_WITH_INFO( + batchSize <= BLOCK_SIZE, "Batch size larger than %d is not supported for EAGLE yet", batchSize); + assembleTargetLogitsOffsets<<<1, BLOCK_SIZE, 0, stream>>>( + logitsPtrs, decodingTokens, logits, draftDecodingTokens, batchSize, maxDecodingTokens, vocabSizePadded); + + sync_check_cuda_error(); +} + +template void invokeAssembleTargetLogitsOffsets(float const** logitsPtrs, SizeType32* decodingTokens, + float const* logits, SizeType32 const* draftDecodingTokens, SizeType32 batchSize, SizeType32 maxDecodingTokens, + SizeType32 vocabSizePadded, cudaStream_t stream); +template void invokeAssembleTargetLogitsOffsets(__half const** logitsPtrs, SizeType32* decodingTokens, + __half const* logits, SizeType32 const* draftDecodingTokens, SizeType32 batchSize, SizeType32 maxDecodingTokens, + SizeType32 vocabSizePadded, cudaStream_t stream); + +namespace +{ +template +__global__ void selectLastAccTokenAndComputeIndicesCumSum(TokenIdType* lastAcceptedTokenIds, + SizeType32* exclusiveSumLastAcceptedIndices, SizeType32 const* draftDecodingTokens, + TokenIdType const* acceptedTokenIds, SizeType32 const* acceptedLengths, SizeType32 const* bestPathIds, + SizeType32 const* paths, SizeType32 batchSize, SizeType32 maxDecodingTokens, SizeType32 maxPathLen) +{ + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage tempStorage; + + auto const tix = static_cast(threadIdx.x); + SizeType32 decodingTokens{0}; + SizeType32 lastTokenId{0}; + if (tix < batchSize) + { + auto const acceptedLen = acceptedLengths[tix]; + lastAcceptedTokenIds[tix] = acceptedTokenIds[tix * maxPathLen + acceptedLen - 1]; + auto const bestPathId = bestPathIds[tix]; + auto const pathIdx = flat_index3(tix, bestPathId, acceptedLen - 1, maxDecodingTokens, maxPathLen); + lastTokenId = paths[pathIdx]; + decodingTokens = draftDecodingTokens[tix] + 1; + } + + BlockScan(tempStorage).ExclusiveSum(decodingTokens, decodingTokens); + + if (tix < batchSize) + { + exclusiveSumLastAcceptedIndices[tix] = decodingTokens + lastTokenId; + } +} +} // namespace + +void 
invokeSelectLastAccTokenAndComputeIndicesCumSum(TokenIdType* lastAcceptedTokenIds, + SizeType32* exclusiveSumLastAcceptedIndices, SizeType32 const* draftDecodingTokens, + TokenIdType const* acceptedTokenIds, SizeType32 const* acceptedLengths, SizeType32 const* bestPathIds, + SizeType32 const* paths, SizeType32 batchSize, SizeType32 maxDecodingTokens, SizeType32 maxPathLen, + cudaStream_t stream) +{ + SizeType32 constexpr BLOCK_SIZE = 512; + TLLM_CHECK_WITH_INFO( + batchSize <= BLOCK_SIZE, "Batch size larger than %d is not supported for EAGLE yet", batchSize); + selectLastAccTokenAndComputeIndicesCumSum<<<1, BLOCK_SIZE, 0, stream>>>(lastAcceptedTokenIds, + exclusiveSumLastAcceptedIndices, draftDecodingTokens, acceptedTokenIds, acceptedLengths, bestPathIds, paths, + batchSize, maxDecodingTokens, maxPathLen); +} + +} // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h new file mode 100644 index 000000000..b8e1430eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/kernels/decodingCommon.h" +#include "tensorrt_llm/kernels/speculativeDecoding/common.h" +#include "tensorrt_llm/runtime/common.h" +#include +#include +#include + +namespace tensorrt_llm::kernels::speculative_decoding +{ + +//! \brief Sets pointers to logits in logitsPtrs according to the draftDecodingTokens. +//! \param logitsPtrs [batchSize][vocabSizePadded] +//! \param decodingTokens [batchSize], on GPU. draftDecodingTokens + 1. +//! \param logits [numTokens, vocabSizePadded], on GPU. Continuous logits in memory. +//! \param draftDecodingTokens [batchSize], on GPU. 0 for context requests, and actual draft len for gen requests +//! \param batchSize batch size. Only batch size <= 512 is supported at the moment +//! \param maxDecodingTokens maximum number of decoding tokens per step per request +//! \param vocabSizePadded vocab size of the logits +//! \param stream cuda stream +template +void invokeAssembleTargetLogitsOffsets(T const** logitsPtrs, runtime::SizeType32* decodingTokens, T const* logits, + runtime::SizeType32 const* draftDecodingTokens, runtime::SizeType32 batchSize, + runtime::SizeType32 maxDecodingTokens, runtime::SizeType32 vocabSizePadded, cudaStream_t stream); + +//! \brief Sets last accepted token ids and computes inclusive sum of the indices of the last accepted tokens in +//! flattened input_ids tensor. +//! \param lastAcceptedTokenIds [batchSize], on GPU. Token ids of the last accepted tokens. +//! \param exclusiveSumLastAcceptedIndices [batchSize], on GPU. Exclusive sum of the positions of the last accepted +//! tokens in the original flattened draft sequence. +//! \param draftDecodingTokens [batchSize], on GPU. 0 for context +//! 
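On the new EAGLE kernels above: `assembleTargetLogitsOffsets` uses a cub `BlockScan` exclusive sum over per-request token counts (`draftDecodingTokens[i] + 1`) to point each (request, token) slot of `logitsPtrs` at the right row of the flattened logits buffer. A CPU reference of that computation (host-side sketch, not the device kernel):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// CPU reference for what assembleTargetLogitsOffsets does with a cub BlockScan
// exclusive sum: request i decodes draftDecodingTokens[i] + 1 tokens, its rows start
// at the exclusive prefix sum of those counts inside the flat
// [numTokens, vocabSizePadded] logits buffer, and logitsPtrs gets one pointer per
// (request, token) slot. Host-side sketch, not the device kernel.
void assembleTargetLogitsOffsetsRef(std::vector<float const*>& logitsPtrs, std::vector<int>& decodingTokens,
    float const* logits, std::vector<int> const& draftDecodingTokens, int maxDecodingTokens, int vocabSizePadded)
{
    auto const batchSize = static_cast<int>(draftDecodingTokens.size());
    logitsPtrs.assign(static_cast<std::size_t>(batchSize) * maxDecodingTokens, nullptr);
    decodingTokens.assign(batchSize, 0);

    int logitsOffset = 0; // running exclusive sum over requests
    for (int bi = 0; bi < batchSize; ++bi)
    {
        int const numDecodingTokens = draftDecodingTokens[bi] + 1;
        decodingTokens[bi] = numDecodingTokens;
        for (int ti = 0; ti < numDecodingTokens; ++ti)
        {
            logitsPtrs[static_cast<std::size_t>(bi) * maxDecodingTokens + ti]
                = logits + static_cast<std::size_t>(logitsOffset + ti) * vocabSizePadded;
        }
        logitsOffset += numDecodingTokens;
    }
}

int main()
{
    int const vocab = 4;
    std::vector<float> logits(6 * vocab, 0.F);     // 6 flattened decoding tokens in total
    std::vector<int> draftDecodingTokens{0, 2, 1}; // context request, then two gen requests
    std::vector<float const*> ptrs;
    std::vector<int> counts;
    assembleTargetLogitsOffsetsRef(ptrs, counts, logits.data(), draftDecodingTokens, /*maxDecodingTokens=*/4, vocab);
    // Request 1, token 1 should point at flattened token index 2, i.e. offset 2 * vocab.
    std::printf("%td\n", ptrs[1 * 4 + 1] - logits.data()); // prints 8
    return 0;
}
```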
requests, and actual draft len for gen requests. +//! \param acceptedTokenIds [batchSize, maxPathLen], on GPU. Ids of the +//! accepted tokens per request. +//! \param acceptedLengths [batchSize], on GPU. Lengths of the accepted draft sequences +//! per request. +//! \param bestPathIds [batchSize], on GPU. Selected path id per request +//! \param paths [batchSize, +//! maxDecodingTokens, maxPathLen], on GPU. Indices of the draft sequences +//! \param batchSize batch size. Only batch size +//! <= 512 is supported at the moment +//! \param maxDecodingTokens maximum number of decoding tokens per step per request +//! \param maxPathLen maximum path len of the draft sequence +//! \param stream cuda stream +void invokeSelectLastAccTokenAndComputeIndicesCumSum(runtime::TokenIdType* lastAcceptedTokenIds, + runtime::SizeType32* exclusiveSumLastAcceptedIndices, runtime::SizeType32 const* draftDecodingTokens, + runtime::TokenIdType const* acceptedTokenIds, runtime::SizeType32 const* acceptedLengths, + runtime::SizeType32 const* bestPathIds, runtime::SizeType32 const* paths, runtime::SizeType32 batchSize, + runtime::SizeType32 maxDecodingTokens, runtime::SizeType32 maxPathLen, cudaStream_t stream); + +} // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu index 19336e2ed..427f1bb6b 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu @@ -60,7 +60,7 @@ __global__ void maskTargetLogitsKernel(T* targetLogits, SizeType32 const* batchS auto* outputIdsAfterSamplingPtr = outputIdsAfterSampling + batchSlot * vocabSize; auto const useDraftLogits = batchUseDraftLogits[batchSlot]; - if (finishedState.isSkipDecoding()) + if (finishedState.isSkipDecoding() || finishedState.isFinished()) { return; } @@ -75,8 +75,8 @@ __global__ void maskTargetLogitsKernel(T* targetLogits, SizeType32 const* batchS for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - if (tokensToMask == 0 && outputIdsAfterSamplingPtr[vIdx] == -1) - { // we need to find the -1 boundary from returnAllTopP outputIds if topK == 0 + if (outputIdsAfterSamplingPtr[vIdx] == -1) + { // we need to find the -1 boundary from returnAllTopP outputIds if topK == 0 or number of topP indices < topK tokensToMask = vIdx; } maskBuffer[vIdx] = false; @@ -124,12 +124,21 @@ __global__ void acceptDraftTokensKernel(T const* draftProbs, T* targetProbs, Siz auto const numDraftTokens = numsDraftTokens[batchSlotBeamWidth]; auto const useDraftLogits = batchUseDraftLogits[batchSlotBeamWidth]; - if (draftTokenIdx > numDraftTokens || finishedInput[batchSlot].isSkipDecoding()) + if (draftTokenIdx > numDraftTokens || finishedInput[batchSlot].isSkipDecoding() + || finishedInput[batchSlot].isFinished()) { if (tid == 0) { batchIsAccepted[batchSlot] = true; + + // either finished or skip decode in previous step, this step don't need decoding finishedOutput[batchSlot].setSkipDecoding(); + + // if previous step is finished, write the state to next step too + if (finishedInput[batchSlot].isFinished()) + { + finishedOutput[batchSlot] = finishedInput[batchSlot]; + } } return; } @@ -214,7 +223,8 @@ __global__ void forwardAcceptedTokensKernel(SizeType32 batchSize, SizeType32 con for (SizeType32 bi = index; bi < batchSize; bi += static_cast(gridDim.x * blockDim.x)) { auto const batchSlot 
= batchSlots[bi]; - if (batchIsAccepted[batchSlot] && !finishedOutput[batchSlot].isSkipDecoding()) + if (batchIsAccepted[batchSlot] && !finishedOutput[batchSlot].isSkipDecoding() + && !finishedOutput[batchSlot].isFinished()) { auto const curSeqLen = sequenceLengths[batchSlot]; auto const draftTokenIdx = step; diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu index 7a6d8540e..4c876bd96 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu @@ -46,22 +46,22 @@ __global__ void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdT FinishedState* finishedFinal, SizeType32 const* batchSlots, SizeType32 const* paths, TokenIdType const* endIds, T const** medusaLogits, T const** logitsPtrs, SizeType32* curTokensPerStep, SizeType32 const* targetTokensPerStep, SizeType32* bestPathIds, SizeType32 batchSize, SizeType32 vocabSize, SizeType32 maxBatchSize, SizeType32 maxSeqLen, - SizeType32 maxNumHeads, SizeType32 maxDecodingTokens) + SizeType32 maxDraftPathLen, SizeType32 maxDecodingTokens) { auto const batchIdx = static_cast(blockIdx.x); - auto const batchSlot = batchSlots[batchIdx]; - auto const inputLength = sequenceLengths[batchSlot]; - auto const endId = endIds[batchSlot]; - auto const numTokensPerStep = curTokensPerStep[batchSlot]; - auto const maxNumDraftTokens = maxNumHeads + 1; + auto const batchSlot = batchSlots == nullptr ? batchIdx : batchSlots[batchIdx]; + auto const inputLength = sequenceLengths == nullptr ? 0 : sequenceLengths[batchSlot]; + auto const endId = endIds == nullptr ? -1 : endIds[batchSlot]; + auto const numTokensPerStep = curTokensPerStep == nullptr ? maxDecodingTokens : curTokensPerStep[batchSlot]; + auto const maxPathLen = maxDraftPathLen + 1; int4 partialMax{-1, -1, 0, 0}; // Go over different paths and construct implicit sequences for (auto pathIdx = static_cast(threadIdx.x); pathIdx < maxDecodingTokens; pathIdx += static_cast(blockDim.x)) { - auto acceptedLength = maxNumDraftTokens; - auto const pathOffset = flat_index3(batchSlot, pathIdx, 0, maxDecodingTokens, maxNumDraftTokens); + auto acceptedLength = maxPathLen; + auto const pathOffset = flat_index3(batchSlot, pathIdx, 0, maxDecodingTokens, maxPathLen); bool hasEnd = false; auto const tokenId = paths[pathOffset]; @@ -75,13 +75,14 @@ __global__ void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdT auto nextIdx = tokenId; // Go along the path - for (SizeType32 ti = 1; ti < maxNumDraftTokens; ++ti) + for (SizeType32 ti = 1; ti < maxPathLen; ++ti) { auto const tokenId = paths[pathOffset + ti]; // Break if path terminates if (tokenId == -1) { - hasEnd = targetToken == endId; // check if last token is EOS when path terminates. + hasEnd = endIds == nullptr ? false + : targetToken == endId; // check if last token is EOS when path terminates. acceptedLength = hasEnd ? ti - 1 : ti; break; } @@ -91,7 +92,7 @@ __global__ void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdT auto const draftToken = tokenId >= numTokensPerStep ? -1 : draftIds[draftTokenIdx]; // Check if draft tokens are the same as target tokens bool const accepted = draftToken == targetToken; - hasEnd = targetToken == endId; + hasEnd = endIds == nullptr ? false : targetToken == endId; if (!accepted || hasEnd) { acceptedLength = hasEnd ? 
ti - 1 : ti; @@ -126,7 +127,7 @@ __global__ void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdT auto const acceptedLength = totalShared.x; auto const bestPathIdx = totalShared.y; auto const bestNextIdx = numTokensPerStep == 1 ? 0 : totalShared.w; - auto const pathOffset = flat_index3(batchSlot, bestPathIdx, 0, maxDecodingTokens, maxNumDraftTokens); + auto const pathOffset = flat_index3(batchSlot, bestPathIdx, 0, maxDecodingTokens, maxPathLen); for (auto ti = static_cast(threadIdx.x); ti < acceptedLength; ti += static_cast(blockDim.x)) { auto const tokenId = paths[pathOffset + ti]; @@ -142,15 +143,18 @@ __global__ void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdT { auto const hasEnd = totalShared.z; // Set end condition - if (hasEnd) + if (hasEnd && finishedFinal) { finishedFinal[batchSlot].setFinishedEOS(); } // Make correction to the sequence length - sequenceLengths[batchSlot] += acceptedLength; + if (sequenceLengths) + { + sequenceLengths[batchSlot] += acceptedLength; + } acceptedLengths[batchSlot] = acceptedLength; // In Medusa decoding step, number of draft tokens is 0 and must be updated for the next steps - if (numTokensPerStep == 1) + if (curTokensPerStep && targetTokensPerStep && numTokensPerStep == 1) { curTokensPerStep[batchSlot] = targetTokensPerStep[batchSlot]; } @@ -158,45 +162,33 @@ __global__ void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdT } // Prepare logits pointers to respective logits from Medusa Heads for the all-top-K sampling kernel - for (auto hi = static_cast(threadIdx.x); hi < maxNumHeads; hi += static_cast(blockDim.x)) + if (medusaLogits && logitsPtrs) { - logitsPtrs[batchIdx * maxNumHeads + hi] - = medusaLogits[batchSlot * maxNumHeads + hi] + flat_index2(bestNextIdx, 0, vocabSize); + for (auto hi = static_cast(threadIdx.x); hi < maxDraftPathLen; + hi += static_cast(blockDim.x)) + { + logitsPtrs[batchIdx * maxDraftPathLen + hi] + = medusaLogits[batchSlot * maxDraftPathLen + hi] + flat_index2(bestNextIdx, 0, vocabSize); + } } } } // namespace template -void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdType const* draftIds, TokenIdType const* targetIds, - SizeType32* sequenceLengths, SizeType32* acceptedLengths, FinishedState* finishedFinal, - SizeType32 const* batchSlots, SizeType32 const* paths, TokenIdType const* endIds, T const** medusaLogits, - T const** logitsPtrs, SizeType32* curTokensPerStep, SizeType32 const* targetTokensPerStep, SizeType32* bestPathIds, - SizeType32 batchSize, SizeType32 vocabSize, SizeType32 maxBatchSize, SizeType32 maxSeqLen, SizeType32 maxNumHeads, - SizeType32 maxDecodingTokens, cudaStream_t stream) +void acceptDraftTokensByIdsWithPaths(AcceptDraftTokensByIdsWithPathsParams const& params) { constexpr SizeType32 BLOCK_SIZE = 256; dim3 block(BLOCK_SIZE); - dim3 grid(batchSize); - acceptDraftTokensByIdsWithPaths<<>>(outputIds, draftIds, targetIds, - sequenceLengths, acceptedLengths, finishedFinal, batchSlots, paths, endIds, medusaLogits, logitsPtrs, - curTokensPerStep, targetTokensPerStep, bestPathIds, batchSize, vocabSize, maxBatchSize, maxSeqLen, maxNumHeads, - maxDecodingTokens); + dim3 grid(params.batchSize); + acceptDraftTokensByIdsWithPaths<<>>(params.outputIds, params.draftIds, + params.targetIds, params.sequenceLengths, params.acceptedLengths, params.finishedFinal, params.batchSlots, + params.paths, params.endIds, params.medusaLogits, params.logitsPtrs, params.curTokensPerStep, + params.targetTokensPerStep, params.bestPathIds, params.batchSize, 
params.vocabSize, params.maxBatchSize, + params.maxSeqLen, params.maxDraftPathLen, params.maxDecodingTokens); } -template void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdType const* draftIds, - TokenIdType const* targetIds, SizeType32* sequenceLengths, SizeType32* acceptedLengths, - FinishedState* finishedFinal, SizeType32 const* batchSlots, SizeType32 const* paths, TokenIdType const* endIds, - float const** medusaLogits, float const** logitsPtrs, SizeType32* curTokensPerStep, - SizeType32 const* targetTokensPerStep, SizeType32* bestPathIds, SizeType32 batchSize, SizeType32 vocabSize, - SizeType32 maxBatchSize, SizeType32 maxSeqLen, SizeType32 maxNumHeads, SizeType32 maxDecodingTokens, - cudaStream_t stream); -template void acceptDraftTokensByIdsWithPaths(TokenIdType* outputIds, TokenIdType const* draftIds, - TokenIdType const* targetIds, SizeType32* sequenceLengths, SizeType32* acceptedLengths, - FinishedState* finishedFinal, SizeType32 const* batchSlots, SizeType32 const* paths, TokenIdType const* endIds, - half const** medusaLogits, half const** logitsPtrs, SizeType32* curTokensPerStep, - SizeType32 const* targetTokensPerStep, SizeType32* bestPathIds, SizeType32 batchSize, SizeType32 vocabSize, - SizeType32 maxBatchSize, SizeType32 maxSeqLen, SizeType32 maxNumHeads, SizeType32 maxDecodingTokens, - cudaStream_t stream); +template void acceptDraftTokensByIdsWithPaths(AcceptDraftTokensByIdsWithPathsParams<float> const& params); +template void acceptDraftTokensByIdsWithPaths(AcceptDraftTokensByIdsWithPathsParams<__half> const& params); namespace { diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h index 6a1fae1a7..67f43c9fc 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h @@ -26,46 +26,87 @@ namespace tensorrt_llm::kernels::speculative_decoding { +template <typename T> +struct AcceptDraftTokensByIdsWithPathsParams +{ + //! output buffer [maxBatchSize, maxSeqLen], input tokens. + runtime::TokenIdType* outputIds{nullptr}; + //! input buffer [maxBatchSize, maxDecodingTokens], draft tokens + runtime::TokenIdType const* draftIds{nullptr}; + //! input buffer [maxBatchSize, maxDecodingTokens], tokens predicted from the target medusa head + runtime::TokenIdType const* targetIds{nullptr}; + //! input/output buffer [maxBatchSize], optional. + //! Length of the data in outputIds without draft tokens. + //! If set, incremented according to the accepted length. + runtime::SizeType32* sequenceLengths{nullptr}; + //! output buffer [maxBatchSize], number of accepted tokens per request + runtime::SizeType32* acceptedLengths{nullptr}; + //! input buffer [maxBatchSize], optional. Finished states per request + FinishedState* finishedFinal{nullptr}; + //! input buffer [batchSize], optional. Address map from local index + //! to global index [0, batchSize] -> [0, maxBatchSize]. + //! If nullptr, batchIdx is used. + runtime::SizeType32 const* batchSlots{nullptr}; + //! input buffer [maxBatchSize, maxDecodingTokens, maxDraftPathLen+1], + //! paths to restore sequences from outputIds and targetIds. Should be filled with -1 for everything that is not + //! a path. + runtime::SizeType32 const* paths{nullptr}; + //! input buffer [maxBatchSize], optional. EOS ids per request. + //! No EOS checks if nullptr. + runtime::TokenIdType const* endIds{nullptr}; + //!
input buffer [maxDraftPathLen, maxBatchSize, maxDecodingTokens, vocabSize], optional. + //! Pointer to the logits from medusa heads. + T const** medusaLogits{nullptr}; + //! output buffer [batchSize, maxDraftPathLen], optional. Contains pointers to the + //! respective rows of the medusaLogits for the next after the accepted token + T const** logitsPtrs{nullptr}; + //! current tokens to compute per step will be updated to + //! targetTokensPerStep if curTokensPerStep == 1 + runtime::SizeType32* curTokensPerStep{nullptr}; + //! target values of tokens to compute per step + runtime::SizeType32 const* targetTokensPerStep{nullptr}; + //! output buffer [maxBatchSize], indices of the selected paths + runtime::SizeType32* bestPathIds{nullptr}; + //! current batch size + runtime::SizeType32 batchSize{0}; + //! maximum batch size + runtime::SizeType32 maxBatchSize{0}; + //! vocab size + runtime::SizeType32 vocabSize{0}; + //! maximum sequence length of output ids + runtime::SizeType32 maxSeqLen{0}; + //! maximum number of medusa heads + runtime::SizeType32 maxDraftPathLen{0}; + //! maximum number of tokens per step configured in the system + runtime::SizeType32 maxDecodingTokens{0}; + //! stream + cudaStream_t stream; + + void checkParams() const + { + TLLM_CHECK(outputIds); + TLLM_CHECK(draftIds); + TLLM_CHECK(targetIds); + TLLM_CHECK(acceptedLengths); + TLLM_CHECK(paths); + TLLM_CHECK(bestPathIds); + TLLM_CHECK((curTokensPerStep == nullptr) ^ (targetTokensPerStep == nullptr) == 0); + TLLM_CHECK((medusaLogits == nullptr) ^ (logitsPtrs == nullptr) == 0); + + TLLM_CHECK(batchSize > 0); + TLLM_CHECK(batchSize <= maxBatchSize); + TLLM_CHECK(vocabSize > 0); + TLLM_CHECK(maxSeqLen > 0); + TLLM_CHECK(maxDraftPathLen > 0); + TLLM_CHECK(maxDecodingTokens > 0); + } +}; + //! \brief verifies draft medusa tokens given target tokens. Modifies outputIds tensor accordingly filling it with //! accepted tokens. Fills logitsPtrs tensor with the pointers to the respective medusa logits tensor according //! to the next after the last accepted token. -//! -//! \param outputIds output buffer [maxBatchSize, maxSeqLen], input tokens. -//! \param draftIds input buffer [maxBatchSize, maxDecodingTokens], draft tokens -//! \param targetIds input buffer [maxBatchSize, maxDecodingTokens], tokens predicted from the target medusa head -//! \param sequenceLengths input/output buffer [maxBatchSize], length of the data in outputIds without draft tokens -//! Incrememnted according to the accepted length -//! \param acceptedLengths output buffer [maxBatchSize], length of the data accepted tokens -//! \param finishedFinal input buffer [maxBatchSize], finished states per request -//! \param batchSlots input buffer [batchSize], address map from local index -//! to global index [0, batchSize] -> [0, maxBatchSize] -//! \param paths input buffer [maxBatchSize, maxDecodingTokens, maxNumHeads+1], -//! paths to restore sequences from outputIds and targetIds. Should be filled with -1 for everything that is not path. -//! \param endIds input buffer [maxBatchSize], EOS ids per request -//! \param medusaLogits input buffer [maxNumHeads, maxBatchSize, maxDecodingTokens, vocabSize], pointer -//! to the logits from medusa heads -//! \param logitsPtrs output buffer [batchSize, maxNumHeads], contains pointers to the -//! respective rows of the medusaLogits for the next after the accepted token -//! \param curTokensPerStep current tokens to compute per step will be updated to -//! targetTokensPerStep if curTokensPerStep == 1 -//! 
\param targetTokensPerStep target values of tokens to compute per step -//! \param bestPathIds output buffer [maxBatchSize], indices of the selected paths -//! \param batchSize current batch size -//! \param maxBatchSize maximum batch size -//! \param vocabSize vocab size -//! \param maxSeqLen maximum sequence length of output ids -//! \param maxNumHeads maximum number of medusa heads -//! \param maxDecodingTokens maximum number of tokens per step configured in the system -//! \param stream stream template -void acceptDraftTokensByIdsWithPaths(runtime::TokenIdType* outputIds, runtime::TokenIdType const* draftIds, - runtime::TokenIdType const* targetIds, runtime::SizeType32* sequenceLengths, runtime::SizeType32* acceptedLengths, - FinishedState* finishedFinal, runtime::SizeType32 const* batchSlots, runtime::SizeType32 const* paths, - runtime::TokenIdType const* endIds, T const** medusaLogits, T const** logitsPtrs, - runtime::SizeType32* curTokensPerStep, runtime::SizeType32 const* targetTokensPerStep, - runtime::SizeType32* bestPathIds, runtime::SizeType32 batchSize, runtime::SizeType32 maxBatchSize, - runtime::SizeType32 vocabSize, runtime::SizeType32 maxSeqLen, runtime::SizeType32 maxNumHeads, - runtime::SizeType32 maxDecodingTokens, cudaStream_t stream); +void acceptDraftTokensByIdsWithPaths(AcceptDraftTokensByIdsWithPathsParams const&); //! \brief assembles draft tokens to treeDraftIds from sourceDraftIds using indices of treeIds //! diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index c53510d3e..f6ecd5b72 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -507,15 +507,12 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(global_token_idx) * params.q_hidden_size + hidden_idx; - QuantizedEltType* quantized_q_ptr = STORE_QKV - ? reinterpret_cast(params.QuantizedQKV) + src_q_idx - : reinterpret_cast(params.Q) + dst_q_idx; VecType* q_ptr = STORE_QKV ? reinterpret_ptr(params.QKV, src_q_idx) : reinterpret_ptr(params.Q, dst_q_idx); // Cast float scale to dst data type. using TScale = typename mmha::kv_cache_scale_type_t::Type; - TScale scaleOrigQuant; + [[maybe_unused]] TScale scaleOrigQuant; if constexpr (FP8_OUTPUT || ENABLE_8BITS_CACHE) { mmha::convert_from_float( @@ -525,6 +522,9 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(params.QuantizedQKV) + src_q_idx + : reinterpret_cast(params.Q) + dst_q_idx; mmha::store_8bits_vec(quantized_q_ptr, q, 0, scaleOrigQuant); } else @@ -813,15 +813,12 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams(global_token_idx) * params.q_hidden_size + hidden_idx; - QuantizedEltType* quantized_q_ptr = STORE_QKV - ? reinterpret_cast(params.QuantizedQKV) + src_q_idx - : reinterpret_cast(params.Q) + dst_q_idx; VecT* q_ptr = STORE_QKV ? reinterpret_ptr(params.QKV, src_q_idx) : reinterpret_ptr(params.Q, dst_q_idx); // Cast float scale to dst data type. using TScale = typename mmha::kv_cache_scale_type_t::Type; - TScale scaleOrigQuant; + [[maybe_unused]] TScale scaleOrigQuant; if constexpr (FP8_OUTPUT || ENABLE_8BITS_CACHE) { mmha::convert_from_float(&scaleOrigQuant, params.kvScaleOrigQuant ? 
params.kvScaleOrigQuant[0] : 1.0f); @@ -830,6 +827,9 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams(params.QuantizedQKV) + src_q_idx + : reinterpret_cast(params.Q) + dst_q_idx; mmha::store_8bits_vec(quantized_q_ptr, q, 0, scaleOrigQuant); } else diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h index db0762351..c8228f7d1 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h @@ -32,6 +32,8 @@ namespace weight_only { enum class KernelType { + FP16Int8Groupwise, + BF16Int8Groupwise, FP16Int4Groupwise, BF16Int4Groupwise, FP16Int8PerChannel, @@ -49,6 +51,8 @@ struct kernel_type_traits; static constexpr bool isGroupwise = _isGroupwise; \ static constexpr bool isInt4 = _isInt4; \ }; +KERNEL_TYPE_TRAITS_REGISTRY(KernelType::FP16Int8Groupwise, true, false); +KERNEL_TYPE_TRAITS_REGISTRY(KernelType::BF16Int8Groupwise, true, false); KERNEL_TYPE_TRAITS_REGISTRY(KernelType::FP16Int4Groupwise, true, true); KERNEL_TYPE_TRAITS_REGISTRY(KernelType::BF16Int4Groupwise, true, true); KERNEL_TYPE_TRAITS_REGISTRY(KernelType::FP16Int8PerChannel, false, false); diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorFalse.cu new file mode 100644 index 000000000..7fa33376f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu new file mode 100644 index 000000000..6c718b24a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorFalse.cu new file mode 100644 index 000000000..118032999 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu new file mode 100644 index 000000000..fa5002ae0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index 7ff08a19e..e047d1235 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -61,6 +61,8 @@ inline void kernel_launcher(int arch, Params& params, cudaStream_t s) { EXEC_W4A8(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true); } + EXEC(KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true); + EXEC(KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true); EXEC(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true); EXEC(KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true); EXEC(KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true); @@ -70,6 +72,8 @@ inline void kernel_launcher(int arch, Params& params, cudaStream_t s) } else if (arch >= 90) { + EXEC(KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajor, false); + EXEC(KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajor, false); EXEC(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajor, false); EXEC(KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajor, false); EXEC(KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajor, false); @@ -98,6 +102,8 @@ inline bool is_supported(int arch, KernelType kernel_type) } else if (arch >= 80 && arch < 90) { + SUPPORT(KernelType::FP16Int8Groupwise); + SUPPORT(KernelType::BF16Int8Groupwise); SUPPORT(KernelType::FP16Int4Groupwise); SUPPORT(KernelType::BF16Int4Groupwise); SUPPORT(KernelType::FP16Int8PerChannel); @@ -107,6 +113,8 @@ inline bool is_supported(int arch, KernelType kernel_type) } else if (arch >= 90) { + SUPPORT(KernelType::FP16Int8Groupwise); + SUPPORT(KernelType::BF16Int8Groupwise); SUPPORT(KernelType::FP16Int4Groupwise); SUPPORT(KernelType::BF16Int4Groupwise); SUPPORT(KernelType::FP16Int8PerChannel); diff --git a/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp index 5f29a9a13..097fe116e 100644 --- a/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp +++ b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp @@ -431,7 +431,7 @@ void ExternalDraftTokensLayer::getAllTopKs(std::shared_ptrprobsComputed; @@ -475,7 +475,7 @@ void ExternalDraftTokensLayer::getAllTopPs(std::shared_ptr(params, getStream()); diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp index 32b812967..414572322 100644 --- a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp @@ -76,6 +76,8 @@ LookaheadDecodingLayer::CpuAlgorithmResources::CpuAlgorithmResources(DecoderD ITensor::makeShape({maxTokensPerStep, maxBatchSize, beamWidth}), nvinfer1::DataType::kINT32); mPathsOffsets = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxAcceptedDraftLen}), 
nvinfer1::DataType::kINT32); + mPathsOffsetsBatch + = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxAcceptedDraftLen}), nvinfer1::DataType::kINT32); mNumNewTokens = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); mNumNewTokensCumSum = BufferManager::cpu(ITensor::makeShape({maxBatchSize + 1}), nvinfer1::DataType::kINT32); mNextDraftTokens = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); @@ -220,7 +222,7 @@ void LookaheadDecodingLayer::forwardAsync(std::shared_ptr::forwardSyncCPU( BufferRange nextDraftLengthsRange(*mCpuAlgo->mNextDraftLengths); BufferRange sequenceLengthsRange(*mCpuAlgo->mSequenceLengths); BufferLocation pathsOffsetLocation(*mCpuAlgo->mPathsOffsets); + BufferLocation pathsOffsetBatchLocation(*mCpuAlgo->mPathsOffsetsBatch); BufferLocation outputIdsLocation(*mCpuAlgo->mOutputIds); mBufferManager->setZero(*mCpuAlgo->mPathsOffsets); @@ -394,20 +397,22 @@ void LookaheadDecodingLayer::forwardSyncCPU( D(accepted).values().c_str(), D(draft).values().c_str()); } - numNewTokensCumSumRange[0] = 0; SizeType32 pi = 0; - for (SizeType32 bi = 0; bi < numNewTokensRange.size(); bi++) + numNewTokensCumSumRange[0] = 0; + for (SizeType32 bi = 0; bi < batchSize; bi++) { - SizeType32 acceptedDraftLen = numNewTokensRange[bi] <= 1 ? 0 : (numNewTokensRange[bi] - 1); + SizeType32 gbi = batchSlotsRange[bi]; + SizeType32 acceptedDraftLen = numNewTokensRange[gbi] <= 1 ? 0 : (numNewTokensRange[gbi] - 1); numNewTokensCumSumRange[bi + 1] = numNewTokensCumSumRange[bi] + acceptedDraftLen; for (SizeType32 tj = 0; tj < acceptedDraftLen; tj++) { - pathsOffsetLocation[pi++] = pathsOffsetLocation.at(bi, tj); + pathsOffsetBatchLocation[pi++] = pathsOffsetLocation.at(gbi, tj); } } - for (; pi < pathsOffsetLocation.size(); pi++) + + for (; pi < pathsOffsetBatchLocation.size(); pi++) { - pathsOffsetLocation[pi++] = 0; + pathsOffsetBatchLocation[pi++] = 0; } TLLM_CHECK(outputs->numNewTokens); @@ -415,8 +420,8 @@ void LookaheadDecodingLayer::forwardSyncCPU( mBufferManager->copy(*mCpuAlgo->mSequenceLengths, *outputs->sequenceLength.value()); mBufferManager->copy(*mCpuAlgo->mNewTokens, *outputs->newTokens); - mBufferManager->copy(*mCpuAlgo->mPathsOffsets, *outputs->pathsOffsets); mBufferManager->copy(*mCpuAlgo->mNumNewTokens, *outputs->numNewTokens.value()); + mBufferManager->copy(*mCpuAlgo->mPathsOffsetsBatch, *outputs->pathsOffsets); mBufferManager->copy(*mCpuAlgo->mNumNewTokensCumSum, *outputs->numNewTokensCumSum); // mBufferManager->copy(*mCpuAlgo->mNextDraftTokens, *outputs->nextDraftTokens); diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h index f2470a411..e20b59b22 100644 --- a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h @@ -70,6 +70,7 @@ class LookaheadDecodingLayer : public BaseLayer TensorPtr mOutputIds; TensorPtr mPathsOffsets; + TensorPtr mPathsOffsetsBatch; TensorPtr mNumNewTokens; TensorPtr mNumNewTokensCumSum; TensorPtr mNewTokens; diff --git a/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp b/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp index ac8f78ec1..69978863b 100644 --- a/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp @@ -329,11 +329,33 @@ void MedusaDecodingLayer::acceptDraftTokens(SpeculativeDecodingOutputs const& auto medusaInputLogitsPtrsPtr = reinterpret_cast(bufferCast(*mMedusaInputLogitsPtrs)); auto medusaSelectedLogitsPtrsDevicePtr = 
const_cast(bufferCastOrNull(mMedusaSelectedLogitsPtrsDevice)); - acceptDraftTokensByIdsWithPaths(outputIds, draftIds, targetTokensDevicePtr, sequenceLengths, numNewTokens, - finishedStatesPtr, workspace->getDeviceBatchSlotsPtr(), paths, endIds, medusaInputLogitsPtrsPtr, - medusaSelectedLogitsPtrsDevicePtr, curTokensPerStepDevice, targetTokensPerStepDevice, bestPathIdsDevicePtr, - batchSize, mDecoderDomain.getVocabSize(), mDecoderDomain.getBatchSize(), maxSeqLen, maxDraftPathLen, - mDecoderDomain.getMaxDecodingTokens(), getStream()); + + AcceptDraftTokensByIdsWithPathsParams params; + params.outputIds = outputIds; + params.draftIds = draftIds; + params.targetIds = targetTokensDevicePtr; + params.sequenceLengths = sequenceLengths; + params.acceptedLengths = numNewTokens; + params.finishedFinal = finishedStatesPtr; + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.paths = paths; + params.endIds = endIds; + params.medusaLogits = medusaInputLogitsPtrsPtr; + params.logitsPtrs = medusaSelectedLogitsPtrsDevicePtr; + params.curTokensPerStep = curTokensPerStepDevice; + params.targetTokensPerStep = targetTokensPerStepDevice; + params.bestPathIds = bestPathIdsDevicePtr; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.vocabSize = mDecoderDomain.getVocabSize(); + params.maxSeqLen = maxSeqLen; + params.maxDraftPathLen = maxDraftPathLen; + params.maxDecodingTokens = mDecoderDomain.getMaxDecodingTokens(); + params.stream = getStream(); + + params.checkParams(); + + acceptDraftTokensByIdsWithPaths(params); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -390,7 +412,7 @@ void MedusaDecodingLayer::sampleNewDraftTokens(SpeculativeDecodingOutputs con params.maxBatchSize = maxBatchSizeHeadNums; params.maxTokensPerStep = 1; params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); - params.returnAllTopK = true; + params.returnAllSelectedTokens = true; invokeBatchTopKSampling(params, getStream()); diff --git a/cpp/tensorrt_llm/plugins/CMakeLists.txt b/cpp/tensorrt_llm/plugins/CMakeLists.txt index 604c656e5..def5e55d9 100755 --- a/cpp/tensorrt_llm/plugins/CMakeLists.txt +++ b/cpp/tensorrt_llm/plugins/CMakeLists.txt @@ -54,7 +54,8 @@ set(PLUGIN_LISTS mambaConv1dPlugin lruPlugin cumsumLastDimPlugin - lowLatencyGemmPlugin) + lowLatencyGemmPlugin + eaglePlugin) foreach(PLUGIN_ITER ${PLUGIN_LISTS}) include_directories(${PLUGIN_ITER}) diff --git a/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp index 8a9d6784d..2d4f94176 100644 --- a/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp @@ -39,6 +39,9 @@ #include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" #endif // ENABLE_MULTI_DEVICE #include "tensorrt_llm/plugins/cumsumLastDimPlugin/cumsumLastDimPlugin.h" +#include "tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.h" +#include "tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.h" +#include "tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.h" #include "tensorrt_llm/plugins/lowLatencyGemmPlugin/lowLatencyGemmPlugin.h" #include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" #include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" @@ -201,6 +204,10 @@ extern "C" static tensorrt_llm::plugins::lruPluginCreator lruPluginCreator; static tensorrt_llm::plugins::CumsumLastDimPluginCreator cumsumLastDimPluginCreator; static tensorrt_llm::plugins::LowLatencyGemmPluginCreator 
lowLatencyGemmPluginCreator; + static tensorrt_llm::plugins::EagleDecodeDraftTokensPluginCreator eagleDecodeDraftTokensPluginCreator; + static tensorrt_llm::plugins::EagleSampleAndAcceptDraftTokensPluginCreator + eagleSampleAndAcceptDraftTokensPluginCreator; + static tensorrt_llm::plugins::EaglePrepareDrafterInputsPluginCreator eaglePrepareDrafterInputsPluginCreator; static std::array pluginCreators = { creatorPtr(identityPluginCreator), @@ -231,6 +238,9 @@ extern "C" creatorPtr(lruPluginCreator), creatorPtr(cumsumLastDimPluginCreator), creatorPtr(lowLatencyGemmPluginCreator), + creatorPtr(eagleDecodeDraftTokensPluginCreator), + creatorPtr(eagleSampleAndAcceptDraftTokensPluginCreator), + creatorPtr(eaglePrepareDrafterInputsPluginCreator), }; nbCreators = pluginCreators.size(); return pluginCreators.data(); diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/CMakeLists.txt b/cpp/tensorrt_llm/plugins/eaglePlugin/CMakeLists.txt new file mode 100644 index 000000000..b6bd0439c --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/CMakeLists.txt @@ -0,0 +1,21 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# +file(GLOB SRCS *.cpp) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) +set(PLUGIN_SOURCES + ${PLUGIN_SOURCES} + PARENT_SCOPE) diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp new file mode 100644 index 000000000..8dbb8f47d --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp @@ -0,0 +1,228 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "eagleDecodeDraftTokensPlugin.h" + +using namespace nvinfer1; +using tensorrt_llm::plugins::EagleDecodeDraftTokensPluginCreator; +using tensorrt_llm::plugins::EagleDecodeDraftTokensPlugin; + +static char const* EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_VERSION{"1"}; +static char const* EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_NAME{"EagleDecodeDraftTokens"}; +PluginFieldCollection EagleDecodeDraftTokensPluginCreator::mFC{}; +std::vector EagleDecodeDraftTokensPluginCreator::mPluginAttributes; + +EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(nvinfer1::DataType type, int32_t layerIdx) + : mDtype(type) + , mLayerIdx(layerIdx) +{ +} + +// Parameterized constructor +EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(void const* data, size_t length) +{ + char const *d = reinterpret_cast(data), *a = d; + read(d, mDtype); + read(d, mLayerIdx); + TLLM_CHECK_WITH_INFO(d == a + length, + "Expected length (%d) != real length (%d). This is often " + "caused by using different TensorRT-LLM version to build " + "engine and run engine.", + static_cast(length), static_cast(d - a)); +} + +// IPluginV2DynamicExt Methods +nvinfer1::IPluginV2DynamicExt* EagleDecodeDraftTokensPlugin::clone() const noexcept +{ + auto* plugin = new EagleDecodeDraftTokensPlugin(*this); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; +} + +nvinfer1::DimsExprs EagleDecodeDraftTokensPlugin::getOutputDimensions( + int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept +{ + TLLM_CHECK(outputIndex < 2); + TLLM_CHECK(nbInputs == 5); + return inputs[outputIndex + 1]; +} + +bool EagleDecodeDraftTokensPlugin::supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept +{ + if (pos == 0) // logits + { + return (inOut[pos].type == mDtype) && (inOut[pos].format == TensorFormat::kLINEAR); + } + else if (pos == 3) // rand_data_sample + { + return (inOut[pos].type == nvinfer1::DataType::kFLOAT) && (inOut[pos].format == TensorFormat::kLINEAR); + } + else // next_draft_tokens, next_draft_lens, paths, tree_indices + { + return (inOut[pos].type == nvinfer1::DataType::kINT32) && (inOut[pos].format == TensorFormat::kLINEAR); + } +} + +void EagleDecodeDraftTokensPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept +{ +} + +size_t EagleDecodeDraftTokensPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept +{ + return 0; +} + +int EagleDecodeDraftTokensPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + // TODO fill me + + return 0; +} + +// IPluginV2Ext Methods +nvinfer1::DataType EagleDecodeDraftTokensPlugin::getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept +{ + TLLM_CHECK(index < 2); + return inputTypes[index + 1]; +} + +// IPluginV2 Methods + +char const* EagleDecodeDraftTokensPlugin::getPluginType() const noexcept +{ + return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_NAME; +} + +char const* EagleDecodeDraftTokensPlugin::getPluginVersion() const noexcept +{ + return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_VERSION; +} + +int EagleDecodeDraftTokensPlugin::getNbOutputs() const noexcept +{ + return 2; +} + +int 
EagleDecodeDraftTokensPlugin::initialize() noexcept +{ + return 0; +} + +void EagleDecodeDraftTokensPlugin::terminate() noexcept {} + +size_t EagleDecodeDraftTokensPlugin::getSerializationSize() const noexcept +{ + return sizeof(mDtype) + sizeof(mLayerIdx); +} + +void EagleDecodeDraftTokensPlugin::serialize(void* buffer) const noexcept +{ + char *d = static_cast(buffer), *a = d; + write(d, mLayerIdx); + write(d, mDtype); + assert(d == a + getSerializationSize()); +} + +void EagleDecodeDraftTokensPlugin::destroy() noexcept +{ + // This gets called when the network containing plugin is destroyed + delete this; +} + +/////////////// + +EagleDecodeDraftTokensPluginCreator::EagleDecodeDraftTokensPluginCreator() +{ + // Fill PluginFieldCollection with PluginField arguments metadata + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("layer_idx", nullptr, PluginFieldType::kINT32, 0)); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +char const* EagleDecodeDraftTokensPluginCreator::getPluginName() const noexcept +{ + return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_NAME; +} + +char const* EagleDecodeDraftTokensPluginCreator::getPluginVersion() const noexcept +{ + return EAGLE_DECODE_DRAFT_TOKENS_PLUGIN_VERSION; +} + +PluginFieldCollection const* EagleDecodeDraftTokensPluginCreator::getFieldNames() noexcept +{ + return &mFC; +} + +IPluginV2* EagleDecodeDraftTokensPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept +{ + PluginField const* fields = fc->fields; + int32_t layerIdx; + nvinfer1::DataType type; + // Read configurations from each fields + for (int i = 0; i < fc->nbFields; ++i) + { + char const* attrName = fields[i].name; + if (!strcmp(attrName, "layer_idx")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + layerIdx = *static_cast(fields[i].data); + } + else if (!strcmp(attrName, "type_id")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + type = static_cast(*(static_cast(fields[i].data))); + } + } + + try + { + auto* obj = new EagleDecodeDraftTokensPlugin(type, layerIdx); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} + +IPluginV2* EagleDecodeDraftTokensPluginCreator::deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept +{ + // This object will be deleted when the network is destroyed, which will + // call EagleDecodeDraftTokensPlugin::destroy() + try + { + auto* obj = new EagleDecodeDraftTokensPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.h b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.h new file mode 100644 index 000000000..3278d4c57 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.h @@ -0,0 +1,90 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "tensorrt_llm/plugins/common/plugin.h" +#include +#include +#include +#include + +namespace tensorrt_llm::plugins +{ + +class EagleDecodeDraftTokensPlugin : public BasePlugin +{ +public: + EagleDecodeDraftTokensPlugin(nvinfer1::DataType type, int32_t layerIdx); + + EagleDecodeDraftTokensPlugin(void const* data, size_t length); + + ~EagleDecodeDraftTokensPlugin() override = default; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + bool supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept override; + void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept override; + size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept override; + int enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept override; + + // IPluginV2 Methods + char const* getPluginType() const noexcept override; + char const* getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void* buffer) const noexcept override; + void destroy() noexcept override; + +private: + nvinfer1::DataType mDtype; + int32_t mLayerIdx; +}; + +class EagleDecodeDraftTokensPluginCreator : public BaseCreator +{ +public: + EagleDecodeDraftTokensPluginCreator(); + + char const* getPluginName() const noexcept override; + + char const* getPluginVersion() const noexcept override; + + nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; + + nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; + + nvinfer1::IPluginV2* deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept override; + +private: + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; + +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.cpp b/cpp/tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.cpp new file mode 100644 index 000000000..39eef83d5 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.cpp @@ -0,0 +1,272 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. 
SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "eaglePrepareDrafterInputsPlugin.h" + +using namespace nvinfer1; +using tensorrt_llm::plugins::EaglePrepareDrafterInputsPluginCreator; +using tensorrt_llm::plugins::EaglePrepareDrafterInputsPlugin; + +static char const* EAGLE_PREPARE_DRAFTER_INPUTS_PLUGIN_VERSION{"1"}; +static char const* EAGLE_PREPARE_DRAFTER_INPUTS_PLUGIN_NAME{"EaglePrepareDrafterInputs"}; +PluginFieldCollection EaglePrepareDrafterInputsPluginCreator::mFC{}; +std::vector EaglePrepareDrafterInputsPluginCreator::mPluginAttributes; + +EaglePrepareDrafterInputsPlugin::EaglePrepareDrafterInputsPlugin(nvinfer1::DataType type, int32_t layerIdx) + : mDtype(type) + , mLayerIdx(layerIdx) +{ +} + +// Parameterized constructor +EaglePrepareDrafterInputsPlugin::EaglePrepareDrafterInputsPlugin(void const* data, size_t length) +{ + char const *d = reinterpret_cast(data), *a = d; + read(d, mDtype); + read(d, mLayerIdx); + TLLM_CHECK_WITH_INFO(d == a + length, + "Expected length (%d) != real length (%d). This is often " + "caused by using different TensorRT-LLM version to build " + "engine and run engine.", + static_cast(length), static_cast(d - a)); +} + +// IPluginV2DynamicExt Methods +nvinfer1::IPluginV2DynamicExt* EaglePrepareDrafterInputsPlugin::clone() const noexcept +{ + auto* plugin = new EaglePrepareDrafterInputsPlugin(*this); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; +} + +nvinfer1::DimsExprs EaglePrepareDrafterInputsPlugin::getOutputDimensions( + int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept +{ + TLLM_CHECK(outputIndex < 10); + TLLM_CHECK(nbInputs == 7); + auto const batchSizeExpr = inputs[nbInputs - 2].d[0]; + auto const maxDraftLenExpr = inputs[nbInputs - 2].d[1]; + + nvinfer1::DimsExprs ret; + switch (outputIndex) + { + case 0: // sequence_length + case 1: // host_request_types + case 2: // host_past_key_value_lengths + ret = inputs[outputIndex]; + break; + case 3: // spec_decoding_generation_lengths + ret.nbDims = 1; + ret.d[0] = batchSizeExpr; + break; + case 4: // spec_decoding_position_offsets + case 5: // input_ids + case 6: // position_ids + // FIXME input_ids should have real value, not maxDraftLen + ret.nbDims = 1; + ret.d[0] = maxDraftLenExpr; + break; + case 7: // spec_decoding_packed_mask + // FIXME + ret.nbDims = 3; + ret.d[0] = batchSizeExpr; + ret.d[1] = maxDraftLenExpr; + ret.d[2] = exprBuilder.operation(DimensionOperation::kCEIL_DIV, *maxDraftLenExpr, *exprBuilder.constant(32)); + break; + case 8: // hidden_dim + ret.nbDims = 2; + // FIXME real dim instead of max draft len + ret.d[0] = maxDraftLenExpr; + ret.d[1] = inputs[4].d[1]; + break; + } + return ret; +} + +bool EaglePrepareDrafterInputsPlugin::supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept +{ + if (pos == nbInputs - 1 || pos == nbInputs + nbOutputs - 1) // hidden_states + { + return 
(inOut[pos].type == mDtype) && (inOut[pos].format == TensorFormat::kLINEAR); + } + else if (pos == 3) // kv cache pool pointers + { + return inOut[pos].type == nvinfer1::DataType::kINT64 && inOut[pos].format == TensorFormat::kLINEAR; + } + else // all other tensors + { + return (inOut[pos].type == nvinfer1::DataType::kINT32) && (inOut[pos].format == TensorFormat::kLINEAR); + } +} + +void EaglePrepareDrafterInputsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept +{ +} + +size_t EaglePrepareDrafterInputsPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept +{ + return 0; +} + +int EaglePrepareDrafterInputsPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + // TODO fill me + + return 0; +} + +// IPluginV2Ext Methods +nvinfer1::DataType EaglePrepareDrafterInputsPlugin::getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept +{ + TLLM_CHECK(index < 9); + if (index < 8) + { + return inputTypes[0]; // type of sequence_length + } + else // hidden_states + { + return inputTypes[nbInputs - 1]; // type of hidden_states + } +} + +// IPluginV2 Methods + +char const* EaglePrepareDrafterInputsPlugin::getPluginType() const noexcept +{ + return EAGLE_PREPARE_DRAFTER_INPUTS_PLUGIN_NAME; +} + +char const* EaglePrepareDrafterInputsPlugin::getPluginVersion() const noexcept +{ + return EAGLE_PREPARE_DRAFTER_INPUTS_PLUGIN_VERSION; +} + +int EaglePrepareDrafterInputsPlugin::getNbOutputs() const noexcept +{ + return 9; +} + +int EaglePrepareDrafterInputsPlugin::initialize() noexcept +{ + return 0; +} + +void EaglePrepareDrafterInputsPlugin::terminate() noexcept {} + +size_t EaglePrepareDrafterInputsPlugin::getSerializationSize() const noexcept +{ + return sizeof(mDtype) + sizeof(mLayerIdx); +} + +void EaglePrepareDrafterInputsPlugin::serialize(void* buffer) const noexcept +{ + char *d = static_cast(buffer), *a = d; + write(d, mLayerIdx); + write(d, mDtype); + assert(d == a + getSerializationSize()); +} + +void EaglePrepareDrafterInputsPlugin::destroy() noexcept +{ + // This gets called when the network containing plugin is destroyed + delete this; +} + +/////////////// + +EaglePrepareDrafterInputsPluginCreator::EaglePrepareDrafterInputsPluginCreator() +{ + // Fill PluginFieldCollection with PluginField arguments metadata + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("layer_idx", nullptr, PluginFieldType::kINT32, 0)); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +char const* EaglePrepareDrafterInputsPluginCreator::getPluginName() const noexcept +{ + return EAGLE_PREPARE_DRAFTER_INPUTS_PLUGIN_NAME; +} + +char const* EaglePrepareDrafterInputsPluginCreator::getPluginVersion() const noexcept +{ + return EAGLE_PREPARE_DRAFTER_INPUTS_PLUGIN_VERSION; +} + +PluginFieldCollection const* EaglePrepareDrafterInputsPluginCreator::getFieldNames() noexcept +{ + return &mFC; +} + +IPluginV2* EaglePrepareDrafterInputsPluginCreator::createPlugin( + char const* name, PluginFieldCollection const* fc) noexcept +{ + PluginField const* fields = fc->fields; + int32_t 
layerIdx; + nvinfer1::DataType type; + // Read configurations from each field + for (int i = 0; i < fc->nbFields; ++i) + { + char const* attrName = fields[i].name; + if (!strcmp(attrName, "layer_idx")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + layerIdx = *static_cast(fields[i].data); + } + else if (!strcmp(attrName, "type_id")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + type = static_cast(*(static_cast(fields[i].data))); + } + } + + try + { + auto* obj = new EaglePrepareDrafterInputsPlugin(type, layerIdx); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} + +IPluginV2* EaglePrepareDrafterInputsPluginCreator::deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept +{ + // This object will be deleted when the network is destroyed, which will + // call EaglePrepareDrafterInputsPlugin::destroy() + try + { + auto* obj = new EaglePrepareDrafterInputsPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.h b/cpp/tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.h new file mode 100644 index 000000000..d88238ba8 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eaglePrepareDrafterInputsPlugin.h @@ -0,0 +1,90 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#pragma once + +#include "tensorrt_llm/plugins/common/plugin.h" +#include +#include +#include +#include + +namespace tensorrt_llm::plugins +{ + +class EaglePrepareDrafterInputsPlugin : public BasePlugin +{ +public: + EaglePrepareDrafterInputsPlugin(nvinfer1::DataType type, int32_t layerIdx); + + EaglePrepareDrafterInputsPlugin(void const* data, size_t length); + + ~EaglePrepareDrafterInputsPlugin() override = default; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + bool supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept override; + void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept override; + size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept override; + int enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept override; + + // IPluginV2 Methods + char const* getPluginType() const noexcept override; + char const* getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void* buffer) const noexcept override; + void destroy() noexcept override; + +private: + nvinfer1::DataType mDtype; + int32_t mLayerIdx; +}; + +class EaglePrepareDrafterInputsPluginCreator : public BaseCreator +{ +public: + EaglePrepareDrafterInputsPluginCreator(); + + char const* getPluginName() const noexcept override; + + char const* getPluginVersion() const noexcept override; + + nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; + + nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; + + nvinfer1::IPluginV2* deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept override; + +private: + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; + +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp new file mode 100644 index 000000000..42f03d93b --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp @@ -0,0 +1,515 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "eagleSampleAndAcceptDraftTokensPlugin.h" + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/dataType.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h" +#include "tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h" +#include "tensorrt_llm/runtime/common.h" + +using namespace nvinfer1; +using tensorrt_llm::plugins::EagleSampleAndAcceptDraftTokensPluginCreator; +using tensorrt_llm::plugins::EagleSampleAndAcceptDraftTokensPlugin; +using namespace tensorrt_llm::kernels; +using namespace tensorrt_llm::kernels::speculative_decoding; +using namespace tensorrt_llm::runtime; +namespace tc = tensorrt_llm::common; + +static char const* EAGLE_SAMPLE_AND_ACCEPT_DRAFT_TOKENS_PLUGIN_VERSION{"1"}; +static char const* EAGLE_SAMPLE_AND_ACCEPT_DRAFT_TOKENS_PLUGIN_NAME{"EagleSampleAndAcceptDraftTokens"}; +PluginFieldCollection EagleSampleAndAcceptDraftTokensPluginCreator::mFC{}; +std::vector EagleSampleAndAcceptDraftTokensPluginCreator::mPluginAttributes; + +EagleSampleAndAcceptDraftTokensPlugin::EagleSampleAndAcceptDraftTokensPlugin( + nvinfer1::DataType type, bool greedySampling) + : mDtype(type) + , mGreedySampling(greedySampling) +{ + TLLM_CHECK_WITH_INFO(mGreedySampling, "Non-greedy sampling is not supported yet."); +} + +// Parameterized constructor +EagleSampleAndAcceptDraftTokensPlugin::EagleSampleAndAcceptDraftTokensPlugin(void const* data, size_t length) +{ + char const *d = reinterpret_cast(data), *a = d; + read(d, mDtype); + read(d, mGreedySampling); + TLLM_CHECK_WITH_INFO(d == a + length, + "Expected length (%d) != real length (%d). 
This is often " + "caused by using different TensorRT-LLM version to build " + "engine and run engine.", + (int) length, (int) (d - a)); +} + +// IPluginV2DynamicExt Methods +nvinfer1::IPluginV2DynamicExt* EagleSampleAndAcceptDraftTokensPlugin::clone() const noexcept +{ + auto* plugin = new EagleSampleAndAcceptDraftTokensPlugin(*this); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; +} + +nvinfer1::DimsExprs EagleSampleAndAcceptDraftTokensPlugin::getOutputDimensions( + int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept +{ + TLLM_CHECK(nbInputs == 6); + TLLM_CHECK(outputIndex < 7); + auto const batchSizeExpr = inputs[getIdx(InputIdxEntry::PATHS)].d[0]; + auto const maxDecodingDraftTokensExpr = inputs[getIdx(InputIdxEntry::DRAFT_TOKEN_IDS)].d[1]; + auto const maxPathLenExpr = inputs[getIdx(InputIdxEntry::PATHS)].d[2]; + + nvinfer1::DimsExprs ret; + switch (outputIndex) + { + case 0: // accepted_tokens + ret.nbDims = 2; + ret.d[0] = batchSizeExpr; + ret.d[1] = maxPathLenExpr; + break; + case 1: // num_accepted_tokens + ret.nbDims = 1; + ret.d[0] = batchSizeExpr; + break; + case 2: // accepted_paths + ret.nbDims = 1; + ret.d[0] = batchSizeExpr; + break; + case 3: // last_accepted_tokens + ret.nbDims = 1; + ret.d[0] = batchSizeExpr; + break; + case 4: // exclusive_sum_last_accepted_indices + ret.nbDims = 1; + ret.d[0] = batchSizeExpr; + break; + case 5: // next_draft_tokens + ret.nbDims = 2; + ret.d[0] = batchSizeExpr; + ret.d[1] = maxDecodingDraftTokensExpr; + break; + case 6: // next_draft_lens + ret.nbDims = 1; + ret.d[0] = batchSizeExpr; + break; + } + return ret; +} + +bool EagleSampleAndAcceptDraftTokensPlugin::supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept +{ + if (pos == getIdx(InputIdxEntry::LOGITS)) // logits + { + return (inOut[pos].type == mDtype) && (inOut[pos].format == TensorFormat::kLINEAR); + } + else if (pos == getIdx(InputIdxEntry::TEMPERATURE) + || pos == getIdx(InputIdxEntry::RAND_VALIDATION)) // temperature, rand_validation + { + return (inOut[pos].type == nvinfer1::DataType::kFLOAT) && (inOut[pos].format == TensorFormat::kLINEAR); + } + else // everything else + { + return (inOut[pos].type == nvinfer1::DataType::kINT32) && (inOut[pos].format == TensorFormat::kLINEAR); + } +} + +void EagleSampleAndAcceptDraftTokensPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept +{ +} + +template +size_t EagleSampleAndAcceptDraftTokensPlugin::getWorkspaceSizeType(nvinfer1::PluginTensorDesc const* inputs, + int nbInputs, nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept +{ + size_t workspaceSize{0}; + + auto const vocabSizePadded = inputs[getIdx(InputIdxEntry::LOGITS)].dims.d[1]; + auto const batchSize = inputs[getIdx(InputIdxEntry::PATHS)].dims.d[0]; + auto const maxDecodingTokens = inputs[getIdx(InputIdxEntry::PATHS)].dims.d[1]; + + // Greedy sampling + { + // Top1 sampling workspace + auto const primarySamplingWorkspaceSize + = getTopKWorkspaceSize(batchSize, maxDecodingTokens, /* maxTopK */ 1, vocabSizePadded); + + // Target output ids + auto const targetOutputIdsSize = batchSize * maxDecodingTokens * sizeof(TokenIdType); + + // Logits ptrs + auto const logitsPtrsSize = batchSize * maxDecodingTokens * sizeof(T*); + SizeType32 constexpr NUM_BUFFERS{4}; + size_t workspaces[NUM_BUFFERS]; + workspaces[0] = 
primarySamplingWorkspaceSize; + workspaces[1] = targetOutputIdsSize; + workspaces[2] = logitsPtrsSize; + workspaces[3] = batchSize * sizeof(SizeType32); + workspaceSize = tc::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + } + + return workspaceSize; +} + +size_t EagleSampleAndAcceptDraftTokensPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept +{ + auto const logitsType = inputs[getIdx(InputIdxEntry::LOGITS)].type; + if (logitsType == nvinfer1::DataType::kFLOAT) + { + return getWorkspaceSizeType(inputs, nbInputs, outputs, nbOutputs); + } + else if (logitsType == nvinfer1::DataType::kHALF) + { + return getWorkspaceSizeType<__half>(inputs, nbInputs, outputs, nbOutputs); + } + else + { + TLLM_CHECK_WITH_INFO(false, "Unsupported logits type"); + } + return 0; +} + +template +void EagleSampleAndAcceptDraftTokensPlugin::samplePrimeHeadTokens(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto const maxNumTokens = inputDesc[getIdx(InputIdxEntry::LOGITS)].dims.d[0]; + auto const vocabSizePadded = inputDesc[getIdx(InputIdxEntry::LOGITS)].dims.d[1]; + auto const batchSize = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[0]; + auto const maxDecodingTokens = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[1]; + + auto logits = static_cast(inputs[getIdx(InputIdxEntry::LOGITS)]); + auto prevDraftLens = reinterpret_cast(inputs[getIdx(InputIdxEntry::DRAFT_LENS)]); + + int8_t* workspaceBytePtr = reinterpret_cast(workspace); + size_t offset{0}; + + auto const samplingWorkspaceSize + = getTopKWorkspaceSize(batchSize, maxDecodingTokens, /* maxTopK */ 1, vocabSizePadded); + + void* workspaceSampling + = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, samplingWorkspaceSize)); + TokenIdType* outputIds = reinterpret_cast( + tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingTokens * sizeof(TokenIdType))); + T const** logitsPtrs = reinterpret_cast( + tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingTokens * sizeof(T*))); + SizeType32* decodingTokens + = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * sizeof(SizeType32))); + + // Assemble pointers to logits + invokeAssembleTargetLogitsOffsets( + logitsPtrs, decodingTokens, logits, prevDraftLens, batchSize, maxDecodingTokens, vocabSizePadded, stream); + + sync_check_cuda_error(); + + TopKSamplingKernelParams params; + params.logProbsPtrs = logitsPtrs; + params.outputIds = outputIds; + params.workspace = workspaceSampling; + params.maxTopK = 1; + params.batchSize = batchSize; + params.maxBatchSize = batchSize; + params.tokensPerStep = decodingTokens; + params.maxTokensPerStep = maxDecodingTokens; + params.maxSeqLen = maxDecodingTokens; + params.vocabSizePadded = vocabSizePadded; + + invokeBatchTopKSampling(params, stream); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void EagleSampleAndAcceptDraftTokensPlugin::acceptDraftTokens(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto const maxNumTokens = 
inputDesc[getIdx(InputIdxEntry::LOGITS)].dims.d[0]; + auto const vocabSizePadded = inputDesc[getIdx(InputIdxEntry::LOGITS)].dims.d[1]; + + auto const batchSize = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[0]; + auto const maxDecodingTokens = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[1]; + auto const maxPathLen = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[2]; + auto const maxDraftPathLen = maxPathLen - 1; + + int8_t* workspaceBytePtr = reinterpret_cast(workspace); + size_t offset{0}; + + auto const samplingWorkspaceSize + = getTopKWorkspaceSize(batchSize, maxDecodingTokens, /* maxTopK */ 1, vocabSizePadded); + + void* workspaceSampling + = reinterpret_cast(tc::nextWorkspacePtr(workspaceBytePtr, offset, samplingWorkspaceSize)); + TokenIdType* outputIds = reinterpret_cast( + tc::nextWorkspacePtr(workspaceBytePtr, offset, batchSize * maxDecodingTokens * sizeof(TokenIdType))); + + AcceptDraftTokensByIdsWithPathsParams params; + params.outputIds = reinterpret_cast(outputs[getIdx(OutputIdxEntry::ACCEPTED_TOKENS)]); + params.draftIds = reinterpret_cast(inputs[getIdx(InputIdxEntry::DRAFT_TOKEN_IDS)]); + params.targetIds = outputIds; + params.acceptedLengths = reinterpret_cast(outputs[getIdx(OutputIdxEntry::ACCEPTED_LEN)]); + params.paths = reinterpret_cast(inputs[getIdx(InputIdxEntry::PATHS)]); + params.bestPathIds = reinterpret_cast(outputs[getIdx(OutputIdxEntry::BEST_ACCEPTED_PATHS)]); + params.batchSize = batchSize; + params.maxBatchSize = batchSize; + params.vocabSize = vocabSizePadded; + params.maxSeqLen = maxPathLen; + params.maxDraftPathLen = maxDraftPathLen; + params.maxDecodingTokens = maxDecodingTokens; + params.stream = stream; + + params.checkParams(); + + acceptDraftTokensByIdsWithPaths(params); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void EagleSampleAndAcceptDraftTokensPlugin::doGreedy(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + // Sample all main head tokens with Top-1. + samplePrimeHeadTokens(inputDesc, outputDesc, inputs, outputs, workspace, stream); + + // Greedy accept tokens based on token ids, write the best path and best token id. 
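(Editorial illustration.) Before the acceptDraftTokens call below, the greedy acceptance rule is worth spelling out: a draft token on a path is accepted while it equals the target model's Top-1 prediction at its parent node, and the plugin keeps the path with the longest accepted prefix plus the bonus token. The host-side sketch that follows is a simplified reference only, not the acceptDraftTokensByIdsWithPaths kernel; its function name, flat path layout, and node indexing are assumptions.

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative host-side reference for greedy path acceptance. Assumed layout:
// paths is [numPaths x maxPathLen] with -1 padding; draftIds/targetIds are indexed
// by tree node, where targetIds holds the target model's Top-1 token at each node.
std::pair<int32_t, int32_t> greedyAcceptReference(std::vector<int32_t> const& paths,
    std::vector<int32_t> const& draftIds, std::vector<int32_t> const& targetIds,
    int32_t numPaths, int32_t maxPathLen)
{
    int32_t bestPath = 0;
    int32_t bestLen = 1; // the root ("bonus") token is always accepted
    for (int32_t p = 0; p < numPaths; ++p)
    {
        int32_t len = 1;
        for (int32_t t = 1; t < maxPathLen; ++t)
        {
            auto const node = paths[p * maxPathLen + t];
            auto const parent = paths[p * maxPathLen + t - 1];
            if (node < 0 || draftIds[node] != targetIds[parent])
            {
                break; // first mismatch ends acceptance along this path
            }
            ++len;
        }
        if (len > bestLen)
        {
            bestLen = len;
            bestPath = p;
        }
    }
    return {bestPath, bestLen};
}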
+ acceptDraftTokens(inputDesc, outputDesc, inputs, outputs, workspace, stream); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void EagleSampleAndAcceptDraftTokensPlugin::selectLastAccTokenAndComputeIndicesCumSum( + nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto const batchSize = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[0]; + auto const maxDecodingTokens = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[1]; + auto const maxPathLen = inputDesc[getIdx(InputIdxEntry::PATHS)].dims.d[2]; + + auto lastAcceptedTokenIds + = reinterpret_cast(outputs[getIdx(OutputIdxEntry::LAST_ACCEPTED_TOKEN_IDS)]); + auto exclusiveSumLastAcceptedIndices + = reinterpret_cast(outputs[getIdx(OutputIdxEntry::EXCLUSIVE_SUM_LAST_TOKEN_INDICES)]); + auto prevDraftLens = reinterpret_cast(inputs[getIdx(InputIdxEntry::DRAFT_LENS)]); + auto acceptedTokenIds = reinterpret_cast(outputs[getIdx(OutputIdxEntry::ACCEPTED_TOKENS)]); + auto acceptedLengths = reinterpret_cast(outputs[getIdx(OutputIdxEntry::ACCEPTED_LEN)]); + auto bestPathIds = reinterpret_cast(outputs[getIdx(OutputIdxEntry::BEST_ACCEPTED_PATHS)]); + auto paths = reinterpret_cast(inputs[getIdx(InputIdxEntry::PATHS)]); + + invokeSelectLastAccTokenAndComputeIndicesCumSum(lastAcceptedTokenIds, exclusiveSumLastAcceptedIndices, + prevDraftLens, acceptedTokenIds, acceptedLengths, bestPathIds, paths, batchSize, maxDecodingTokens, maxPathLen, + stream); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void EagleSampleAndAcceptDraftTokensPlugin::enqueueType(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + // TODO split batch into greedy and non-greedy and execute both paths + if (mGreedySampling) + { + doGreedy(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } + else + { + // TODO fill me + TLLM_CHECK_WITH_INFO(false, "Non-greedy sampling is not supported yet"); + } + + // Find last accepted tokens and do cumulative sum of accepted indices. + selectLastAccTokenAndComputeIndicesCumSum(inputDesc, outputDesc, inputs, outputs, workspace, stream); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +int EagleSampleAndAcceptDraftTokensPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept +{ + auto const logitsType = inputDesc[getIdx(InputIdxEntry::LOGITS)].type; + if (logitsType == nvinfer1::DataType::kFLOAT) + { + enqueueType(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } + else if (logitsType == nvinfer1::DataType::kHALF) + { + enqueueType<__half>(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } + else + { + TLLM_CHECK_WITH_INFO(false, "Unsupported logits type"); + } + + return 0; +} + +// IPluginV2Ext Methods +nvinfer1::DataType EagleSampleAndAcceptDraftTokensPlugin::getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept +{ + TLLM_CHECK(index < 7); + // input 1 is draft tokens now of int32 type. All outputs are int32_t as well. 
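    // Editorial summary of the tensor contract, collected from supportsFormatCombination,
    // getOutputDimensions and the InputIdxEntry/OutputIdxEntry enums in the header
    // (shapes as reported by getOutputDimensions):
    //   inputs : 0 logits            [num_tokens, vocab_size_padded]                 mDtype (fp16/fp32)
    //            1 draft_token_ids   [batch_size, max_decoding_draft_tokens]         int32
    //            2 draft_lens        [batch_size]                                    int32
    //            3 temperature       [batch_size]                                    fp32
    //            4 rand_validation                                                   fp32
    //            5 paths             [batch_size, max_decoding_tokens, max_path_len] int32
    //   outputs: 0 accepted_tokens   [batch_size, max_path_len]                      int32
    //            5 next_draft_tokens [batch_size, max_decoding_draft_tokens]         int32
    //            1 num_accepted_tokens, 2 accepted_paths, 3 last_accepted_tokens,
    //            4 exclusive_sum_last_accepted_indices, 6 next_draft_lens: [batch_size] int32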
+ return inputTypes[getIdx(InputIdxEntry::DRAFT_TOKEN_IDS)]; +} + +// IPluginV2 Methods + +char const* EagleSampleAndAcceptDraftTokensPlugin::getPluginType() const noexcept +{ + return EAGLE_SAMPLE_AND_ACCEPT_DRAFT_TOKENS_PLUGIN_NAME; +} + +char const* EagleSampleAndAcceptDraftTokensPlugin::getPluginVersion() const noexcept +{ + return EAGLE_SAMPLE_AND_ACCEPT_DRAFT_TOKENS_PLUGIN_VERSION; +} + +int EagleSampleAndAcceptDraftTokensPlugin::getNbOutputs() const noexcept +{ + return 7; +} + +int EagleSampleAndAcceptDraftTokensPlugin::initialize() noexcept +{ + return 0; +} + +void EagleSampleAndAcceptDraftTokensPlugin::terminate() noexcept {} + +size_t EagleSampleAndAcceptDraftTokensPlugin::getSerializationSize() const noexcept +{ + return sizeof(mDtype) + sizeof(mGreedySampling); +} + +void EagleSampleAndAcceptDraftTokensPlugin::serialize(void* buffer) const noexcept +{ + char *d = static_cast(buffer), *a = d; + write(d, mDtype); + write(d, mGreedySampling); + assert(d == a + getSerializationSize()); +} + +void EagleSampleAndAcceptDraftTokensPlugin::destroy() noexcept +{ + // This gets called when the network containing plugin is destroyed + delete this; +} + +/////////////// + +EagleSampleAndAcceptDraftTokensPluginCreator::EagleSampleAndAcceptDraftTokensPluginCreator() +{ + // Fill PluginFieldCollection with PluginField arguments metadata + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("greedy_sampling", nullptr, PluginFieldType::kINT32, 1)); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +char const* EagleSampleAndAcceptDraftTokensPluginCreator::getPluginName() const noexcept +{ + return EAGLE_SAMPLE_AND_ACCEPT_DRAFT_TOKENS_PLUGIN_NAME; +} + +char const* EagleSampleAndAcceptDraftTokensPluginCreator::getPluginVersion() const noexcept +{ + return EAGLE_SAMPLE_AND_ACCEPT_DRAFT_TOKENS_PLUGIN_VERSION; +} + +PluginFieldCollection const* EagleSampleAndAcceptDraftTokensPluginCreator::getFieldNames() noexcept +{ + return &mFC; +} + +IPluginV2* EagleSampleAndAcceptDraftTokensPluginCreator::createPlugin( + char const* name, PluginFieldCollection const* fc) noexcept +{ + PluginField const* fields = fc->fields; + nvinfer1::DataType type; + bool greedySampling; + // Read configurations from each fields + for (int i = 0; i < fc->nbFields; ++i) + { + char const* attrName = fields[i].name; + if (!strcmp(attrName, "type_id")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + type = static_cast(*(static_cast(fields[i].data))); + } + else if (!strcmp(attrName, "greedy_sampling")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + greedySampling = static_cast(*static_cast(fields[i].data)); + } + } + + try + { + auto* obj = new EagleSampleAndAcceptDraftTokensPlugin(type, greedySampling); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; +} + +IPluginV2* EagleSampleAndAcceptDraftTokensPluginCreator::deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept +{ + // This object will be deleted when the network is destroyed, which will + // call EagleSampleAndAcceptDraftTokensPlugin::destroy() + try + { + auto* obj = new EagleSampleAndAcceptDraftTokensPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + catch (std::exception const& e) + { + caughtError(e); + } 
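For context, a hypothetical caller-side sketch (editorial, not part of this patch) of how the two creator fields read above could be packed when instantiating this plugin through the TensorRT plugin registry. It assumes the TensorRT-LLM plugin creators have already been registered with the global registry.

#include <NvInferRuntime.h>

#include <cstdint>
#include <vector>

nvinfer1::IPluginV2* makeEagleSampleAndAcceptPlugin(nvinfer1::DataType logitsType, bool greedy)
{
    int32_t const typeId = static_cast<int32_t>(logitsType);
    int32_t const greedyFlag = greedy ? 1 : 0;

    std::vector<nvinfer1::PluginField> fields{
        {"type_id", &typeId, nvinfer1::PluginFieldType::kINT32, 1},
        {"greedy_sampling", &greedyFlag, nvinfer1::PluginFieldType::kINT32, 1},
    };
    nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};

    // The namespace argument (defaulted to "" here) must match whatever the creator was registered under.
    auto* creator = getPluginRegistry()->getPluginCreator("EagleSampleAndAcceptDraftTokens", "1");
    return creator != nullptr ? creator->createPlugin("eagle_sample_and_accept", &fc) : nullptr;
}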
+ return nullptr; +} diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.h b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.h new file mode 100644 index 000000000..b2de11e9b --- /dev/null +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.h @@ -0,0 +1,163 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "tensorrt_llm/plugins/common/plugin.h" + +#include +#include +#include +#include +#include + +namespace tensorrt_llm::plugins +{ + +class EagleSampleAndAcceptDraftTokensPlugin : public BasePlugin +{ +public: + EagleSampleAndAcceptDraftTokensPlugin(nvinfer1::DataType type, bool greedySampling); + + EagleSampleAndAcceptDraftTokensPlugin(void const* data, size_t length); + + ~EagleSampleAndAcceptDraftTokensPlugin() override = default; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, nvinfer1::DimsExprs const* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + bool supportsFormatCombination( + int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept override; + void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int nbOutputs) noexcept override; + size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept override; + int enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType( + int index, nvinfer1::DataType const* inputTypes, int nbInputs) const noexcept override; + + // IPluginV2 Methods + char const* getPluginType() const noexcept override; + char const* getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void* buffer) const noexcept override; + void destroy() noexcept override; + +private: + enum class InputIdxEntry : int32_t + { + //! [num_tokens, vocab_size_padded] + LOGITS = 0, + //! [batch_size, max_decoding_draft_tokens] + DRAFT_TOKEN_IDS, + //! [batch_size] + DRAFT_LENS, + //! [batch_size] + TEMPERATURE, + //! []? + RAND_VALIDATION, + //! [batch_size, max_decoding_tokens, max_path_len] + PATHS + }; + + enum class OutputIdxEntry : int32_t + { + //! [batch_size, max_draft_path_len] + ACCEPTED_TOKENS = 0, + //! [batch_size] + ACCEPTED_LEN, + //! 
[batch_size] + BEST_ACCEPTED_PATHS, + //! [batch_size] + LAST_ACCEPTED_TOKEN_IDS, + //! [batch_size] + EXCLUSIVE_SUM_LAST_TOKEN_INDICES, + //! [batch_size, max_decoding_draft_tokens] + NEXT_DRAFT_TOKEN_IDS, + //! [batch_size] + NEXT_DRAFT_LENS + }; + + int32_t getIdx(InputIdxEntry idx) const + { + return static_cast(idx); + } + + int32_t getIdx(OutputIdxEntry idx) const + { + return static_cast(idx); + } + +private: + template + size_t getWorkspaceSizeType(nvinfer1::PluginTensorDesc const* inputs, int nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int nbOutputs) const noexcept; + + template + void samplePrimeHeadTokens(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept; + + template + void acceptDraftTokens(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept; + + template + void doGreedy(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept; + + void selectLastAccTokenAndComputeIndicesCumSum(nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept; + + template + void enqueueType(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept; + +private: + nvinfer1::DataType mDtype; + bool mGreedySampling; +}; + +class EagleSampleAndAcceptDraftTokensPluginCreator : public BaseCreator +{ +public: + EagleSampleAndAcceptDraftTokensPluginCreator(); + + char const* getPluginName() const noexcept override; + + char const* getPluginVersion() const noexcept override; + + nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; + + nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; + + nvinfer1::IPluginV2* deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept override; + +private: + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; + +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp index 40c8113ac..ff75e9f6a 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp @@ -24,14 +24,17 @@ using namespace tensorrt_llm::kernels::cutlass_kernels; using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPluginCreator; using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPlugin; using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantGemmPluginProfiler; +using tensorrt_llm::plugins::WeightOnlyGemmRunnerPtr; // Flags for indicating whether the corresponding inputs are applied in mQuantAlgo -// mQuantAlgo = pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + bias * BIAS -// Here pre_quant_scale, zero and bias are 
boolean type +// mQuantAlgo = int8_weight * INT8_WEIGHT + use_w4a8_awq * FP8_ALPHA + pre_quant_scale * PRE_QUANT_SCALE +// + zero * ZERO + bias * BIAS +// Here int8_weight, use_w4a8_awq, pre_quant_scale, zero and bias are boolean type static constexpr int BIAS = int(1) << 0; static constexpr int ZERO = int(1) << 1; static constexpr int PRE_QUANT_SCALE = int(1) << 2; static constexpr int FP8_ALPHA = int(1) << 3; +static constexpr int INT8_WEIGHT = int(1) << 4; using tensorrt_llm::plugins::read; using tensorrt_llm::plugins::write; @@ -43,11 +46,10 @@ std::vector WeightOnlyGroupwiseQuantMatmulPluginCreator:: void WeightOnlyGroupwiseQuantGemmPluginProfiler::runTactic(int m, int n, int k, WeightOnlyGroupwiseQuantGemmPluginProfiler::Config const& tactic, char* workspace, cudaStream_t const& stream) { - // Quantized weights are packed in FP16 format (INT4*4 -> FP16) - int const originalN = n * FP16_INT4_RATIO; + // Quantized weights are packed in FP16 format (INT4*4 -> FP16, INT8*2 -> FP16) + int const originalN = mQuantAlgo & INT8_WEIGHT ? n * FP16_INT8_RATIO : n * FP16_INT4_RATIO; half* actPtr = reinterpret_cast(workspace); - cutlass::uint4b_t* weightPtr = reinterpret_cast( - nextWorkspacePtr(reinterpret_cast(actPtr), m * k * sizeof(half))); + void* weightPtr = nextWorkspacePtr(reinterpret_cast(actPtr), m * k * sizeof(half)); half* inputScalesPtr = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(weightPtr), n * k * sizeof(float))); half* zerosPtr = reinterpret_cast( @@ -69,15 +71,22 @@ void WeightOnlyGroupwiseQuantGemmPluginProfiler::runTactic(int m, int n, int k, } int const wsSize = mRunner->getWorkspaceSize(m, originalN, k); - - mRunner->gemm(actPtr, weightPtr, inputScalesPtr, zerosPtr, biasesPtr, outputPtr, m, originalN, k, mGroupSize, - tactic, workspacePtr, wsSize, stream); + if (mQuantAlgo & INT8_WEIGHT) + { + mRunner->gemm(actPtr, reinterpret_cast(weightPtr), inputScalesPtr, zerosPtr, biasesPtr, outputPtr, m, + originalN, k, mGroupSize, tactic, workspacePtr, wsSize, stream); + } + else + { + mRunner->gemm(actPtr, reinterpret_cast(weightPtr), inputScalesPtr, zerosPtr, biasesPtr, + outputPtr, m, originalN, k, mGroupSize, tactic, workspacePtr, wsSize, stream); + } } void WeightOnlyGroupwiseQuantGemmPluginProfiler::computeTmpSize(size_t maxM, size_t n, size_t k) { - // Quantized weights are packed in FP16 format (INT4*4 -> FP16) - int const originalN = n * FP16_INT4_RATIO; + // Quantized weights are packed in FP16 format (INT4*4 -> FP16, INT8*2 -> FP16) + int const originalN = mQuantAlgo & INT8_WEIGHT ? 
n * FP16_INT8_RATIO : n * FP16_INT4_RATIO; std::vector workspaces = { maxM * k * sizeof(half), // A k * n * sizeof(float), // B @@ -129,6 +138,38 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin( (int) length, (int) (d - a)); } +template +using GemmRunner = tensorrt_llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunner; + +template +WeightOnlyGemmRunnerPtr selectGemmRunnerForZERO(int quant_algo) +{ + if (quant_algo & ZERO) + { + return std::make_shared>(); + } + else + { + return std::make_shared< + GemmRunner>(); + } +} + +template +WeightOnlyGemmRunnerPtr selectGemmRunnerForWeightType(int quant_algo) +{ + if (quant_algo & INT8_WEIGHT) + { + return selectGemmRunnerForZERO(quant_algo); + } + else + { + return selectGemmRunnerForZERO(quant_algo); + } +} + void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int quant_algo, int group_size) { mArch = tensorrt_llm::common::getSMVersion(); @@ -136,7 +177,7 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int qua mQuantAlgo = quant_algo; mGroupSize = group_size; - // quant_algo = fp8_alpha * 8 + pre_quant_scale * 4 + zero * 2 + bias + // quant_algo = int8_weight * 16 + fp8_alpha * 8 + pre_quant_scale * 4 + zero * 2 + bias mPreQuantScaleInputIdx = (quant_algo & PRE_QUANT_SCALE) ? 1 : 0; mWeightInputIdx = mPreQuantScaleInputIdx + 1; mScalesInputIdx = mWeightInputIdx + 1; @@ -146,6 +187,7 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int qua if (mType == nvinfer1::DataType::kHALF) { + // CUTLASS kernel selection if (quant_algo & FP8_ALPHA) { // Ada & Hopper style kernels @@ -153,45 +195,34 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int qua { TLLM_THROW("W4A(fp)8 kernel is unsupported on pre-Ada (sm<89) architectures!"); } - if (quant_algo & ZERO) - { - // has zeros - m_weightOnlyGroupwiseGemmRunner = std::make_shared< - tensorrt_llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, - cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS, half, half, half>>(); - } - else - { - // no zeros - m_weightOnlyGroupwiseGemmRunner - = std::make_shared>(); - } + assert(!(quant_algo & INT8_WEIGHT) && "W4A(fp)8 kernel requires INT4 weight!"); + m_weightOnlyGroupwiseGemmRunner + = selectGemmRunnerForZERO<__nv_fp8_e4m3, cutlass::uint4b_t, half>(quant_algo); } else { - if (quant_algo & ZERO) - { - // has zeros - m_weightOnlyGroupwiseGemmRunner - = std::make_shared>(); - } - else - { - // no zeros - m_weightOnlyGroupwiseGemmRunner - = std::make_shared>(); - } + m_weightOnlyGroupwiseGemmRunner = selectGemmRunnerForWeightType(quant_algo); + } + // CUDA kernel selection + if (quant_algo & INT8_WEIGHT) + { + // INT8 weight + mCudaKernelEnabled = tensorrt_llm::kernels::weight_only::is_supported( + mArch, tensorrt_llm::kernels::weight_only::KernelType::FP16Int8Groupwise); + mCudaKernelType = tensorrt_llm::kernels::weight_only::KernelType::FP16Int8Groupwise; + } + else + { + // INT4 weight + mCudaKernelEnabled = tensorrt_llm::kernels::weight_only::is_supported( + mArch, tensorrt_llm::kernels::weight_only::KernelType::FP16Int4Groupwise); + mCudaKernelType = tensorrt_llm::kernels::weight_only::KernelType::FP16Int4Groupwise; } - mCudaKernelEnabled = tensorrt_llm::kernels::weight_only::is_supported( - mArch, tensorrt_llm::kernels::weight_only::KernelType::FP16Int4Groupwise); - mCudaKernelType = tensorrt_llm::kernels::weight_only::KernelType::FP16Int4Groupwise; } #if defined(ENABLE_BF16) else if 
(mType == nvinfer1::DataType::kBF16) { + // CUTLASS kernel selection if (quant_algo & FP8_ALPHA) { // FP8 requires at least sm89 devices @@ -203,24 +234,23 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int qua } else { - if (quant_algo & ZERO) - { - // has zeros - m_weightOnlyGroupwiseGemmRunner - = std::make_shared>(); - } - else - { - // no zeros - m_weightOnlyGroupwiseGemmRunner - = std::make_shared>(); - } + m_weightOnlyGroupwiseGemmRunner = selectGemmRunnerForWeightType<__nv_bfloat16>(quant_algo); + } + // CUDA kernel selection + if (quant_algo & INT8_WEIGHT) + { + // INT8 weight + mCudaKernelEnabled = tensorrt_llm::kernels::weight_only::is_supported( + mArch, tensorrt_llm::kernels::weight_only::KernelType::BF16Int8Groupwise); + mCudaKernelType = tensorrt_llm::kernels::weight_only::KernelType::BF16Int8Groupwise; + } + else + { + // INT4 weight + mCudaKernelEnabled = tensorrt_llm::kernels::weight_only::is_supported( + mArch, tensorrt_llm::kernels::weight_only::KernelType::BF16Int4Groupwise); + mCudaKernelType = tensorrt_llm::kernels::weight_only::KernelType::BF16Int4Groupwise; } - mCudaKernelEnabled = tensorrt_llm::kernels::weight_only::is_supported( - mArch, tensorrt_llm::kernels::weight_only::KernelType::BF16Int4Groupwise); - mCudaKernelType = tensorrt_llm::kernels::weight_only::KernelType::BF16Int4Groupwise; } #endif else @@ -273,8 +303,9 @@ nvinfer1::DimsExprs WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDimensions( ret.d[ii] = inputs[0].d[ii]; } - // int4 weight only quant - ret.d[nbDimsA - 1] = exprBuilder.constant(inputs[mWeightInputIdx].d[1]->getConstantValue() * FP16_INT4_RATIO); + // int4/int8 weight only quant (INT4*4 -> FP16, INT8*2 -> FP16) + int const weight_multiplier = mQuantAlgo & INT8_WEIGHT ? FP16_INT8_RATIO : FP16_INT4_RATIO; + ret.d[nbDimsA - 1] = exprBuilder.constant(inputs[mWeightInputIdx].d[1]->getConstantValue() * weight_multiplier); return ret; } @@ -320,11 +351,12 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::configurePlugin(nvinfer1::DynamicPlug int const maxK = in[0].max.d[in[0].max.nbDims - 1]; - // Quantized weights are packed in FP16 format (INT4*4 -> FP16) - int const maxN = in[mWeightInputIdx].max.d[1] * FP16_INT4_RATIO; + // Quantized weights are packed in FP16 format (INT4*4 -> FP16, INT8*2 -> FP16) + int const weight_multiplier = mQuantAlgo & INT8_WEIGHT ? FP16_INT8_RATIO : FP16_INT4_RATIO; + int const maxN = in[mWeightInputIdx].max.d[1] * weight_multiplier; auto const K = maxK; - auto const N = maxN / FP16_INT4_RATIO; + auto const N = maxN / weight_multiplier; if (!mDims.isInitialized()) { @@ -424,8 +456,9 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(nvinfer1::PluginTensorDesc con TLLM_CHECK_WITH_INFO(mType == nvinfer1::DataType::kHALF, "No valid weightOnlyGropwiseQuantMatmul configuration"); #endif - // Quantized weights are packed in FP16 format (INT4*4 -> FP16) - int real_n = n * FP16_INT4_RATIO; + // Quantized weights are packed in FP16 format (INT4*4 -> FP16, INT8*2 -> FP16) + int real_n = mQuantAlgo & INT8_WEIGHT ? 
n * FP16_INT8_RATIO : n * FP16_INT4_RATIO; + if (use_cuda_kernel) { void const* pre_quant_scale_ptr = nullptr; diff --git a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h index 7c65e6623..ed85d2098 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h +++ b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h @@ -46,6 +46,7 @@ constexpr int32_t INT8_BITS = 8; constexpr int32_t INT4_BITS = 4; constexpr int32_t INT8_INT4_RATIO = INT8_BITS / INT4_BITS; constexpr int32_t FP16_INT4_RATIO = FP16_BITS / INT4_BITS; +constexpr int32_t FP16_INT8_RATIO = FP16_BITS / INT8_BITS; inline int32_t getWeightTypeMultiplier(WeightTypeId weightTypeId) { diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index 0d8f5a2ff..8ff76615d 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -140,6 +140,7 @@ void InitBindings(pybind11::module_& m) .def_readwrite("iter", &tle::IterationStats::iter) .def_readwrite("iter_latency_ms", &tle::IterationStats::iterLatencyMS) .def_readwrite("new_active_requests_queue_latency_ms", &tle::IterationStats::newActiveRequestsQueueLatencyMS) + .def_readwrite("num_new_active_requests", &tle::IterationStats::numNewActiveRequests) .def_readwrite("num_active_requests", &tle::IterationStats::numActiveRequests) .def_readwrite("num_queued_requests", &tle::IterationStats::numQueuedRequests) .def_readwrite("num_completed_requests", &tle::IterationStats::numCompletedRequests) @@ -180,6 +181,9 @@ void InitBindings(pybind11::module_& m) .def_readwrite("scheduled", &tle::RequestStats::scheduled) .def_readwrite("paused", &tle::RequestStats::paused) .def_readwrite("dis_serving_stats", &tle::RequestStats::disServingStats) + .def_readwrite("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest) + .def_readwrite("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest) + .def_readwrite("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest) .def("to_json_str", [](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); }); diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index da58300fa..9ad9e0bb9 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -266,6 +266,7 @@ void parsePluginConfig(ModelConfig& modelConfig, Json const& pluginConfig) auto const manageWeightsType = parseJsonFieldOr(pluginConfig, "manage_weights", false) ? 
ModelConfig::ManageWeightsType::kEnabled : ModelConfig::ManageWeightsType::kDisabled; + auto const ppReduceScatter = parseJsonFieldOr(pluginConfig, "pp_reduce_scatter", false); TLLM_CHECK_WITH_INFO( !removeInputPadding || modelConfig.getMaxNumTokens(), "Padding removal requires max_num_tokens to be set."); @@ -283,6 +284,7 @@ void parsePluginConfig(ModelConfig& modelConfig, Json const& pluginConfig) modelConfig.setPagedContextFMHA(pagedContextFMHA); modelConfig.useXQA(useXQA); modelConfig.setManageWeightsType(manageWeightsType); + modelConfig.setPpReduceScatter(ppReduceScatter); } void parseLora(ModelConfig& modelConfig, Json const& json, Json const& pluginConfig, bool engineVersionNone, diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index c5bc84cf1..73df2cb3f 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -72,7 +72,6 @@ auto const kProfileMbIdxs = populateMicrobatchIndexes(); GptSession::Config setPath(GptSession::Config const& original, std::string const& path) { GptSession::Config config = original; - config.enginePath = std::filesystem::path(path); return config; } diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index a20046079..3cb9b05b6 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -408,9 +408,7 @@ void TllmRuntime::loadManagedWeights(RawEngine const& rawEngine, int localRank) { TLLM_LOG_DEBUG("Loading managed weight: %s", name.c_str()); auto iTensor = tensorrt_llm::executor::detail::toITensor(weight); - auto weightsDevice = std::shared_ptr{ - manager.allocate(MemoryType::kGPU, iTensor->getShape(), iTensor->getDataType())}; - manager.copy(iTensor->data(), *weightsDevice, MemoryType::kCPU); + auto weightsDevice = std::shared_ptr{manager.copyFrom(*iTensor, MemoryType::kGPU)}; mManagedWeightsMap.insert(std::make_pair(name, weightsDevice)); } } diff --git a/cpp/tests/kernels/decodingKernelTest.cpp b/cpp/tests/kernels/decodingKernelTest.cpp index 9b9a868b4..f820760e7 100644 --- a/cpp/tests/kernels/decodingKernelTest.cpp +++ b/cpp/tests/kernels/decodingKernelTest.cpp @@ -1326,16 +1326,34 @@ class DecodingKernelsTest : public testing::Test void callAcceptByIdsWithPaths() { - tksp::acceptDraftTokensByIdsWithPaths(bufferCast(*mOutputTokens), - bufferCast(*mDraftTokens), bufferCast(*mTargetTokens), - bufferCast(*mSequenceLengths), bufferCast(*mAcceptedLengths), - reinterpret_cast(bufferCast(*mFinishedFinal)), - bufferCast(*mBatchSlots), bufferCast(*mPaths), bufferCast(*mEndIds), - reinterpret_cast(bufferCast(*mMedusaInputLogitsPtrs)), - reinterpret_cast(bufferCast(*mMedusaLogitsPtrs)), - bufferCast(*mTokensPerStep), bufferCast(*mTokensPerStep), - bufferCast(*mBestPaths), mBatchSize, mMaxBatchSize, mVocabSize, mMaxSeqLen, mMaxNumHeads, - mMaxDraftSeqPerStep, mStream->get()); + tksp::AcceptDraftTokensByIdsWithPathsParams params; + + params.outputIds = bufferCast(*mOutputTokens); + params.draftIds = bufferCast(*mDraftTokens); + params.targetIds = bufferCast(*mTargetTokens); + params.sequenceLengths = bufferCast(*mSequenceLengths); + params.acceptedLengths = bufferCast(*mAcceptedLengths); + params.finishedFinal + = reinterpret_cast(bufferCast(*mFinishedFinal)); + params.batchSlots = bufferCast(*mBatchSlots); + params.paths = bufferCast(*mPaths); + params.endIds = bufferCast(*mEndIds); + params.medusaLogits = reinterpret_cast(bufferCast(*mMedusaInputLogitsPtrs)); + params.logitsPtrs = 
reinterpret_cast(bufferCast(*mMedusaLogitsPtrs)); + params.curTokensPerStep = bufferCast(*mTokensPerStep); + params.targetTokensPerStep = bufferCast(*mTokensPerStep); + params.bestPathIds = bufferCast(*mBestPaths); + params.batchSize = mBatchSize; + params.maxBatchSize = mMaxBatchSize; + params.vocabSize = mVocabSize; + params.maxSeqLen = mMaxSeqLen; + params.maxDraftPathLen = mMaxNumHeads; + params.maxDecodingTokens = mMaxDraftSeqPerStep; + params.stream = mStream->get(); + + params.checkParams(); + + tksp::acceptDraftTokensByIdsWithPaths(params); } void callTestedKernel() diff --git a/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp b/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp index 402eea153..e3c479ba4 100644 --- a/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp +++ b/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp @@ -91,54 +91,59 @@ TYPED_TEST_SUITE(AirTopPSamplingKernelTest, FloatAndHalfTypes); TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.9f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessAncestral) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(1.0f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(1.0f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessLargeVocabSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessLargeVocabLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.9f)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.2f).setDeterministicTopP(true)); + this->runTest( + SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.2f).setDeterministicTopP(true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.9f).setDeterministicTopP(true)); + this->runTest( + SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.9f).setDeterministicTopP(true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessAncestral) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(1.0f).setDeterministicTopP(true)); + this->runTest( + SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(1.0f).setDeterministicTopP(true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessLargeVocabSmallP) { this->runTest( - SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.2f).setDeterministicTopP(true)); + 
SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.2f).setDeterministicTopP( + true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessLargeVocabLargeP) { this->runTest( - SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.9f).setDeterministicTopP(true)); + SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.9f).setDeterministicTopP( + true)); }; class AirTopPSamplingKernelUtilsTest : public SamplingKernelTest diff --git a/cpp/tests/kernels/sampling/samplingTest.cpp b/cpp/tests/kernels/sampling/samplingTest.cpp index c7f9cd2b6..d5d900244 100644 --- a/cpp/tests/kernels/sampling/samplingTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTest.cpp @@ -110,6 +110,8 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) auto const topK = param.topK; auto const topP = param.topP; + // TopK == 0 case (TopP kernel) + auto const topKDistUpperBound = std::max(topK, static_cast(1)); std::mt19937 gen(42); @@ -133,7 +135,7 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) 0, vocabSize - 1); // -1 because uniform_int_distribution generates closed interval std::uniform_real_distribution<> skipDecodeDist(0, 1); std::uniform_real_distribution<> topPDist(0, topP); - std::uniform_int_distribution<> topKDist(1, topK); + std::uniform_int_distribution<> topKDist(1, topKDistUpperBound); std::uniform_int_distribution<> tokensPerStepDist(1, maxTokensPerStep); std::uniform_int_distribution<> seqLenDist(0, mMaxSeqLen - maxTokensPerStep); std::uniform_real_distribution<> logProbDist(-3.f, 3.f); @@ -158,7 +160,7 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) endIdsHostPtr[bi] = endIdsDistr(gen); skipDecodeHostPtr[bi] = skipDecodeDist(gen) > 0.8; topPsHostPtr[bi] = topPDist(gen); - topKsHostPtr[bi] = topKDist(gen); + topKsHostPtr[bi] = topK == 0 ? 0 : topKDist(gen); tokensPerStepPtr[bi] = tokensPerStepDist(gen); finishedHostPtr[bi] = finishedDist(gen) > 0.8 ? 
tk::FinishedState::finished() : tk::FinishedState::empty(); } @@ -196,9 +198,9 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) // Init logits randomly auto logitsHostPtr = bufferCast(*mLogitsHost); initRandom(logitsHostPtr, batchSize * maxTokensPerStep * vocabSize, -3.0f, 3.0f); - // Only in greedy search we can guarantee the selected token and stop by condition - if (topK == 1) + // TopK == 1 for TopK kernel greedy, TopK == 0 for TopP kernels + if (topK <= 1) { for (SizeType32 bi = 0; bi < batchSize; ++bi) { @@ -231,13 +233,29 @@ std::vector SamplingKernelTest::computeTopKTopPVariants( auto topK = bufferCast(*mTopKsHost)[batchSlot]; auto topP = bufferCast(*mTopPsHost)[batchSlot]; - allowedTokens.insert(allowedTokens.begin(), indices.begin(), indices.begin() + topK); + if (topK > 0) // handling top K kernel, top P result based on topK tokens + { + float sSum = 0.f; // sSum as in samplingTopKKernels.cu + for (auto ki = 0; ki < topK; ki++) + { + sSum += static_cast(probsPtr[indices[ki]]); + } + topP *= sSum; // the adjusted topP in the selected topK distribution + } + float totalProb = 0.f; SizeType32 idx = 0; while (totalProb < topP && idx < vocabSize) { allowedTokens.push_back(indices[idx]); totalProb += static_cast(probsPtr[indices[idx++]]); + // cuda may selected a different index with same probability in kernel reduce, in test we allow them + while (idx < vocabSize + && static_cast(probsPtr[indices[idx]]) == static_cast(probsPtr[indices[idx - 1]])) + { + allowedTokens.push_back(indices[idx]); + totalProb += static_cast(probsPtr[indices[idx++]]); + } } return allowedTokens; } @@ -284,12 +302,15 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) auto const tokensPerStep = tokensPerStepPtr[batchSlot]; for (SizeType32 ti = 0; ti < tokensPerStep; ++ti) { - auto kResults = param.returnAllTopK ? bufferCast(*mTopKsHost)[batchSlot] : 1; - - for (SizeType32 ki = 0; ki < kResults; ++ki) + auto topK = bufferCast(*mTopKsHost)[batchSlot]; + auto kResults = param.returnAllSelectedTokens ? (topK == 0 ? vocabSize : topK) : 1; + auto topKTopPVariants = computeTopKTopPVariants(bi, batchSlot, ti, maxTokensPerStep, vocabSize); + SizeType32 ki; + for (ki = 0; ki < kResults && ki < topKTopPVariants.size(); ++ki) { // Set reference finished state to true if we finished before or at current step - auto const idsIdx = param.returnAllTopK ? ti * mMaxTopK + ki : seqLengthsOrigHostPtr[batchSlot] + ti; + auto const idsIdx + = param.returnAllSelectedTokens ? 
ti * mMaxTopK + ki : seqLengthsOrigHostPtr[batchSlot] + ti; auto const outputId = outputIdsHostPtr[batchSlot * mMaxSeqLen + idsIdx]; // Check the range of the returned token ([0, vocabSize)) EXPECT_TRUE((outputId >= 0) && (outputId < vocabSize)); @@ -299,7 +320,7 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) if (!skipDecodeHostPtr[batchSlot] && !finishedOrigHostPtr[batchSlot].isFinished() && !finishedOrigHostPtr[batchSlot].isSkipDecoding()) { - if (maxTokensPerStep == 1 && !param.returnAllTopK) + if (maxTokensPerStep == 1 && !param.returnAllSelectedTokens) { if (generatedEOS) { @@ -314,8 +335,6 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) } } - auto topKTopPVariants = computeTopKTopPVariants(bi, batchSlot, ti, maxTokensPerStep, vocabSize); - bool found = false; for (auto const& var : topKTopPVariants) { @@ -340,11 +359,24 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) EXPECT_EQ(finishedHostPtr[batchSlot].isFinished(), finishedOrigHostPtr[batchSlot].isFinished()); } } + + // a boundary check for returnAllSelectedTokens in topP kernel and when TopP selected indices < topK in topK + // kernel. + if (!skipDecodeHostPtr[batchSlot] && !finishedOrigHostPtr[batchSlot].isFinished() + && !finishedOrigHostPtr[batchSlot].isSkipDecoding()) + { + if (param.returnAllSelectedTokens && (topK == 0 || ki != topK)) + { + auto const idsIdx = ti * mMaxTopK + ki; + auto const outputId = outputIdsHostPtr[batchSlot * mMaxSeqLen + idsIdx]; + EXPECT_EQ(outputId, -1); + } + } } } // Cum log probs is not supported for multiple tokens per step or all top K return - if (maxTokensPerStep == 1 && !param.returnAllTopK) + if (maxTokensPerStep == 1 && !param.returnAllSelectedTokens) { for (int32_t bi = 0; bi < batchSize; ++bi) { diff --git a/cpp/tests/kernels/sampling/samplingTest.h b/cpp/tests/kernels/sampling/samplingTest.h index 33d4e46b0..10de1f059 100644 --- a/cpp/tests/kernels/sampling/samplingTest.h +++ b/cpp/tests/kernels/sampling/samplingTest.h @@ -194,7 +194,7 @@ struct SamplingKernelTestParam bool normalizeLogProbs{false}; bool logitsHasProbs{true}; int32_t maxTokensPerStep{1}; - bool returnAllTopK{false}; + bool returnAllSelectedTokens{false}; bool useLogitsPtrs{false}; bool isDeterministicTopP{false}; @@ -228,9 +228,9 @@ struct SamplingKernelTestParam return *this; } - SamplingKernelTestParam& setReturnAllTopK() + SamplingKernelTestParam& setReturnAllSelectedTokens() { - returnAllTopK = true; + returnAllSelectedTokens = true; return *this; } diff --git a/cpp/tests/kernels/sampling/samplingTopKTest.cpp b/cpp/tests/kernels/sampling/samplingTopKTest.cpp index 0d3ea5b78..2bb5763fc 100644 --- a/cpp/tests/kernels/sampling/samplingTopKTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTopKTest.cpp @@ -70,10 +70,10 @@ class TopKSamplingKernelTest : public SamplingKernelTest kernelParams.finishedOutput = reinterpret_cast( bufferCast(*this->mFinishedDevice)); kernelParams.skipDecode = bufferCast(*this->mSkipDecodeDevice); - kernelParams.cumLogProbs = params.returnAllTopK || params.maxTokensPerStep > 1 + kernelParams.cumLogProbs = params.returnAllSelectedTokens || params.maxTokensPerStep > 1 ? nullptr : bufferCast(*this->mCumLogProbsDevice); - kernelParams.outputLogProbs = params.returnAllTopK || params.maxTokensPerStep > 1 + kernelParams.outputLogProbs = params.returnAllSelectedTokens || params.maxTokensPerStep > 1 ? 
nullptr : bufferCast(*this->mOutputLogProbsDevice); kernelParams.curandState = reinterpret_cast(bufferCast(*this->mCurandStatesDevice)); @@ -84,7 +84,7 @@ class TopKSamplingKernelTest : public SamplingKernelTest kernelParams.vocabSizePadded = params.vocabSize; kernelParams.normalizeLogProbs = params.normalizeLogProbs; kernelParams.logitsHasProbs = params.logitsHasProbs; - kernelParams.returnAllTopK = params.returnAllTopK; + kernelParams.returnAllSelectedTokens = params.returnAllSelectedTokens; // Perform batched TopK sampling tk::invokeBatchTopKSampling(kernelParams, this->mStream->get()); @@ -136,7 +136,7 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessTopKMaxTokensPerStep) SamplingKernelTestParam().setBatchSize(16).setVocabSize(4000).setTopK(63).setTopP(1.0f).setMaxTokensPerStep(4)); }; -TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllTopK) +TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllSelectedTokens) { this->runTest(SamplingKernelTestParam() .setBatchSize(16) @@ -144,7 +144,18 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllTopK) .setTopK(10) .setTopP(1.0f) .setMaxTokensPerStep(4) - .setReturnAllTopK()); + .setReturnAllSelectedTokens()); +}; + +TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllSelectedTokensSmallP) +{ + this->runTest(SamplingKernelTestParam() + .setBatchSize(16) + .setVocabSize(50) + .setTopK(20) + .setTopP(0.3f) + .setMaxTokensPerStep(4) + .setReturnAllSelectedTokens()); }; TYPED_TEST(TopKSamplingKernelTest, CorrectnessLogitsPtrs) diff --git a/cpp/tests/kernels/sampling/samplingTopPTest.cpp b/cpp/tests/kernels/sampling/samplingTopPTest.cpp index 047644319..92fc81738 100644 --- a/cpp/tests/kernels/sampling/samplingTopPTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTopPTest.cpp @@ -64,12 +64,15 @@ class TopPSamplingKernelTest : public SamplingKernelTest kernelParams.finishedOutput = reinterpret_cast( bufferCast(*this->mFinishedDevice)); kernelParams.skipDecode = bufferCast(*this->mSkipDecodeDevice); - kernelParams.cumLogProbs = bufferCast(*this->mCumLogProbsDevice); - kernelParams.outputLogProbs = bufferCast(*this->mOutputLogProbsDevice); + kernelParams.cumLogProbs + = params.returnAllSelectedTokens ? nullptr : bufferCast(*this->mCumLogProbsDevice); + kernelParams.outputLogProbs + = params.returnAllSelectedTokens ? 
nullptr : bufferCast(*this->mOutputLogProbsDevice); kernelParams.curandState = reinterpret_cast(bufferCast(*this->mCurandStatesDevice)); kernelParams.batchSize = params.batchSize; kernelParams.maxBatchSize = maxBatchSize; kernelParams.vocabSizePadded = params.vocabSize; + kernelParams.returnAllSelectedTokens = params.returnAllSelectedTokens; // Perform batched TopP sampling tk::invokeBatchTopPSampling(kernelParams, this->mStream->get()); @@ -80,26 +83,36 @@ TYPED_TEST_SUITE(TopPSamplingKernelTest, FloatAndHalfTypes); TYPED_TEST(TopPSamplingKernelTest, CorrectnessSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.9f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessAncestral) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(1.0f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(1.0f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessLargeVocabSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessLargeVocabLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.9f)); +}; + +TYPED_TEST(TopPSamplingKernelTest, CorrectnessReturnAllSelectedTokens) +{ + this->runTest(SamplingKernelTestParam() + .setBatchSize(16) + .setVocabSize(50) + .setTopK(0) + .setTopP(0.8f) + .setReturnAllSelectedTokens()); }; } // end of namespace diff --git a/cpp/tests/kernels/weightOnly/weightOnlyKernelTest.cpp b/cpp/tests/kernels/weightOnly/weightOnlyKernelTest.cpp index 4cbda4473..1bc31e15b 100644 --- a/cpp/tests/kernels/weightOnly/weightOnlyKernelTest.cpp +++ b/cpp/tests/kernels/weightOnly/weightOnlyKernelTest.cpp @@ -164,6 +164,10 @@ struct cutlassTypeMapper return ss.str(); \ } \ }; +CUTLASS_TYPE_MAPPER_REGISTRY(wo::KernelType::FP16Int8Groupwise, "FP16Int8Groupwise", half, uint8_t, 8, + cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS); +CUTLASS_TYPE_MAPPER_REGISTRY(wo::KernelType::BF16Int8Groupwise, "BF16Int8Groupwise", __nv_bfloat16, uint8_t, 8, + cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS); CUTLASS_TYPE_MAPPER_REGISTRY(wo::KernelType::FP16Int4Groupwise, "FP16Int4Groupwise", half, cutlass::uint4b_t, 4, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS); CUTLASS_TYPE_MAPPER_REGISTRY(wo::KernelType::BF16Int4Groupwise, "BF16Int4Groupwise", __nv_bfloat16, cutlass::uint4b_t, @@ -367,8 +371,8 @@ bool benchmark_and_verify(int m, int n, int k, int groupsize, int warmup, int it d_out.copy_to(h_out2.data()); float quant_scale = 1.f / (1 << (WSizeInBits - 1)); bool pass = compare(h_out1.data(), h_out2.data(), m * n, quant_scale); - printf( - "cuda kernel cost time %.6f, cutlass kernel cost time %.6f, cuda speedup %.3f\n", time1, time2, time2 / time1); + printf("cuda kernel cost time %.6f, cutlass kernel cost time %.6f, cuda speedup %.3f\n\n", time1, time2, + time2 / time1); return 
pass; } @@ -392,6 +396,10 @@ TEST(Kernel, WeightOnly) EXPECT_TRUE(pass); if (arch >= 75) { + pass = benchmark_and_verify(m, n, k, 64, warmup, iter); + EXPECT_TRUE(pass); + pass = benchmark_and_verify(m, n, k, 128, warmup, iter); + EXPECT_TRUE(pass); pass = benchmark_and_verify(m, n, k, 64, warmup, iter); EXPECT_TRUE(pass); pass = benchmark_and_verify(m, n, k, 128, warmup, iter); @@ -399,6 +407,10 @@ TEST(Kernel, WeightOnly) #if defined(ENABLE_BF16) if (arch >= 80) { + pass = benchmark_and_verify(m, n, k, 64, warmup, iter); + EXPECT_TRUE(pass); + pass = benchmark_and_verify(m, n, k, 128, warmup, iter); + EXPECT_TRUE(pass); pass = benchmark_and_verify(m, n, k, 64, warmup, iter); EXPECT_TRUE(pass); pass = benchmark_and_verify(m, n, k, 128, warmup, iter); diff --git a/cpp/tests/layers/baseSamplingLayerTest.cpp b/cpp/tests/layers/baseSamplingLayerTest.cpp index 7b286514d..cb2668426 100644 --- a/cpp/tests/layers/baseSamplingLayerTest.cpp +++ b/cpp/tests/layers/baseSamplingLayerTest.cpp @@ -73,6 +73,8 @@ void BaseSamplingLayerTest::setup(uint64_t seed, TestSamplingParams const& pa trk::invokeFill(*mCumLogProbsDevice, float{0.0f}, *mStream); trk::invokeFill(*mOutputLogProbsDevice, float{0.0f}, *mStream); trk::invokeFill(*mEndIdsDevice, int32_t{mEndId}, *mStream); + tk::invokeCurandInitialize(reinterpret_cast(bufferCast(*mCurandStatesDevice)), nullptr, + mMaxBatchSize, seed, mStream->get()); auto batchSlotsPtr = bufferCast(*mBatchSlots); for (SizeType32 bi = 0; bi < mBatchSize; ++bi) diff --git a/cpp/tests/layers/lookaheadDecodingLayerTest.cpp b/cpp/tests/layers/lookaheadDecodingLayerTest.cpp index f8f7c04f7..71c62bd1a 100644 --- a/cpp/tests/layers/lookaheadDecodingLayerTest.cpp +++ b/cpp/tests/layers/lookaheadDecodingLayerTest.cpp @@ -720,17 +720,17 @@ void LookaheadDecodingLayerTest::verifyDecode() BufferRange cumSumRange(*mNumNewTokensCumSum); BufferRange pathOffsetsRange(*mPathsOffsets); PRINT_VALUES(mNumNewTokensCumSum); - for (SizeType32 gbi = 0; gbi < mTestParam.maxBatchSize; gbi++) + for (SizeType32 bi = 0; bi < batchSize; bi++) { - SizeType32 pathOffsetBegin = cumSumRange[gbi]; - SizeType32 pathOffsetEnd = cumSumRange[gbi + 1]; + auto gbi = BufferRange(*mBatchSlots)[bi]; + SizeType32 pathOffsetBegin = cumSumRange[bi]; + SizeType32 pathOffsetEnd = cumSumRange[bi + 1]; TensorPtr golden = ITensor::at(mGoldenSampledTokens, {gbi}); auto sequenceLength = BufferLocation(*mSequenceLengths).at(gbi); auto numNewTokens = BufferLocation(*mNumNewTokens).at(gbi); TensorPtr newTokens = ITensor::slice(mOutputIds, {gbi, 0, sequenceLength - numNewTokens}, numNewTokens); BufferRange goldenRange(*ITensor::at(mGoldenSampledTokens, {gbi})); - BufferRange newTokensRange( - *ITensor::slice(mOutputIds, {gbi, 0, sequenceLength - numNewTokens}, numNewTokens)); + BufferRange newTokensRange(*newTokens); SizeType32 ni = 1; for (SizeType32 poi = pathOffsetBegin; poi < pathOffsetEnd; poi++) diff --git a/cpp/tests/layers/lookaheadRandomLlmTest.cpp b/cpp/tests/layers/lookaheadRandomLlmTest.cpp index e4570b1ee..f8e8ff027 100644 --- a/cpp/tests/layers/lookaheadRandomLlmTest.cpp +++ b/cpp/tests/layers/lookaheadRandomLlmTest.cpp @@ -207,7 +207,7 @@ TEST(LookaheadRandomllm, gpuSampling) kernelParams.vocabSizePadded = vocabSize; kernelParams.normalizeLogProbs = false; kernelParams.logitsHasProbs = false; - kernelParams.returnAllTopK = false; + kernelParams.returnAllSelectedTokens = false; PRINT_TOKENS(mEndIds); PRINT_VALUES(mTokensPerStep); diff --git a/cpp/tests/resources/scripts/test_cpp.py 
b/cpp/tests/resources/scripts/test_cpp.py index 0082b02d1..ad130e7ed 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -101,6 +101,9 @@ def add_parallel_info(report, parallel): document.write(report, encoding="UTF-8", xml_declaration=True) +default_test_parallel = 2 + + def parallel_run_ctest( command: _tp.Sequence[str], cwd: _pl.Path, @@ -108,7 +111,7 @@ def parallel_run_ctest( shell=False, env=None, timeout=None, - parallel=2, + parallel=default_test_parallel, ) -> None: if parallel == 1: return run_command(command, @@ -576,7 +579,16 @@ def run_unit_tests(build_dir: _pl.Path, timeout=1800): excluded_tests.append("Encoder") excluded_tests.append("EncDec") ctest.extend(["-E", "|".join(excluded_tests)]) - parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) + + parallel = default_test_parallel + if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE", None): + parallel = int(parallel_override) + + parallel_run_ctest(ctest, + cwd=build_dir, + env=cpp_env, + timeout=timeout, + parallel=parallel) def run_single_gpu_tests(build_dir: _pl.Path, @@ -634,7 +646,17 @@ def run_single_gpu_tests(build_dir: _pl.Path, ctest.extend(["-R", "|".join(included_tests)]) if excluded_tests: ctest.extend(["-E", "|".join(excluded_tests)]) - parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) + + parallel = default_test_parallel + if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE", + None): + parallel = int(parallel_override) + + parallel_run_ctest(ctest, + cwd=build_dir, + env=cpp_env, + timeout=timeout, + parallel=parallel) if run_gpt: xml_output_file = build_dir / "results-single-gpu-disagg-executor_gpt.xml" trt_model_test = produce_mpirun_command( diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 577c68d78..e9a7506ce 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -62,7 +62,7 @@ COPY benchmarks benchmarks COPY scripts scripts COPY tensorrt_llm tensorrt_llm COPY 3rdparty 3rdparty -COPY setup.py requirements.txt requirements-dev.txt ./ +COPY .gitmodules setup.py requirements.txt requirements-dev.txt ./ # Create cache directories for pip and ccache RUN mkdir -p /root/.cache/pip /root/.cache/ccache diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index ebd93e81f..70c01917a 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -6,6 +6,10 @@ set -ex # and closest to the version specified in # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 TORCH_VERSION="2.4.0" +# Check the compatible torchvision from +# https://github.com/pytorch/vision/tree/main?tab=readme-ov-file#installation +# and also confirm with https://pypi.org/pypi/torchvision/0.19.0/json +TORCHVISION_VERSION="0.19.0" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { @@ -35,29 +39,44 @@ restore_environment() { install_from_source() { if [[ $SYSTEM_ID == *"centos"* ]]; then - VERSION_ID=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') - if [[ $VERSION_ID == "7" ]]; then - echo "Installation from PyTorch source codes cannot be supported..." - exit 1 - fi + VERSION_ID=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') + if [[ $VERSION_ID == "7" ]]; then + echo "Installation from PyTorch source codes cannot be supported..." 
+ exit 1 + fi fi prepare_environment $1 + export _GLIBCXX_USE_CXX11_ABI=$1 - export TORCH_CUDA_ARCH_LIST="8.0;9.0" + export TORCH_CUDA_ARCH_LIST="8.0;9.0" + export PYTORCH_BUILD_VERSION=${TORCH_VERSION} + export PYTORCH_BUILD_NUMBER=0 pip3 uninstall -y torch cd /tmp - git clone --depth 1 --branch v$TORCH_VERSION https://github.com/pytorch/pytorch + git clone --depth 1 --branch v${TORCH_VERSION} https://github.com/pytorch/pytorch cd pytorch git submodule sync && git submodule update --init --recursive pip3 install -r requirements.txt python3 setup.py install cd /tmp && rm -rf /tmp/pytorch + + export PYTORCH_VERSION=${PYTORCH_BUILD_VERSION} + export FORCE_CUDA=1 + export BUILD_VERSION=${TORCHVISION_VERSION} + pip3 uninstall -y torchvision + cd /tmp + git clone --depth 1 --branch v${TORCHVISION_VERSION} https://github.com/pytorch/vision + cd vision + python3 setup.py install + cd /tmp && rm -rf /tmp/vision + restore_environment $1 } install_from_pypi() { - pip3 install torch==${TORCH_VERSION} + pip3 uninstall -y torch torchvision + pip3 install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} } case "$1" in diff --git a/docs/source/advanced/executor.md b/docs/source/advanced/executor.md index 8955c6bae..500e7cab3 100644 --- a/docs/source/advanced/executor.md +++ b/docs/source/advanced/executor.md @@ -15,37 +15,6 @@ The following sections provide an overview of the main classes defined in the Ex The `Executor` class is responsible for receiving requests from the client, and providing responses for those requests. The executor is constructed by providing a path to a directory containing the TensorRT-LLM engine or buffers containing the engine and the model JSON configuration. The client can create requests and enqueue those requests for execution using the `enqueueRequest` or `enqueueRequests` methods of the `Executor` class. Enqueued requests will be scheduled for execution by the executor, and multiple independent requests can be batched together at every iteration of the main execution loop (a process often referred to as continuous batching or iteration-level batching). Responses for a particular request can be awaited for by calling the `awaitResponses` method, and by providing the request id. Alternatively, responses for any requests can be awaited for by omitting to provide the request id when calling `awaitResponses`. The `Executor` class also allows to cancel requests using the `cancelRequest` method and to obtain per-iteration and per-request statistics using the `getLatestIterationStats`. -#### Logits Post-Processor (optional) - -Users can alter the logits produced by the network, by providing a map of named callbacks of the form: - -``` -std::unordered_map)>> -``` -to an instance of `LogitsPostProcessorConfig`. The map key is the name associated with that logits post-processing callback. Each request can then specify the name of the logits post-processor to use for that particular request, if any. - -The first argument to the callback is the request id, second is the logits tensor, third are the tokens produced by the request so far, fourth is the operation stream used by the logits tensor, and last one is an optional client id. The callback returns a modified tensor of logits. - -Users *must* use the stream to access the logits tensor. For example, performing a addition with a bias tensor should be enqueued on that stream. -Alternatively, users may call `stream->synchronize()`, however, that will slow down the entire execution pipeline. 
- -Multiple requests can share same client id and callback can use different logic based on client id. - -We also provide a batched version that allows altering logits of multiple requests in a batch. This allows further optimizations and reduces callback overheads. - -``` -std::function const&, std::vector&, std::vector> const&, StreamPtr const&, std::vector> const&)> -``` - -A single batched callback can be specified in `LogitsPostProcessorConfig`. Each request can opt to apply this callback by specifying the name of the logits -post-processor as `Request::kBatchedPostProcessorName`. - -Note: Neither callback variant is supported with the `STATIC` batching type for the moment. - -In a multi-GPU run, callback is invoked on all tensor parallel ranks (in last pipeline rank) by default. -For correct execution, user should replicate client-side state accessed by callback on all tensor parallel ranks. -If replication is expensive or infeasible, use `LogitsPostProcessorConfig::setReplicate(false)` to invoke callback only on first tensor parallel rank. - ### The Request Class The `Request` class is used to define properties of the request, such as the input token ids and the maximum number of tokens to generate. The `streaming` parameter can be used to indicate if the request should generate a response for each new generated tokens (`streaming = true`) or only after all tokens have been generated (`streaming = false`). Other mandatory parameters of the request include the sampling configuration (defined by the `SamplingConfig` class) which contains parameters controlling the decoding process and the output configuration (defined by the `OutputConfig` class) which controls what information should be included in the `Result` for a particular response. @@ -83,6 +52,32 @@ The executor can process requests with different beam widths if the following co The request queue of the executor must be empty to allow it to reconfigure itself for a new beam width. This reconfiguration will happen automatically when requests with a new beam width are enqueued. If requests with different beam widths are enqueued at the same time, the executor will encounter an error and terminate all requests prematurely. +### Controlling output with Logits Post-Processor + +Optionally, you can alter the logits produced by the network by providing an instance of `Executor::LogitsPostProcessorConfig`. For instance, this feature can be used to generate JSON formatted output. {cpp:class}`Executor::LogitsPostProcessorConfig ` specifies a map of named callbacks in the following form + +```cpp +std::unordered_map)>> +``` + +The map key is the name associated with that logits post-processing callback. Each request can then specify the name of the logits post-processor to use for that particular request, if any. + +The first argument to the callback is the request id, second is the logits tensor, third are the tokens produced by the request so far, fourth is the operation stream used by the logits tensor, and last one is an optional client id. The callback returns a modified tensor of logits. Multiple requests can share same client id and callback can use different logic based on client id. + +You must use the stream to access the logits tensor. For example, to perform an addition with a bias tensor, the addition operation is enqueued on that stream. Alternatively, you can call `stream->synchronize()`, however, that will slow down the entire execution pipeline. 
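+As a rough illustration of this rule (a sketch only, not the actual TensorRT-LLM callback signature), the body of such a callback can enqueue its logits edit on the stream it receives and return without blocking; the kernel and pointer names below are hypothetical stand-ins:
+
+```cpp
+// Minimal CUDA sketch, assuming raw device pointers for the logits and a bias vector.
+// It only demonstrates the rule above: enqueue the modification on the provided
+// stream and do not synchronize inside the callback.
+#include <cuda_runtime.h>
+
+__global__ void applyBias(float* logits, float const* bias, int vocabSize)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < vocabSize)
+    {
+        logits[idx] += bias[idx];
+    }
+}
+
+// Hypothetical callback body: it receives the logits buffer and the operation stream.
+void addBiasOnStream(float* deviceLogits, float const* deviceBias, int vocabSize, cudaStream_t stream)
+{
+    int const blockSize = 256;
+    int const gridSize = (vocabSize + blockSize - 1) / blockSize;
+    applyBias<<<gridSize, blockSize, 0, stream>>>(deviceLogits, deviceBias, vocabSize);
+    // No cudaStreamSynchronize here: blocking would stall the execution pipeline.
+}
+```
+
+Keeping the edit stream-ordered in this way lets the executor overlap it with the rest of the decoding work.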
+ +The executor also includes a {cpp:class}`LogitsPostProcessorBatched ` method that enables altering logits of multiple requests in a batch. The batched method allows further optimizations and reduces callback overheads. + +```cpp +std::function const&, std::vector&, std::vector> const&, StreamPtr const&, std::vector> const&)> +``` + +A single batched callback can be specified in `LogitsPostProcessorConfig`. Each request can opt to apply this callback by specifying the name of the logits post-processor as `Request::kBatchedPostProcessorName`. + +Note: Neither callback variant is supported with the `STATIC` batching type for the moment. + +In a multi-GPU run, the callback is invoked on all ranks in the first tensor-parallel group, by default. To ensure correct execution, replicate the client-side state that is accessed by the callback on these ranks. If replication is expensive or infeasible, use `LogitsPostProcessorConfig::setReplicate(false)` to invoke the callback only on rank 0. The executor broadcasts the sampled tokens internally to ensure correct execution. + ## C++ Executor API Example Two C++ examples are provided that shows how to use the Executor API and can be found in the [`examples/cpp/executor`](source:examples/cpp/executor/) folder. diff --git a/docs/source/advanced/speculative-decoding.md b/docs/source/advanced/speculative-decoding.md index f59600b86..0f06b11cc 100644 --- a/docs/source/advanced/speculative-decoding.md +++ b/docs/source/advanced/speculative-decoding.md @@ -304,7 +304,7 @@ For guidance on constructing and executing Medusa with the Python runtime, consu - TensorRT-LLM supports Medusa only for Vicuna (fine tuned LLaMA). However, similar to any new model, you can follow the same approach to define your own Medusa model and deploy with TensorRT-LLM. -- We match only tokens during the validation phasem that is `medusa_temperature=0`. +- We match only tokens during the validation phase that is `medusa_temperature=0`. - Beam search is **not** compatible with Medusa. diff --git a/docs/source/helper.py b/docs/source/helper.py index d14fcb963..4a296e627 100644 --- a/docs/source/helper.py +++ b/docs/source/helper.py @@ -93,13 +93,13 @@ def generate_llmapi(): doc_dir.mkdir(exist_ok=True) doc_path = doc_dir / "index.rst" - hlapi_all_file = root_dir / "tensorrt_llm/hlapi/__init__.py" - public_classes_names = extract_all_and_eval(hlapi_all_file)['__all__'] + llmapi_all_file = root_dir / "tensorrt_llm/llmapi/__init__.py" + public_classes_names = extract_all_and_eval(llmapi_all_file)['__all__'] content = underline("API Reference", "-") + "\n\n" for cls_name in public_classes_names: cls_name = cls_name.strip() - content += (f".. autoclass:: tensorrt_llm.hlapi.{cls_name}\n" + content += (f".. autoclass:: tensorrt_llm.llmapi.{cls_name}\n" " :members:\n" " :undoc-members:\n" " :special-members: __init__\n" diff --git a/docs/source/installation/windows.md b/docs/source/installation/windows.md index e105e998f..e8e6af64a 100644 --- a/docs/source/installation/windows.md +++ b/docs/source/installation/windows.md @@ -71,3 +71,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install [the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version) and retry. Check the system path to make sure the latest version installed in `System32` is searched first. 
Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSCV DLL). + +2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies. + +Installing the latest [Build Tools for Visual Studio 2022] (https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue. diff --git a/docs/source/llm-api-examples/customization.md b/docs/source/llm-api-examples/customization.md index 5f574d92a..59bd0d4f1 100644 --- a/docs/source/llm-api-examples/customization.md +++ b/docs/source/llm-api-examples/customization.md @@ -5,7 +5,7 @@ TensorRT-LLM can quantize the Hugging Face model automatically. By setting the appropriate flags in the `LLM` instance. For example, to perform an Int4 AWQ quantization, the following code triggers the model quantization. Please refer to complete list of [supported flags](https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/quantization/mode.html#QuantAlgo) and acceptable values. ``` python -from tensorrt_llm.hlapi import QuantConfig, QuantAlgo +from tensorrt_llm.llmapi import QuantConfig, QuantAlgo quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ) @@ -14,12 +14,12 @@ llm = LLM(, quant_config=quant_config) ## Sampling -SamplingParams can customize the sampling strategy to control LLM generated responses, such as beam search, temperature, and [others](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/hlapi/utils.py#L55-L76). +SamplingParams can customize the sampling strategy to control LLM generated responses, such as beam search, temperature, and [others](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/llmapi/utils.py#L55-L76). As an example, to enable beam search with a beam size of 4, set the `sampling_params` as follows: ```python -from tensorrt_llm.hlapi import LLM, SamplingParams, BuildConfig +from tensorrt_llm.llmapi import LLM, SamplingParams, BuildConfig build_config = BuildConfig() build_config.max_beam_width = 4 @@ -38,7 +38,7 @@ for output in llm.generate(, sampling_params=sampling_params): * [SamplingConfig](https://nvidia.github.io/TensorRT-LLM/_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime14SamplingConfigE) * [OutputConfig](https://nvidia.github.io/TensorRT-LLM/_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor12OutputConfigE) -Refer to the [class documentation](https://nvidia.github.io/TensorRT-LLM/llm-api/index.html#tensorrt_llm.hlapi.SamplingParams) for more details. +Refer to the [class documentation](https://nvidia.github.io/TensorRT-LLM/llm-api/index.html#tensorrt_llm.llmapi.SamplingParams) for more details. ## Build Configuration @@ -55,11 +55,11 @@ Refer to the [buildconfig documentation](https://github.com/NVIDIA/TensorRT-LLM/ ## Runtime Customization -Similar to `build_config`, you can also customize the runtime configuration with the `runtime_config`, `peft_cache_config` or other [arguments](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/hlapi/llm_utils.py#L186-L223) borrowed from the lower-level APIs. These runtime configuration options provide additional flexibility with respect to KV cache management, GPU memory allocation and so on. 
Refer to the following example: +Similar to `build_config`, you can also customize the runtime configuration with the `runtime_config`, `peft_cache_config` or other [arguments](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/llmapi/llm_utils.py#L186-L223) borrowed from the lower-level APIs. These runtime configuration options provide additional flexibility with respect to KV cache management, GPU memory allocation and so on. Refer to the following example: ```python -from tensorrt_llm.hlapi import LLM, KvCacheConfig +from tensorrt_llm.llmapi import LLM, KvCacheConfig llm = LLM(, kv_cache_config=KvCacheConfig( diff --git a/docs/source/llm-api-examples/index.md b/docs/source/llm-api-examples/index.md index 9018aa9fa..14388a9c2 100644 --- a/docs/source/llm-api-examples/index.md +++ b/docs/source/llm-api-examples/index.md @@ -13,6 +13,7 @@ The LLM API can be used for both offline or online usage. See more examples of t * [LLM Generate Async Streaming](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_generate_async_streaming.html) * [LLM Quantization](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_quantization.html) * [LLM Auto Parallel](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_auto_parallel.html) +* [LLM Logits Processor](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_logits_processor.html) For more details on how to fully utilize this API, check out: diff --git a/docs/source/media/image-09-29-2024.png b/docs/source/media/image-09-29-2024.png deleted file mode 100644 index 840c76907..000000000 Binary files a/docs/source/media/image-09-29-2024.png and /dev/null differ diff --git a/docs/source/media/image-10-07-2024.png b/docs/source/media/image-10-07-2024.png new file mode 100644 index 000000000..61a94bad2 Binary files /dev/null and b/docs/source/media/image-10-07-2024.png differ diff --git a/docs/source/overview.md b/docs/source/overview.md index 8c321fa39..0de8a005e 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -30,7 +30,8 @@ TensorRT-LLM consists of pre– and post-processing steps and multi-GPU multi-no ### Latest GPU Support -TensorRT-LLM supports GPUs based on the NVIDIA Hopper, NVIDIA Ada Lovelace, NVIDIA Ampere, NVIDIA Turing, and NVIDIA Volta architectures. Certain limitations may, however, apply. Refer to the {ref}`support-matrix` for more information. +TensorRT-LLM supports GPUs based on the NVIDIA Hopper, NVIDIA Ada Lovelace, and NVIDIA Ampere architectures. +Certain limitations might apply. Refer to the {ref}`support-matrix` for more information. ### Native Windows Support diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 97e3affe1..b4c0646a8 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -34,83 +34,191 @@ and shows the throughput client-server scenario under maximum load. The performance numbers below were collected using the steps described in this document. 
-**All data in the table below was generated using version 0.12.0 and presents token throughput in tokens/second.** - -| | | | | | | | | | -| ------------ | ------------------------ | ------------- | --------------- | ----------- | -------------- | -------------- | -------------- | ------- | -| | | **GPU** | H200 141GB HBM3 | GH200 120GB | H100 80GB HBM3 | H100 80GB HBM3 | A100-SXM4-80GB | L40S | -| | | **Precision** | FP8 | FP8 | FP8 | FP16 | FP16 | FP8 | -| **Model** | **Input/Output Lengths** | **TP** | | | | | | | -| GPTJ 6B | 128/128 | 1 | 24834.76 | 22454.79 | 24429.55 | 13085.91 | 5864.81 | 7647.24 | -| | 128/2048 | 1 | 8348.93 | 6656.25 | 7831.38 | 3882.21 | 2194.57 | 1843.91 | -| | 128/4096 | 1 | 5062.80 | 3678.91 | 3968.98 | 2046.53 | 1118.22 | 980.67 | -| | 2048/128 | 1 | 2776.53 | 2491.03 | 2724.38 | 1488.56 | 657.01 | 741.06 | -| | 2048/2048 | 1 | 3631.54 | 2994.81 | 3004.17 | 1280.54 | 854.37 | 754.16 | -| LLaMA v2 7B | 128/128 | 1 | 19706.35 | 17803.58 | 19068.99 | 11393.48 | 5272.39 | 6345.72 | -| | 128/2048 | 1 | 7651.12 | 5472.34 | 6610.03 | 2964.65 | 1785.79 | 1551.37 | -| | 128/4096 | 1 | 4424.90 | 3271.61 | 3649.38 | 1596.87 | 957.12 | 817.24 | -| | 2048/128 | 1 | 2385.54 | 2035.42 | 2271.63 | 1189.06 | 564.77 | 625.09 | -| | 2048/2048 | 1 | 3191.34 | 2726.29 | 2802.41 | 1243.96 | 735.19 | 641.56 | -| LLaMA v3 8B | 128/128 | 1 | 28288.75 | 25420.52 | 27399.75 | 15567.44 | 6586.88 | 8745.80 | -| | 128/2048 | 1 | 23230.62 | 16426.68 | 19198.73 | 8817.39 | 4882.13 | 5084.49 | -| | 128/4096 | 1 | 16144.44 | 9832.66 | 12084.97 | 5352.37 | 3079.90 | 2755.13 | -| | 2048/128 | 1 | 3623.79 | 3290.22 | 3463.26 | 1852.48 | 781.63 | 980.86 | -| | 2048/2048 | 1 | 11093.62 | 7573.35 | 8894.11 | 3986.83 | 2268.13 | 2051.79 | -| Mistral 7B | 128/128 | 1 | 30223.01 | 27696.90 | 29788.46 | 16319.25 | 6807.02 | 9612.58 | -| | 128/2048 | 1 | 24989.54 | 17942.29 | 20509.72 | 9982.01 | 5296.02 | 5444.89 | -| | 128/4096 | 1 | 17036.14 | 10846.03 | 12807.80 | 5718.89 | 3241.33 | 2931.17 | -| | 2048/128 | 1 | 3678.80 | 3294.02 | 3521.71 | 1887.75 | 786.43 | 1002.49 | -| | 2048/2048 | 1 | 11510.54 | 8357.75 | 9214.61 | 4284.82 | 2363.25 | 2154.26 | -| Mixtral 8x7B | 128/128 | 2 | 24895.03 | 8785.80 | 24394.71 | 15529.86 | 5921.41 | | -| | | 4 | 42014.24 | 38828.53 | 40197.42 | 28132.17 | 11414.95 | 6820.26 | -| | 128/2048 | 2 | 29389.21 | 5474.69 | 20873.02 | 7066.02 | 4306.98 | | -| | | 4 | 52348.10 | 41573.66 | 40588.05 | 21285.72 | 10974.83 | 7467.15 | -| | 128/4096 | 2 | 21480.27 | 2277.66 | 12838.28 | 3986.01 | 2400.11 | | -| | | 4 | 39182.04 | 28626.55 | 28337.31 | 12447.13 | 7278.89 | 5233.43 | -| | 2048/128 | 2 | 2934.44 | 1003.51 | 2898.27 | 1834.77 | 693.51 | | -| | | 4 | 5152.40 | 4724.01 | 5028.61 | 3393.18 | 1362.93 | 805.49 | -| | 2048/2048 | 2 | 14029.17 | 2671.88 | 10479.45 | 3531.31 | 1945.88 | | -| | | 4 | 25436.05 | 20302.56 | 19971.72 | 9622.66 | 5221.74 | 3616.30 | -| LLaMA v3 70B | 128/128 | 2 | 5386.88 | | | 2959.22 | 1301.14 | | -| | | 4 | 8944.26 | 8587.01 | 8642.05 | 5966.47 | 2413.95 | | -| | | 8 | 16125.20 | | 15397.47 | 10406.55 | 4548.32 | 1364.08 | -| | 128/2048 | 2 | 7007.27 | | | 720.73 | 500.83 | | -| | | 4 | 12906.75 | 10761.53 | 8978.95 | 4736.61 | 2380.02 | | -| | | 8 | 19417.37 | | 14822.93 | 6672.14 | 3815.08 | 1809.40 | -| | 128/4096 | 2 | 6183.85 | | | 369.29 | 251.24 | | -| | | 4 | 8859.54 | 7270.77 | 6073.48 | 2969.99 | 1634.82 | | -| | | 8 | 13969.95 | | 10094.57 | 4358.77 | 2847.54 | 1313.78 | -| | 2048/128 | 2 | 696.59 | | | 301.46 | 140.88 | | -| | 
| 4 | 1044.35 | 1000.55 | 1022.06 | 681.72 | 278.76 | | -| | | 8 | 2018.47 | | 1933.15 | 1279.46 | 543.73 | 163.36 | -| | 2048/2048 | 2 | 3525.18 | | | | 87.54 | | -| | | 4 | 6550.76 | 4859.38 | 4870.26 | 2379.66 | 1209.69 | | -| | | 8 | 9706.95 | | 7670.04 | 3692.41 | 2192.28 | 895.23 | -| LLaMA v2 70B | 128/128 | 2 | 6355.16 | | | 2927.71 | 1374.05 | | -| | | 4 | 10818.97 | 10819.19 | 10754.99 | 6603.10 | 2765.94 | | -| | | 8 | 16667.25 | | 16074.84 | 11369.11 | 4796.89 | 1402.92 | -| | 128/2048 | 2 | 6185.77 | | | 668.52 | 445.04 | | -| | | 4 | 12884.76 | 11356.48 | 8870.71 | 5067.06 | 2710.53 | | -| | | 8 | 19053.13 | | 17534.62 | 8805.16 | 5665.93 | 2203.33 | -| | 128/4096 | 2 | 4873.24 | | | 334.10 | 215.70 | | -| | | 4 | 8664.90 | 6311.85 | 7564.99 | 3354.02 | 1884.46 | | -| | | 8 | 15110.32 | | 10584.03 | 5373.10 | 3672.80 | 1787.76 | -| | 2048/128 | 2 | 732.09 | | | 302.49 | 141.70 | | -| | | 4 | 1272.90 | 1269.58 | 1265.80 | 774.93 | 320.79 | | -| | | 8 | 2015.77 | | 1943.96 | 1355.78 | 569.48 | 165.52 | -| | 2048/2048 | 2 | 3508.50 | | | 321.95 | 212.97 | | -| | | 4 | 6642.69 | 5545.83 | 4889.26 | 2439.10 | 1276.58 | | -| | | 8 | 10178.71 | | 8071.77 | 4275.74 | 2589.60 | 1083.45 | -| Falcon 180B | 128/128 | 4 | 5129.55 | | | | | | -| | | 8 | 8370.98 | | 8268.72 | | | | -| | 128/2048 | 4 | 7823.79 | | | | | | -| | | 8 | 13278.59 | | 13107.48 | | | | -| | 128/4096 | 4 | 6374.10 | | | | | | -| | | 8 | 12660.89 | | 10493.79 | | | | -| | 2048/128 | 4 | 601.67 | | | | | | -| | | 8 | 1002.57 | | 991.22 | | | | -| | 2048/2048 | 4 | 3869.76 | | | | | | -| | | 8 | 7134.33 | | 6386.83 | | | | +**All data in the table below was generated using version 0.13.0 and presents token throughput in tokens/second.** + +| | | | | | | | | | +| --------------- | ------------------------ | ------------- | ------------------- | --------------- | ------------------ | ------------------ | ------------------ | -------- | +| | | **GPU** | **H200 141GB HBM3** | **GH200 120GB** | **H100 80GB HBM3** | **H100 80GB HBM3** | **A100-SXM4-80GB** | **L40S** | +| | | **Precision** | **FP8** | **FP8** | **FP8** | **FP16** | **FP16** | **FP8** | +| **Model** | **Input/Output Lengths** | **TP** | | | | | | | +| GPTJ 6B | 128/128 | 1 | 24,533.54 | 22,368.50 | 24,318.61 | 12,936.63 | 5,964.19 | 7,688.44 | +| | 128/2048 | 1 | 8,375.67 | 6,588.73 | 7,829.91 | 3,931.61 | 2,215.88 | 1,842.82 | +| | 128/4096 | 1 | 5,048.59 | 3,662.81 | 3,955.28 | 2,041.06 | 1,118.12 | 980.23 | +| | 2048/128 | 1 | 2,770.27 | 2,520.37 | 2,698.08 | 1,479.48 | 650.09 | 746.54 | +| | 5000/500 | 1 | 1,791.39 | 1,449.23 | 1,623.17 | 818.80 | 436.85 | 413.33 | +| | 500/2000 | 1 | 6,770.60 | 5,565.62 | 6,149.65 | 3,030.03 | 1,673.05 | 1,538.45 | +| | 1000/1000 | 1 | 6,465.73 | 5,580.37 | 6,078.80 | 2,797.48 | 1,673.45 | 1,531.57 | +| | 2048/2048 | 1 | 3,637.42 | 2,998.01 | 3,060.80 | 1,285.08 | 845.83 | 753.55 | +| LLaMA v3.1 8B | 128/128 | 1 | 28,125.59 | 26,045.60 | 27,147.22 | 15,647.83 | 6,687.04 | 8,548.90 | +| | 128/2048 | 1 | 22,989.20 | 16,497.79 | 19,221.02 | 8,882.95 | 4,918.53 | 4,988.61 | +| | 128/4096 | 1 | 16,077.62 | 9,637.91 | 11,856.11 | 5,462.96 | 3,054.46 | 2,768.91 | +| | 2048/128 | 1 | 3,625.83 | 3,357.60 | 3,497.30 | 1,859.37 | 796.17 | 1,000.90 | +| | 5000/500 | 1 | 3,823.76 | 3,217.40 | 3,276.69 | 1,687.74 | 788.66 | 872.14 | +| | 500/2000 | 1 | 19,382.37 | 15,128.77 | 13,996.05 | 6,834.76 | 3,929.83 | 3,911.14 | +| | 1000/1000 | 1 | 16,435.21 | 12,355.41 | 13,411.43 | 7,160.92 | 3,592.16 | 3,648.21 | +| | 2048/2048 | 1 | 
11,072.97 | 7,850.75 | 8,851.23 | 4,152.21 | 2,269.78 | 2,055.78 | +| | 20000/2000 | 1 | 1,634.98 | 1,200.89 | 1,278.04 | 595.89 | 316.43 | 263.75 | +| LLaMA v3 8B | 128/128 | 1 | 27,940.47 | 26,117.13 | 27,156.81 | 15,489.11 | 6,656.98 | 8,734.57 | +| | 128/2048 | 1 | 23,228.98 | 16,417.04 | 19,209.17 | 8,901.43 | 4,967.37 | 5,004.93 | +| | 128/4096 | 1 | 15,980.94 | 9,351.95 | 11,889.67 | 5,455.91 | 3,053.27 | 2,768.15 | +| | 2048/128 | 1 | 3,631.45 | 3,339.90 | 3,476.37 | 1,918.56 | 796.28 | 1,050.68 | +| | 5000/500 | 1 | 3,836.98 | 3,186.22 | 3,279.24 | 1,668.42 | 792.95 | 860.31 | +| | 500/2000 | 1 | 19,725.45 | 15,241.74 | 14,218.30 | 6,816.62 | 3,899.64 | 3,990.73 | +| | 1000/1000 | 1 | 16,201.60 | 12,049.81 | 13,371.60 | 7,041.47 | 3,617.10 | 3,679.10 | +| | 2048/2048 | 1 | 11,097.69 | 7,255.55 | 8,852.87 | 4,251.45 | 2,269.68 | 2,048.94 | +| LLaMA v2 7B | 128/128 | 1 | 19,549.13 | 17,823.45 | 19,298.99 | 11,436.31 | 5,238.68 | 6,396.62 | +| | 128/2048 | 1 | 7,675.14 | 5,438.53 | 6,607.33 | 2,985.61 | 1,807.39 | 1,566.03 | +| | 128/4096 | 1 | 4,397.83 | 3,310.09 | 3,628.46 | 1,575.35 | 957.24 | 821.83 | +| | 2048/128 | 1 | 2,392.31 | 2,064.18 | 2,304.02 | 1,157.55 | 560.35 | 619.83 | +| | 5000/500 | 1 | 1,570.37 | 1,250.11 | 1,419.09 | 624.75 | 366.39 | 347.03 | +| | 500/2000 | 1 | 6,044.15 | 4,717.51 | 5,188.69 | 2,382.75 | 1,408.58 | 1,231.78 | +| | 1000/1000 | 1 | 5,896.10 | 4,825.24 | 5,208.97 | 2,462.65 | 1,431.92 | 1,277.79 | +| | 2048/2048 | 1 | 3,193.42 | 2,693.21 | 2,792.53 | 1,263.11 | 734.38 | 641.47 | +| Mistral 7B | 128/128 | 1 | 30,152.19 | 27,738.08 | 29,672.75 | 16,711.12 | 6,863.59 | 9,676.88 | +| | 128/2048 | 1 | 24,742.09 | 17,528.14 | 20,318.60 | 9,774.11 | 5,321.44 | 5,437.25 | +| | 128/4096 | 1 | 16,905.49 | 10,671.38 | 12,715.46 | 5,740.41 | 3,257.23 | 2,941.08 | +| | 2048/128 | 1 | 3,676.37 | 3,369.77 | 3,502.83 | 1,893.42 | 796.00 | 996.65 | +| | 5000/500 | 1 | 3,890.07 | 3,401.45 | 3,358.65 | 1,740.69 | 807.07 | 904.45 | +| | 500/2000 | 1 | 20,788.70 | 15,035.59 | 15,962.94 | 7,494.80 | 4,168.89 | 4,088.52 | +| | 1000/1000 | 1 | 17,620.46 | 13,362.84 | 14,213.48 | 7,281.07 | 3,794.31 | 3,972.63 | +| | 2048/2048 | 1 | 11,747.88 | 8,599.03 | 9,200.19 | 4,349.39 | 2,320.50 | 2,170.16 | +| | 20000/2000 | 1 | 1,693.41 | 1,271.85 | 1,299.05 | 609.91 | 324.52 | 276.19 | +| LLaMA v3.1 405B | 128/128 | 8 | 3,734.50 | | | | | | +| | 128/2048 | 8 | 3,039.70 | | | | | | +| | 128/4096 | 8 | 3,144.97 | | | | | | +| | 2048/128 | 8 | 454.17 | | | | | | +| | 5000/500 | 8 | 459.91 | | | | | | +| | 500/2000 | 8 | 2,967.98 | | | | | | +| | 1000/1000 | 8 | 2,259.32 | | | | | | +| | 2048/2048 | 8 | 2,067.15 | | | | | | +| | 20000/2000 | 8 | 447.67 | | | | | | +| LLaMA v3.1 70B | 128/128 | 1 | 3,923.61 | 2,998.99 | 2,168.72 | | | | +| | | 2 | 5,358.16 | 1,839.02 | 5,215.12 | 3,156.10 | 1,340.20 | | +| | | 4 | 8,969.59 | 8,655.98 | 8,677.59 | 5,845.53 | 2,426.46 | 1,434.63 | +| | | 8 | 16,449.68 | | 15,711.60 | 10,643.75 | 4,491.42 | 1,365.36 | +| | 128/2048 | 1 | 3,503.59 | 1,343.53 | 344.22 | | | | +| | | 2 | 7,068.42 | 1,146.08 | 5,654.43 | 801.82 | 498.44 | | +| | | 4 | 12,890.95 | 10,358.10 | 9,377.87 | 4,791.11 | 2,460.91 | 1,748.87 | +| | | 8 | 19,947.02 | | 15,168.97 | 6,892.18 | 4,148.33 | 1,890.62 | +| | 128/4096 | 1 | 2,314.83 | | | | | | +| | | 2 | 6,227.19 | 896.56 | 3,302.41 | 413.22 | 268.86 | | +| | | 4 | 10,059.64 | 6,628.22 | 6,501.69 | 3,056.98 | 1,660.93 | 1,180.87 | +| | | 8 | 14,393.28 | | 9,699.99 | 4,238.15 | 2,705.77 | 1,417.60 | +| | 2048/128 | 1 | 
459.73 | 372.44 | 211.51 | | | | +| | | 2 | 689.30 | 280.61 | 690.05 | 323.66 | 143.39 | | +| | | 4 | 1,047.96 | 1,015.14 | 1,016.24 | 672.37 | 278.87 | 167.87 | +| | | 8 | 2,061.19 | | 1,964.49 | 1,273.97 | 539.57 | 163.91 | +| | 5000/500 | 1 | 534.79 | 283.19 | 112.21 | | | | +| | | 2 | 943.78 | 337.04 | 897.36 | 224.31 | 115.63 | | +| | | 4 | 1,437.45 | 1,383.61 | 1,329.82 | 851.12 | 361.39 | 235.90 | +| | | 8 | 2,795.95 | | 2,472.69 | 1,438.10 | 679.27 | 224.33 | +| | 500/2000 | 1 | 2,758.24 | 1,083.48 | | | | | +| | | 2 | 6,063.53 | 851.46 | 4,347.69 | 652.34 | 423.06 | | +| | | 4 | 10,061.89 | 9,090.78 | 8,378.16 | 3,441.34 | 2,072.88 | 1,436.41 | +| | | 8 | 16,139.49 | | 10,790.85 | 5,792.17 | 3,115.20 | 1,512.78 | +| | 1000/1000 | 1 | 2,539.65 | 728.79 | | | | | +| | | 2 | 4,572.03 | 1,223.92 | 3,880.41 | 737.40 | 451.82 | | +| | | 4 | 7,612.56 | 6,705.02 | 6,553.00 | 3,655.64 | 1,731.86 | 1,113.18 | +| | | 8 | 12,660.86 | | 11,121.10 | 5,599.45 | 3,013.95 | 1,120.73 | +| | 2048/2048 | 1 | 1,753.58 | 611.08 | 161.60 | | | | +| | | 2 | 3,407.26 | 626.26 | 2,432.55 | | 108.91 | | +| | | 4 | 6,565.77 | 4,864.55 | 4,948.83 | 2,396.06 | 1,220.93 | 855.44 | +| | | 8 | 9,948.56 | | 8,527.52 | 3,819.60 | 2,103.68 | 924.89 | +| | 20000/2000 | 1 | 262.82 | 88.89 | | | | | +| | | 2 | 598.19 | 177.04 | 414.17 | | | | +| | | 4 | 1,047.27 | 958.88 | 856.31 | 375.85 | 187.42 | 140.73 | +| | | 8 | 1,793.52 | | 1,359.27 | 650.78 | 344.41 | 122.04 | +| LLaMA v3 70B | 128/128 | 1 | 3,924.02 | 3,161.73 | 2,177.84 | | | | +| | | 2 | 5,388.22 | 1,551.84 | 5,205.80 | 3,186.61 | 1,321.55 | | +| | | 4 | 8,958.95 | 8,618.55 | 8,678.68 | 5,857.16 | 2,424.68 | 1,432.46 | +| | | 8 | 16,375.41 | | 15,703.26 | 10,627.36 | 4,490.19 | 1,333.09 | +| | 128/2048 | 1 | 3,519.24 | 1,346.37 | 353.68 | | | | +| | | 2 | 7,071.54 | 862.54 | 5,878.06 | 802.98 | 512.11 | | +| | | 4 | 12,876.38 | 10,015.23 | 8,929.23 | 4,768.27 | 2,458.73 | 1,737.31 | +| | | 8 | 20,013.92 | | 15,171.91 | 6,875.97 | 3,906.35 | 1,892.41 | +| | 128/4096 | 1 | 2,310.85 | | | | | | +| | | 2 | 6,199.95 | 602.98 | 3,311.05 | 413.29 | 269.02 | | +| | | 4 | 9,633.49 | 7,370.19 | 6,489.95 | 3,053.89 | 1,677.51 | 1,199.71 | +| | | 8 | 14,552.09 | | 9,632.02 | 4,259.39 | 2,697.61 | 1,358.34 | +| | 2048/128 | 1 | 458.75 | 371.70 | 210.27 | | | | +| | | 2 | 694.00 | 277.85 | 692.74 | 321.71 | 144.61 | | +| | | 4 | 1,048.84 | 1,016.03 | 1,022.77 | 690.10 | 279.06 | 168.52 | +| | | 8 | 2,072.33 | | 1,976.76 | 1,273.41 | 542.93 | 158.63 | +| | 5000/500 | 1 | 533.37 | 303.33 | 112.68 | | | | +| | | 2 | 936.82 | 379.62 | 899.29 | 224.65 | 115.00 | | +| | | 4 | 1,442.76 | 1,384.62 | 1,326.95 | 853.73 | 361.06 | 235.19 | +| | | 8 | 2,797.36 | | 2,483.56 | 1,437.15 | 678.70 | 225.15 | +| | 500/2000 | 1 | 2,763.89 | 1,074.62 | 293.47 | | | | +| | | 2 | 6,054.46 | 1,109.13 | 4,356.55 | 683.11 | 423.82 | | +| | | 4 | 10,103.08 | 7,325.93 | 8,370.32 | 3,436.29 | 2,064.47 | 1,412.78 | +| | | 8 | 16,857.45 | | 10,760.65 | 5,665.02 | 3,159.89 | 1,517.76 | +| | 1000/1000 | 1 | 2,540.45 | 1,164.45 | | | | | +| | | 2 | 4,590.38 | 1,040.64 | 3,879.25 | 768.53 | 453.73 | | +| | | 4 | 7,606.92 | 6,655.61 | 6,547.23 | 3,655.19 | 1,732.86 | 1,117.53 | +| | | 8 | 12,660.32 | | 11,155.47 | 5,617.24 | 2,894.58 | 1,126.50 | +| | 2048/2048 | 1 | 1,746.77 | 610.87 | 162.10 | | | | +| | | 2 | 3,405.72 | 738.51 | 2,548.70 | | 108.66 | | +| | | 4 | 6,571.34 | 4,880.28 | 5,060.39 | 2,391.55 | 1,222.11 | 854.65 | +| | | 8 | 9,923.96 | | 8,480.48 | 3,826.38 | 2,181.07 | 927.54 | +| LLaMA v2 
70B | 128/128 | 1 | 3,969.25 | 3,502.35 | 3,413.82 | | | | +| | | 2 | 6,394.64 | 3,252.69 | 6,432.82 | 3,170.28 | 1,336.48 | | +| | | 4 | 11,031.42 | 11,126.95 | 10,865.42 | 6,420.88 | 2,766.00 | 1,487.71 | +| | | 8 | 17,060.04 | | 16,384.83 | 11,146.15 | 4,742.74 | 1,404.99 | +| | 128/2048 | 1 | 3,742.99 | 1,660.81 | | | | | +| | | 2 | 6,453.25 | 1,335.80 | 5,775.34 | 757.21 | 476.46 | | +| | | 4 | 13,869.67 | 11,098.69 | 9,536.82 | 5,274.27 | 2,686.16 | 1,880.22 | +| | | 8 | 19,220.48 | | 17,715.01 | 8,904.94 | 5,520.41 | 2,186.68 | +| | 128/4096 | 1 | 2,459.63 | | 446.60 | | | | +| | | 2 | 4,831.03 | 684.68 | 3,354.60 | 385.98 | 235.22 | | +| | | 4 | 8,988.84 | 8,397.13 | 7,619.62 | 3,228.36 | 1,941.07 | 1,318.51 | +| | | 8 | 15,115.41 | | 12,506.95 | 5,996.81 | 3,539.36 | 1,782.93 | +| | 2048/128 | 1 | 458.88 | 400.31 | 328.90 | | | | +| | | 2 | 745.71 | 457.57 | 742.17 | 308.02 | 138.81 | | +| | | 4 | 1,297.10 | 1,330.90 | 1,270.78 | 755.30 | 321.72 | 171.67 | +| | | 8 | 2,060.53 | | 2,009.57 | 1,348.71 | 561.71 | 160.37 | +| | 5000/500 | 1 | 548.46 | 364.00 | 224.17 | | | | +| | | 2 | 1,020.86 | 335.07 | 885.67 | 212.20 | 112.43 | | +| | | 4 | 1,759.69 | 1,683.26 | 1,590.94 | 837.57 | 386.78 | 231.54 | +| | | 8 | 2,839.69 | | 2,546.12 | 1,570.91 | 709.66 | 238.59 | +| | 500/2000 | 1 | 3,019.28 | 1,364.66 | 716.54 | | | | +| | | 2 | 6,402.94 | 1,292.24 | 4,462.98 | 629.21 | 387.61 | | +| | | 4 | 12,429.18 | 8,951.07 | 8,753.09 | 4,012.41 | 2,158.17 | 1,517.53 | +| | | 8 | 16,789.12 | | 15,260.29 | 7,384.79 | 4,104.80 | 1,739.28 | +| | 1000/1000 | 1 | 2,706.04 | 1,449.83 | | | | | +| | | 2 | 4,693.24 | 960.39 | 3,958.45 | 736.68 | 425.70 | | +| | | 4 | 8,557.11 | 7,278.64 | 6,817.41 | 3,866.05 | 1,876.40 | 1,188.91 | +| | | 8 | 13,483.04 | | 11,511.74 | 6,543.96 | 3,285.82 | 1,241.42 | +| | 2048/2048 | 1 | 1,911.20 | 798.50 | 412.37 | | | | +| | | 2 | 3,408.82 | 767.24 | 2,551.21 | 388.82 | 226.60 | | +| | | 4 | 6,702.46 | 5,354.80 | 5,212.02 | 2,512.22 | 1,316.92 | 891.95 | +| | | 8 | 10,348.65 | | 8,016.14 | 4,414.75 | 2,492.09 | 1,083.26 | +| Mixtral 8x7B | 128/128 | 2 | 25,135.25 | 8,512.51 | 24,572.90 | 15,395.59 | 5,927.88 | | +| | | 4 | 42,394.61 | 40,148.01 | 40,309.25 | 27,747.43 | 11,205.51 | 6,784.44 | +| | | 8 | 54,648.80 | | 51,683.16 | 40,116.51 | 18,496.66 | 6,437.72 | +| | 128/2048 | 2 | 29,412.17 | 3,271.02 | 20,938.80 | 7,391.51 | 4,278.79 | | +| | | 4 | 52,603.13 | 43,071.34 | 40,580.94 | 21,332.15 | 10,946.58 | 7,475.05 | +| | | 8 | 70,427.00 | | 64,161.64 | 41,101.18 | 21,235.99 | 9,955.21 | +| | 128/4096 | 2 | 21,312.11 | 2,254.56 | | 3,896.02 | 2,388.14 | | +| | | 4 | 39,353.01 | 30,065.77 | | | 7,108.03 | 5,232.44 | +| | | 8 | 32,992.62 | | 47,860.65 | 27,261.67 | 15,943.70 | 8,081.21 | +| | 2048/128 | 2 | 2,946.01 | 921.87 | 2,894.09 | 1,790.49 | 684.71 | | +| | | 4 | 5,237.58 | 5,056.60 | 4,988.14 | 3,354.89 | 1,338.54 | 803.50 | +| | | 8 | 7,053.32 | | 6,559.63 | 5,072.46 | 2,244.39 | 753.39 | +| | 5000/500 | 2 | 3,848.10 | 997.06 | 3,630.24 | 1,656.04 | 739.84 | | +| | | 4 | 6,877.65 | 6,466.39 | 6,237.22 | 3,607.46 | 1,619.49 | 1,048.60 | +| | | 8 | 9,531.26 | | 8,709.34 | 6,237.96 | 2,927.13 | 1,109.25 | +| | 500/2000 | 2 | 23,539.24 | 2,773.86 | 16,886.30 | 5,773.33 | 3,325.73 | | +| | | 4 | 40,035.05 | 33,478.35 | 32,047.73 | 16,897.03 | 8,908.09 | 6,153.32 | +| | | 8 | 60,572.77 | | 41,597.80 | 31,392.32 | 16,954.54 | 7,980.34 | +| | 1000/1000 | 2 | 18,644.51 | 4,540.15 | 14,154.95 | 5,826.43 | 3,289.27 | | +| | | 4 | 32,709.62 | 29,046.16 | 25,291.30 | 
14,307.91 | 7,461.63 | 4,697.19 | +| | | 8 | 44,072.88 | | 40,628.46 | 27,633.48 | 13,741.62 | 5,706.17 | +| | 2048/2048 | 2 | 14,017.70 | 2,870.77 | 10,448.79 | 3,535.21 | 1,954.32 | | +| | | 4 | 25,550.44 | 21,488.32 | 19,977.11 | 9,620.99 | 5,191.30 | 3,593.18 | +| | | 8 | 24,999.94 | | 31,678.85 | 19,372.52 | 10,572.07 | 4,860.61 | +| | 20000/2000 | 2 | 2,195.84 | 367.81 | 1,583.86 | 626.60 | 320.41 | | +| | | 4 | 4,086.41 | 3,301.28 | 2,982.42 | 1,586.09 | 807.67 | 579.49 | +| | | 8 | 5,797.57 | | 5,163.91 | 3,106.98 | 1,653.55 | 821.64 | *TP stands for Tensor Parallelism* ## Reproducing Benchmarked Results @@ -169,7 +277,10 @@ remain in the system longer and therefore require less requests to achieve stead | 128 | 4096 | 4224 | 1500 | | 2048 | 128 | 2176 | 3000 | | 2048 | 2048 | 4096 | 1500 | - +| 5000 | 500 | 5500 | 1500 | +| 1000 | 1000 | 2000 | 3000 | +| 500 | 2000 | 2500 | 3000 | +| 20000 | 2000 | 22000 | 1000 | ## Engine Building diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index bb7071aab..c6b301807 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -75,7 +75,8 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA The following table shows the supported hardware for TensorRT-LLM. -If a GPU is not listed, it is important to note that TensorRT-LLM is expected to work on GPUs based on the Volta, Turing, Ampere, Hopper, and Ada Lovelace architectures. Certain limitations may, however, apply. +If a GPU architecture is not listed, the TensorRT-LLM team does not develop or test the software on the architecture and support is limited to community support. +In addition, older architectures can have limitations for newer software releases. ```{list-table} :header-rows: 1 @@ -90,8 +91,6 @@ If a GPU is not listed, it is important to note that TensorRT-LLM is expected to - [NVIDIA Hopper Architecture](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/) - [NVIDIA Ada Lovelace Architecture](https://www.nvidia.com/en-us/technologies/ada-architecture/) - [NVIDIA Ampere Architecture](https://www.nvidia.com/en-us/data-center/ampere-architecture/) - - [NVIDIA Turing Architecture](https://www.nvidia.com/en-us/geforce/turing/) - - [NVIDIA Volta Architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) (experimental) ``` (support-matrix-software)= @@ -114,14 +113,8 @@ The following table shows the supported software for TensorRT-LLM. - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4 - Ada Lovelace (SM89) - FP32, FP16, BF16, FP8, INT8, INT4 - Ampere (SM80, SM86) - FP32, FP16, BF16, INT8, INT4[^smgte89] - - Turing (SM75) - FP32, FP16, INT8[^smooth], INT4 - - Volta (SM70) - FP32, FP16, INT8[^smooth], INT4[^smlt75] ``` -[^smooth]: INT8 SmoothQuant is not supported on SM70 and SM75. - -[^smlt75]: INT4 AWQ and GPTQ are not supported on SM < 75. - [^smgte89]: INT4 AWQ and GPTQ with FP8 activations require SM >= 89. [^encdec]: Encoder-Decoder provides general encoder-decoder functionality that supports many encoder-decoder models such as T5 family, BART family, Whisper family, NMT family, and so on. 
diff --git a/docs/source/reference/troubleshooting.md b/docs/source/reference/troubleshooting.md index 9fcffae71..463449599 100644 --- a/docs/source/reference/troubleshooting.md +++ b/docs/source/reference/troubleshooting.md @@ -258,8 +258,13 @@ SLURM, depending upon the SLURM version you are using: Please configure as appropriate and try again. -------------------------------------------------------------------------- ``` + +You may also experience other problems, such as the program hanging at startup. + As a rule of thumb, if you are running TensorRT-LLM interactively on a Slurm node, prefix your commands with `mpirun -n 1` to run TensorRT-LLM in a dedicated MPI environment, not the one provided by your Slurm allocation. For example: `mpirun -n 1 python3 examples/gpt/build.py ...` + +It is critical to always use `-n 1`, regardless of how many GPUs are used; using `-n 2` for a 2-GPU program will not work. `mpirun` here is not used to orchestrate multiple processes, but to invoke the right environment on SLURM. The internal MPI implementation handles spawning the additional processes. diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index b4b77ea43..9db92e68e 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -179,13 +179,13 @@ All published functionality in the Release Notes has been fully tested and verif - Moved the most commonly used options in the explicit arg-list, and hidden the expert options in the kwargs. - Exposed `model` to accept either HuggingFace model name or local HuggingFace model/TensorRT-LLM checkpoint/TensorRT-LLM engine. - Support downloading model from HuggingFace model hub, currently only Llama variants are supported. - - Support build cache to reuse the built TensorRT-LLM engines by setting environment variable `TLLM_HLAPI_BUILD_CACHE=1` or passing `enable_build_cache=True` to `LLM` class. + - Support build cache to reuse the built TensorRT-LLM engines by setting environment variable `TLLM_LLMAPI_BUILD_CACHE=1` or passing `enable_build_cache=True` to `LLM` class. - Exposed low-level options including `BuildConfig`, `SchedulerConfig` and so on in the kwargs, ideally you should be able to configure details about the build and runtime phase. - Refactored `LLM.generate()` and `LLM.generate_async()` API. - Removed `SamplingConfig`. - - Added `SamplingParams` with more extensive parameters, see `tensorrt_llm/hlapi/utils.py`. + - Added `SamplingParams` with more extensive parameters, see `tensorrt_llm/llmapi/utils.py`. - The new `SamplingParams` contains and manages fields from Python bindings of `SamplingConfig`, `OutputConfig`, and so on. - - Refactored `LLM.generate()` output as `RequestOutput`, see `tensorrt_llm/hlapi/llm.py`. + - Refactored `LLM.generate()` output as `RequestOutput`, see `tensorrt_llm/llmapi/llm.py`. - Updated the `apps` examples, specially by rewriting both `chat.py` and `fastapi_server.py` using the `LLM` APIs, please refer to the `examples/apps/README.md` for details. - Updated the `chat.py` to support multi-turn conversation, allowing users to chat with a model in the terminal. - Fixed the `fastapi_server.py` and eliminate the need for `mpirun` in multi-GPU scenarios. @@ -481,7 +481,7 @@ All published functionality in the Release Notes has been fully tested and verif Refer to the {ref}`support-matrix-software` section for a list of supported models.
* API - - Add a set of High-level APIs for end-to-end generation tasks (see examples/high-level-api/README.md) + - Add a set of LLM APIs for end-to-end generation tasks (see examples/llm-api/README.md) - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/new_workflow.md) - **[BREAKING CHANGES]** Deprecate `LayerNorm` and `RMSNorm` plugins and removed corresponding build parameters - **[BREAKING CHANGES]** Remove optional parameter `maxNumSequences` for GPT manager diff --git a/examples/apps/README.md b/examples/apps/README.md index 75d0e7663..1e025e7a6 100644 --- a/examples/apps/README.md +++ b/examples/apps/README.md @@ -1,4 +1,4 @@ -# Apps examples with GenerationExecutor / High-level API +# Apps examples with GenerationExecutor / LLM API ## OpenAI API [openai_server.py](./openai_server.py) is an OpenAI compatible server which supports `v1/version`, `v1/completions` and `v1/chat/completions`. [openai_client.py](./openai_client.py) is a simple example using OpenAI client to query your model. To start the server, you can run ``` diff --git a/examples/apps/chat.py b/examples/apps/chat.py index 25fe67afb..9de80e55b 100755 --- a/examples/apps/chat.py +++ b/examples/apps/chat.py @@ -5,7 +5,7 @@ import colorama from transformers import AutoTokenizer, PreTrainedTokenizer -from tensorrt_llm.hlapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams class LlmConsole(code.InteractiveConsole): diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py index c9e5d70fb..83562c1e1 100755 --- a/examples/apps/fastapi_server.py +++ b/examples/apps/fastapi_server.py @@ -9,7 +9,7 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse -from tensorrt_llm.hlapi import LLM, BuildConfig, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams TIMEOUT_KEEP_ALIVE = 5 # seconds. 
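The apps README above notes that `openai_client.py` queries the OpenAI-compatible server with the OpenAI client. As a minimal sketch of the same idea, assuming the server is reachable at `http://localhost:8000/v1` and was launched with the model named below (both the port and the model name are assumptions, not part of this change):

```python
# Minimal sketch: query the OpenAI-compatible server with the official `openai` client.
# The base_url port and the model name are placeholders for illustration only.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="TinyLlama-1.1B-Chat-v1.0",  # hypothetical; use the model the server was started with
    messages=[{"role": "user", "content": "Where is New York?"}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```

The same server also exposes `v1/completions` and `v1/version`, as listed in the README above.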
diff --git a/examples/apps/openai_server.py b/examples/apps/openai_server.py index 5c502aa13..ac392f72b 100644 --- a/examples/apps/openai_server.py +++ b/examples/apps/openai_server.py @@ -15,9 +15,9 @@ from transformers import AutoTokenizer, PreTrainedTokenizer # yapf: disable -from tensorrt_llm.hlapi import LLM, BuildConfig, KvCacheConfig -from tensorrt_llm.hlapi.llm import RequestOutput -from tensorrt_llm.hlapi.openai_protocol import ( +from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig +from tensorrt_llm.llmapi.llm import RequestOutput +from tensorrt_llm.llmapi.openai_protocol import ( ChatCompletionLogProbs, ChatCompletionLogProbsContent, ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index 02425e2a7..719f9073f 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index a3c6da3e0..7c83c66a3 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index 606480701..9da421d63 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/commandr/README.md b/examples/commandr/README.md new file mode 100644 index 000000000..cbe51a6bb --- /dev/null +++ b/examples/commandr/README.md @@ -0,0 +1,207 @@ +# Command R + +This document explains how to build the [C4AI Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01), [C4AI Command R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus), [Aya-23-8B](https://huggingface.co/CohereForAI/aya-23-8B), [Aya-23-35B](https://huggingface.co/CohereForAI/aya-23-35B) models using TensorRT-LLM and run on a single GPU or a single node with multiple GPUs. + +- [Command R](#Command-R) + - [Overview](#overview) + - [Support Matrix](#support-matrix) + - [Usage](#usage) + - [1. Download repo and weights from HuggingFace Transformers](#1-download-repo-and-weights-from-huggingface-transformers) + - [2. Convert weights from HF Transformers to TensorRT-LLM format](#2-convert-weights-from-hf-transformers-to-tensorrt-llm-format) + - [3. Build TensorRT engine(s)](#3-build-tensorrt-engines) + - [4. Run inference](#4-run-inference) + - [Single node, single GPU](#single-node-single-gpu) + - [Single node, multi GPU](#single-node-multi-gpu) + - [5. Run summarization task](#5-run-summarization-task) + - [Weight Only quantization](#weight-only-quantization) + + +## Overview + +The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../tensorrt_llm/models/commandr/model.py). +The TensorRT-LLM Command-R example code is located in [`examples/commandr`](./). 
There is one main file:
+
+* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.
+
+In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
+
+* [`../run.py`](../run.py) to run the inference on an input text;
+* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+
+## Support Matrix
+
+ * FP16
+ * INT8 & INT4 Weight-Only
+ * Tensor Parallel
+
+## Usage
+
+The next sections describe how to build the engine and run the inference demo.
+
+### 1. Download repo and weights from HuggingFace Transformers
+
+```bash
+pip install -r requirements.txt
+apt-get update
+apt-get install git-lfs
+
+# clone one or more models we want to build
+git clone https://huggingface.co/CohereForAI/c4ai-command-r-v01 command_r_v01
+git clone https://huggingface.co/CohereForAI/c4ai-command-r-plus command_r_plus
+git clone https://huggingface.co/CohereForAI/aya-23-8B aya_23_8B
+git clone https://huggingface.co/CohereForAI/aya-23-35B aya_23_35B
+```
+
+### 2. Convert weights from HF Transformers to TensorRT-LLM format
+
+The [`convert_checkpoint.py`](./convert_checkpoint.py) script converts HF weights to TensorRT-LLM checkpoints. The number of checkpoint files (in .safetensors format) is the same as the number of GPUs used to run inference.
+
+```bash
+# Command-R: single gpu, dtype float16
+python3 convert_checkpoint.py --model_dir command_r_v01 --output_dir trt_ckpt/command_r_v01/fp16/1-gpu
+
+# Command-R+: 4-way tensor parallelism
+python3 convert_checkpoint.py --model_dir command_r_plus --tp_size 4 --output_dir trt_ckpt/command_r_plus/fp16/4-gpu
+
+# Aya-23-8B: single gpu, dtype float16
+python3 convert_checkpoint.py --model_dir aya_23_8B --output_dir trt_ckpt/aya_23_8B/fp16/1-gpu
+
+# Aya-23-35B: single gpu, dtype float16
+python3 convert_checkpoint.py --model_dir aya_23_35B --output_dir trt_ckpt/aya_23_35B/fp16/1-gpu
+```
+
+### 3. Build TensorRT engine(s)
+
+The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpoints. The number of engine files is also the same as the number of GPUs used to run inference.
+
+Normally, the `trtllm-build` command only requires a single GPU, but you can enable parallel building by passing the number of GPUs to the `--workers` argument.
+
+```bash
+# Command-R: single-gpu engine with dtype float16, GPT Attention plugin, Gemm plugin
+trtllm-build --checkpoint_dir trt_ckpt/command_r_v01/fp16/1-gpu \
+    --gemm_plugin float16 \
+    --output_dir trt_engines/command_r_v01/fp16/1-gpu
+
+# Command-R+: 4-way tensor parallelism
+trtllm-build --checkpoint_dir trt_ckpt/command_r_plus/fp16/4-gpu \
+    --gemm_plugin float16 \
+    --output_dir trt_engines/command_r_plus/fp16/4-gpu
+
+# Aya-23-8B: single-gpu engine with dtype float16, GPT Attention plugin, Gemm plugin
+trtllm-build --checkpoint_dir trt_ckpt/aya_23_8B/fp16/1-gpu \
+    --gemm_plugin float16 \
+    --output_dir trt_engines/aya_23_8B/fp16/1-gpu
+
+# Aya-23-35B: single-gpu engine with dtype float16, GPT Attention plugin, Gemm plugin
+trtllm-build --checkpoint_dir trt_ckpt/aya_23_35B/fp16/1-gpu \
+    --gemm_plugin float16 \
+    --output_dir trt_engines/aya_23_35B/fp16/1-gpu
+```
+
+If the engines are built successfully, you will see output like (Command-R as the example):
+
+```txt
+......
+[09/19/2024-03:34:30] [TRT] [I] Engine generation completed in 26.9495 seconds. +[09/19/2024-03:34:30] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 4 MiB, GPU 70725 MiB +[09/19/2024-03:34:55] [TRT] [I] [MemUsageStats] Peak memory usage during Engine building and serialization: CPU: 176260 MiB +[09/19/2024-03:34:55] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:52 +[09/19/2024-03:34:55] [TRT] [I] Serialized 26 bytes of code generator cache. +[09/19/2024-03:34:55] [TRT] [I] Serialized 315007 bytes of compilation cache. +[09/19/2024-03:34:55] [TRT] [I] Serialized 12 timing cache entries +[09/19/2024-03:34:55] [TRT-LLM] [I] Timing cache serialized to model.cache +[09/19/2024-03:34:55] [TRT-LLM] [I] Build phase peak memory: 176257.29 MB, children: 17.65 MB +[09/19/2024-03:34:55] [TRT-LLM] [I] Serializing engine to trt_engines/command_r_v01/fp16/1-gpu/rank0.engine... +[09/19/2024-03:35:20] [TRT-LLM] [I] Engine serialized. Total time: 00:00:25 +[09/19/2024-03:35:23] [TRT-LLM] [I] Total time of building all engines: 00:01:47 +``` + +### 4. Run inference + +#### Single node, single GPU + +```bash +# Run the default engine of Command-R on single GPU. +python3 ../run.py --max_output_len 50 \ + --tokenizer_dir command_r_v01 \ + --engine_dir trt_engines/command_r_v01/fp16/1-gpu + +# Run the default engine of Command-R on single GPU, using streaming output. +python3 ../run.py --max_output_len 50 \ + --tokenizer_dir command_r_v01 \ + --engine_dir trt_engines/command_r_v01/fp16/1-gpu \ + --streaming + +# Run the default engine of Aya-23-8B on single GPU. +python3 ../run.py --max_output_len 50 \ + --tokenizer_dir aya_23_8B \ + --engine_dir trt_engines/aya_23_8B/fp16/1-gpu + +# Run the default engine of Aya-23-35B on single GPU. +python3 ../run.py --max_output_len 50 \ + --tokenizer_dir aya_23_35B \ + --engine_dir trt_engines/aya_23_35B/fp16/1-gpu +``` + +#### Single node, multi GPU + +```bash +# Run the Tensor Parallel 4 engine of Command-R+ on 4 GPUs. +mpirun -n 4 \ + python ../run.py --max_output_len 50 \ + --tokenizer_dir command_r_plus \ + --engine_dir trt_engines/command_r_plus/fp16/4-gpu +``` + +If the engines are run successfully, you will see output like (Command-R as the example): + +```txt +...... +Input [Text 0]: "Born in north-east France, Soyer trained as a" +Output [Text 0 Beam 0]: " chef in Paris and worked in the kitchens of the French royal family. He came to England in 1814 and worked in a number of London hotels and restaurants, including the Reform Club and the London Tavern. He also opened his own restaurant" +``` + +### 5. Run summarization task + +```bash +# Run the summarization of Command-R task. +python3 ../summarize.py --test_trt_llm \ + --hf_model_dir command_r_v01 \ + --engine_dir trt_engines/command_r_v01/fp16/1-gpu +``` + +If the engines are run successfully, you will see output like (Command-R as the example): + +```txt +...... 
+[01/26/2024-02:51:56] [TRT-LLM] [I] TensorRT-LLM (total latency: 81.05689692497253 sec) +[01/26/2024-02:51:56] [TRT-LLM] [I] TensorRT-LLM (total output tokens: 2000) +[01/26/2024-02:51:56] [TRT-LLM] [I] TensorRT-LLM (tokens per second: 24.67402621952367) +[01/26/2024-02:51:56] [TRT-LLM] [I] TensorRT-LLM beam 0 result +[01/26/2024-02:51:56] [TRT-LLM] [I] rouge1 : 24.06804397902119 +[01/26/2024-02:51:56] [TRT-LLM] [I] rouge2 : 6.456513335555016 +[01/26/2024-02:51:56] [TRT-LLM] [I] rougeL : 16.77644999660741 +[01/26/2024-02:51:56] [TRT-LLM] [I] rougeLsum : 20.57359472317834 +``` + +### Weight Only quantization + +Use `--use_weight_only` to enable INT8-Weight-Only quantization, this will significantly lower the latency and memory footprint. Furthermore, use `--weight_only_precision int8` or `--weight_only_precision int4` to configure the data type of the weights. + +```bash +# Command-R: single gpu, int8 weight only quantization +python3 convert_checkpoint.py --model_dir command_r_v01 \ + --use_weight_only \ + --weight_only_precision int8 \ + --output_dir trt_ckpt/command_r_v01/int8_wo/1-gpu + +# Command-R: single-gpu engine with int8 weight only quantization, GPT Attention plugin, Gemm plugin +trtllm-build --checkpoint_dir trt_ckpt/command_r_v01/int8_wo/1-gpu \ + --gemm_plugin float16 \ + --output_dir trt_engines/command_r_v01/int8_wo/1-gpu + +# Run inference. +python3 ../run.py --max_output_len 50 \ + --tokenizer_dir command_r_v01 \ + --engine_dir trt_engines/command_r_v01/int8_wo/1-gpu +``` diff --git a/examples/commandr/convert_checkpoint.py b/examples/commandr/convert_checkpoint.py new file mode 100644 index 000000000..8debb1f41 --- /dev/null +++ b/examples/commandr/convert_checkpoint.py @@ -0,0 +1,188 @@ +import argparse +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed + +import tensorrt_llm +from tensorrt_llm._utils import release_gc +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import CohereForCausalLM +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization import QuantAlgo + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' 
+ ) + parser.add_argument("--load_model_on_cpu", action="store_true") + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=0, + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--use_embedding_sharing', + action="store_true", + default=False, + help= + 'Try to reduce the engine size by sharing the embedding lookup table between two layers.' + 'Note: the flag might not take effect when the criteria are not met.') + parser.add_argument('--output_dir', + type=str, + default='tllm_checkpoint', + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') + parser.add_argument('--log_level', type=str, default='info') + + args = parser.parse_args() + + return args + + +def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: + '''return config dict with quantization info based on the command line args + ''' + quant_config = QuantConfig() + if args.use_weight_only: + if args.weight_only_precision == 'int8': + quant_config.quant_algo = QuantAlgo.W8A16 + elif args.weight_only_precision == 'int4': + quant_config.quant_algo = QuantAlgo.W4A16 + + return quant_config + + +def args_to_build_options(args): + return { + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'disable_weight_only_quant_plugin': + args.disable_weight_only_quant_plugin, + 'load_model_on_cpu': args.load_model_on_cpu, + } + + +def convert_and_save_hf(args): + model_dir = args.model_dir + world_size = args.tp_size * args.pp_size + # Need to convert the cli args to the kay-value pairs and override them in the generate config dict. + # Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now, + # before the refactor is done. + override_fields = {} + override_fields.update(args_to_build_options(args)) + + quant_config = args_to_quant_config(args) + + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + cohere = CohereForCausalLM.from_hugging_face( + model_dir, + args.dtype, + mapping=mapping, + quant_config=quant_config, + **override_fields, + ) + cohere.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del cohere + + execute(args.workers, [convert_and_save_rank] * world_size, args) + release_gc() + + +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
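+
+# NOTE: the example invocations below mirror the commands in examples/commandr/README.md;
+# the directory names are illustrative only.
+#   python3 convert_checkpoint.py --model_dir command_r_v01 \
+#       --output_dir trt_ckpt/command_r_v01/fp16/1-gpu
+#   python3 convert_checkpoint.py --model_dir command_r_plus --tp_size 4 \
+#       --output_dir trt_ckpt/command_r_plus/fp16/4-gpu
+# The resulting checkpoint directory is then consumed by `trtllm-build` via --checkpoint_dir.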
+ + +def main(): + print(tensorrt_llm.__version__) + args = parse_arguments() + logger.set_level(args.log_level) + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + assert args.model_dir is not None + convert_and_save_hf(args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') + + +if __name__ == '__main__': + main() diff --git a/examples/commandr/requirements.txt b/examples/commandr/requirements.txt new file mode 100644 index 000000000..71675afd9 --- /dev/null +++ b/examples/commandr/requirements.txt @@ -0,0 +1,5 @@ +--extra-index-url https://pypi.nvidia.com +tensorrt_llm==0.15.0.dev2024101500 +datasets==2.14.6 +evaluate~=0.4.1 +rouge_score~=0.1.2 diff --git a/examples/dbrx/convert_checkpoint.py b/examples/dbrx/convert_checkpoint.py index 41e19b309..2009bbac7 100644 --- a/examples/dbrx/convert_checkpoint.py +++ b/examples/dbrx/convert_checkpoint.py @@ -545,6 +545,7 @@ def execute(workers, func, hf_model): kv_cache_quant_algo = QuantAlgo.INT8 hf_config = None + if args.model_dir is not None: hf_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) @@ -563,8 +564,10 @@ def execute(workers, func, hf_model): args.clip_qkv = hf_config.attn_config.clip_qkv args.hidden_act = 'swiglu' args.rotary_base = hf_config.attn_config.rope_theta - args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, - args.moe_renorm_mode).validate() + args.moe_config = MoeConfig( + num_experts=args.moe_num_experts, + top_k=args.moe_top_k, + normalization_mode=args.moe_renorm_mode).validate() config = { 'architecture': hf_config.architectures[0], 'dtype': args.dtype, diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index d881f96af..bb07c7c34 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/deepseek_v1/README.md b/examples/deepseek_v1/README.md old mode 100644 new mode 100755 index 6bca80df1..a60afb83d --- a/examples/deepseek_v1/README.md +++ b/examples/deepseek_v1/README.md @@ -39,7 +39,7 @@ In addition, there are three shared files in the parent folder [`examples`](../) - [x] FP16 - [x] TENSOR PARALLEL -- [ ] FP8 +- [x] FP8 ## Usage @@ -64,6 +64,25 @@ trtllm-build --checkpoint_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \ --moe_plugin bfloat16 \ ``` +### FP8 Quantization + +The [`../quantization/quantize.py`](../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints. 
+ +```bash +# Deepseek-v1: single gpu, fp8 quantization +python ../quantization/quantize.py --model_dir deepseek_moe_16b \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir trt_ckpt/deepseek_moe_16b/fp8/1-gpu \ + --calib_size 512 + +# Deepseek-v1: single-gpu engine with fp8 quantization, GPT Attention plugin, Gemm plugin +trtllm-build --checkpoint_dir ./trt_ckpt/deepseek_moe_16b/fp8/1-gpu \ + --gemm_plugin float16 \ + --output_dir ./trt_engines/fp8/1-gpu/ +``` + Then, test the engine with [run.py](../run.py) script: ```bash diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt index 5ac8a0ffb..aebdd4207 100644 --- a/examples/draft_target_model/requirements.txt +++ b/examples/draft_target_model/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/eagle/README.md b/examples/eagle/README.md new file mode 100644 index 000000000..f7a5bae8c --- /dev/null +++ b/examples/eagle/README.md @@ -0,0 +1,59 @@ +# **MODEL IS NOT FULLY SUPPORTED YET! DO NOT USE IT.** + +# EAGLE speculative Decoding + +This document shows how to build and run a model using EAGLE decoding ([`Github`](https://github.com/SafeAILab/EAGLE/tree/main), [`BLOG`](https://sites.google.com/view/eagle-llm)) in TensorRT-LLM on a single node with one GPU or more. + +## Overview +Different from other models, EAGLE decoding needs a base model and EAGLE model. + +The TensorRT-LLM EAGLE Decoding implementation can be found in [tensorrt_llm/models/eagle/model.py](../../tensorrt_llm/models/eagle/model.py), which actually adds Eagle draft network to a base model. + + + +## Support Matrix + * GPU Compute Capability >= 8.0 (Ampere or newer) + * FP16 + * BF16 + * PAGED_KV_CACHE + * Tensor Parallel + +This example focuses on adding EAGLE to LLaMA base model. With some modifications EAGLE can be added to the other base models as well. + +## Usage +The TensorRT-LLM EAGLE example code is located in [`examples/eagle`](./). There is one [`convert_checkpoint.py`](./convert_checkpoint.py) file to convert and build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run models with EAGLE decoding support. +In our example, we use the model from HuggingFace [`yuhuili/EAGLE-Vicuna-7B-v1.3`](https://huggingface.co/yuhuili/EAGLE-Vicuna-7B-v1.3), which is a LLAMA based model. + +### Build TensorRT engine(s) +Get the weights by downloading the base model [`vicuna-7b-v1.3`](https://huggingface.co/lmsys/vicuna-7b-v1.3) and the EAGLE draft model [`EAGLE-Vicuna-7B-v1.3`](https://huggingface.co/yuhuili/EAGLE-Vicuna-7B-v1.3) from HF. 
+ +``` +pip install -r requirements.txt + +git lfs install +git clone https://huggingface.co/lmsys/vicuna-7b-v1.3 +https://huggingface.co/yuhuili/EAGLE-Vicuna-7B-v1.3 +``` + + +Here is the example: +```bash +python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ + --eagle_model_dir EAGLE-Vicuna-7B-v1.3 \ + --output_dir ./tllm_checkpoint_1gpu_eagle \ + --dtype float16 \ + --max_draft_len 63 \ + --num_eagle_layers 4 + +trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_eagle \ + --output_dir ./tmp/eagle/7B/trt_engines/fp16/1-gpu/ \ + --gemm_plugin float16 \ + --speculative_decoding_mode eagle \ + --max_batch_size 4 +``` + +### Run + +### Summarization using EAGLE decoding diff --git a/examples/eagle/convert_checkpoint.py b/examples/eagle/convert_checkpoint.py new file mode 100644 index 000000000..d1283c7a6 --- /dev/null +++ b/examples/eagle/convert_checkpoint.py @@ -0,0 +1,470 @@ +import argparse +import json +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import safetensors +import torch +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer + +import tensorrt_llm +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import PretrainedConfig +from tensorrt_llm.models.convert_utils import load_calib_dataset +from tensorrt_llm.models.eagle.weight import (capture_activation_range, + convert_hf_llama, load_eagle_hf) +from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard +from tensorrt_llm.quantization import QuantAlgo + +try: + from transformers import MixtralForCausalLM +except ImportError: + MixtralForCausalLM = None + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_layer', type=int, default=32) + + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--calib_dataset', + type=str, + default='ccdv/cnn_dailymail', + help= + "The huggingface dataset name or the local directory of the dataset for calibration." + ) + parser.add_argument( + "--smoothquant", + "-sq", + type=float, + default=None, + help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)" + " to Smoothquant the model, and output int8 weights." + " A good first try is 0.5. Must be in [0, 1]") + parser.add_argument( + '--per_channel', + action="store_true", + default=False, + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. 
' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + action="store_true", + default=False, + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) + + parser.add_argument( + '--per_group', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale weights in the int4 range. ' + 'per_group chooses at run time, and for each group, a custom scaling factor. ' + 'The flag is built for GPTQ/AWQ quantization.') + + parser.add_argument('--load_by_shard', + action='store_true', + help='Load a pretrained model shard-by-shard.') + parser.add_argument('--hidden_act', type=str, default='silu') + + parser.add_argument('--rotary_base', type=float, default=10000.0) + parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) + + parser.add_argument('--group_size', + type=int, + default=128, + help='Group size used in GPTQ/AWQ quantization.') + + parser.add_argument("--storage-type", + "-t", + type=str, + default="fp32", + choices=["fp32", "fp16"]) + parser.add_argument("--dataset-cache-dir", + type=str, + default=None, + help="cache dir to load the hugging face dataset") + parser.add_argument("--load-model-on-cpu", action="store_true") + parser.add_argument("--convert-model-on-cpu", action="store_true") + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=0, + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--use_embedding_sharing', + action="store_true", + default=False, + help= + 'Try to reduce the engine size by sharing the embedding lookup table between two layers.' + 'Note: the flag might not take effect when the criteria are not met.') + parser.add_argument('--output_dir', + type=str, + default='tllm_checkpoint', + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') + + parser.add_argument('--eagle_model_dir', type=str, default=None) + parser.add_argument('--max_draft_len', type=int, default=63) + parser.add_argument( + '--num_eagle_layers', + type=int, + default=4, + help= + 'Maximum depth of the EAGLE choices tree, i.e. maximum number of accepted tokens.' + ) + args = parser.parse_args() + return args + + +if __name__ == '__main__': + # TODO(qijun): Currently, the convert script depends on a torch op: + # torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix, + # which is included in tensorrt_llm Python package. Otherwise, the convert + # script does not need to import tensorrt_llm. 
Will remove it after reimplementing + # the op with PyTorch. + print(tensorrt_llm.__version__) + args = parse_arguments() + world_size = args.tp_size * args.pp_size + + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + hf_config = None + if args.model_dir is not None: + hf_config = LlamaConfig.from_pretrained(args.model_dir) + + args.model_type = hf_config.model_type + args.n_head = hf_config.num_attention_heads + args.inter_size = hf_config.intermediate_size + args.n_layer = hf_config.num_hidden_layers + args.n_embd = hf_config.hidden_size + args.n_kv_head = hf_config.num_key_value_heads + args.rms_norm_eps = hf_config.rms_norm_eps + args.vocab_size = hf_config.vocab_size + args.n_positions = hf_config.max_position_embeddings + + hf_config_eagle = LlamaConfig.from_pretrained(args.eagle_model_dir) + args.n_head_eagle = hf_config_eagle.num_attention_heads + args.inter_size_eagle = hf_config_eagle.intermediate_size + args.n_layer_eagle = hf_config_eagle.num_hidden_layers + args.n_embd_eagle = hf_config_eagle.hidden_size + args.n_kv_head_eagle = hf_config_eagle.num_key_value_heads + args.rms_norm_eps_eagle = hf_config_eagle.rms_norm_eps + args.n_positions_eagle = hf_config_eagle.max_position_embeddings + + elif args.meta_ckpt_dir is not None: + assert False, "meta ckpt is not supported yet" + + with open(Path(args.meta_ckpt_dir, "params.json")) as fp: + meta_config: dict = json.load(fp) + args.n_embd = meta_config["dim"] + args.n_head = meta_config["n_heads"] + args.n_layer = meta_config["n_layers"] + args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) + + if "hidden_dim" in meta_config: + args.inter_size = meta_config["hidden_dim"] + else: + args.multiple_of = meta_config.get("multiple_of", 1) + n_embd = int(4 * args.n_embd * 2 / 3) + args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) + // args.multiple_of) + args.rms_norm_eps = meta_config["norm_eps"] + + if args.rotary_scaling is not None: + # assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." 
+ rotary_scaling = { + "type": args.rotary_scaling[0], + "factor": float(args.rotary_scaling[1]) + } + assert rotary_scaling["type"] in ["linear", "dynamic"] + assert rotary_scaling["factor"] > 1.0 + args.rotary_scaling = rotary_scaling + + eagle_net_config = { + 'architecture': "LlamaForCausalLM", + 'dtype': args.dtype, + 'logits_dtype': 'float32', + 'num_hidden_layers': args.n_layer_eagle, + 'num_attention_heads': args.n_head_eagle, + 'hidden_size': args.n_embd_eagle, + 'intermediate_size': args.inter_size_eagle, + 'num_key_value_heads': args.n_kv_head_eagle, + 'vocab_size': args.vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': args.n_positions_eagle, + 'hidden_act': args.hidden_act, + 'rotary_base': args.rotary_base, + 'rotary_scaling': args.rotary_scaling, + 'norm_epsilon': args.rms_norm_eps_eagle, + 'quantization': { + 'quant_algo': None, + 'kv_cache_quant_algo': None, + }, + 'mapping': { + 'world_size': world_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + }, + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + } + + config = { + 'architecture': 'EagleForCausalLM', + 'dtype': args.dtype, + 'logits_dtype': 'float32', + 'num_hidden_layers': args.n_layer, + 'num_attention_heads': args.n_head, + 'hidden_size': args.n_embd, + 'intermediate_size': args.inter_size, + 'num_key_value_heads': args.n_kv_head, + 'vocab_size': args.vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': args.n_positions, + 'hidden_act': args.hidden_act, + 'rotary_base': args.rotary_base, + 'rotary_scaling': args.rotary_scaling, + 'norm_epsilon': args.rms_norm_eps, + 'quantization': { + 'quant_algo': None, + 'kv_cache_quant_algo': None, + }, + 'mapping': { + 'world_size': world_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + }, + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'max_draft_len': args.max_draft_len, + 'num_eagle_layers': args.num_eagle_layers, + 'eagle_net_config': eagle_net_config + } + + if args.use_weight_only: + if args.weight_only_precision == 'int8': + config['quantization']['quant_algo'] = QuantAlgo.W8A16 + elif args.weight_only_precision == 'int4': + config['quantization']['quant_algo'] = QuantAlgo.W4A16 + elif args.smoothquant: + if args.per_channel: + if args.per_token: + config['quantization'][ + 'quant_algo'] = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN + else: + config['quantization'][ + 'quant_algo'] = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN + else: + if args.per_token: + config['quantization'][ + 'quant_algo'] = QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN + else: + config['quantization'][ + 'quant_algo'] = QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN + + if args.int8_kv_cache: + config['quantization']['kv_cache_quant_algo'] = QuantAlgo.INT8 + + if args.weight_only_precision == 'int4_gptq': + config['quantization'].update({ + "group_size": args.group_size, + "has_zero_point": True, + "pre_quant_scale": False, + 'quant_algo': QuantAlgo.W4A16_GPTQ + }) + + with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + + if args.weight_only_precision == 'int8': + plugin_weight_only_quant_type = torch.int8 + elif args.weight_only_precision == 'int4': + plugin_weight_only_quant_type = torch.quint4x2 + + act_range = {} 
+ llama_qkv_para = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + llama_smoother = {} + base_model = None + eagle_model = None + + def get_hf_model(model_dir): + hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM + + model = hf_model.from_pretrained( + model_dir, + torch_dtype='auto', + device_map='auto' if not args.load_model_on_cpu else 'cpu', + trust_remote_code=True) + + if args.smoothquant is not None or args.int8_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + if args.load_model_on_cpu: + logger.warning( + "Note that running capture_activation_range on cpu would be very slow." + ) + tokenizer = LlamaTokenizer.from_pretrained(args.model_dir, + padding_side='left') + dataset = load_calib_dataset(args.calib_dataset, + cache_dir=args.dataset_cache_dir) + + act_range = capture_activation_range(model, tokenizer, dataset) + if args.smoothquant is not None: + smooth_llama_model(model, act_range, args.smoothquant, + llama_qkv_para, llama_smoother) + return model + + if args.model_dir is not None: + base_model = get_hf_model(args.model_dir) + if args.eagle_model_dir is not None: + eagle_model = get_hf_model(args.eagle_model_dir) + + convert_args = { + 'hf_base_model': base_model, + 'hf_eagle_model': eagle_model, + 'act_range': act_range, + 'llama_qkv_para': llama_qkv_para, + 'llama_smoother': llama_smoother + } + + def covert_and_save(rank, convert_args): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + + if args.use_weight_only and args.weight_only_precision == 'int4_gptq': + assert False, "Never supported" + else: + if args.load_by_shard: + weights = load_weights_from_hf_by_shard( + args.model_dir, PretrainedConfig.from_dict(config)) + + else: + weights = convert_hf_llama( + convert_args['hf_base_model'], + mapping, + rank, + dtype=args.dtype, + use_weight_only=args.use_weight_only, + plugin_weight_only_quant_type=plugin_weight_only_quant_type, + use_parallel_embedding=args.use_parallel_embedding, + sharding_dim=args.embedding_sharding_dim, + share_embedding_table=args.use_embedding_sharing, + use_smooth_quant=args.smoothquant, + per_channel=args.per_channel, + per_token=args.per_token, + int8_kv_cache=args.int8_kv_cache, + act_range=convert_args['act_range'], + qkv_para=convert_args['llama_qkv_para'], + smoother=convert_args['llama_smoother']) + + eagle_weights = load_eagle_hf( + eagle_model_dir=args.eagle_model_dir, + eagle_model=convert_args['hf_eagle_model'], + base_model=convert_args['hf_base_model'], + num_eagle_layers=args.num_eagle_layers, + mapping=mapping, + rank=rank, + dtype=args.dtype) + weights.update(eagle_weights) + + safetensors.torch.save_file( + weights, os.path.join(args.output_dir, f'rank{rank}.safetensors')) + + if args.workers == 1: + for rank in range(world_size): + covert_and_save(rank, convert_args) + else: + with ThreadPoolExecutor(max_workers=args.workers) as p: + futures = [ + p.submit(covert_and_save, rank, convert_args) + for rank in range(world_size) + ] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
+ + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') diff --git a/examples/eagle/requirements.txt b/examples/eagle/requirements.txt new file mode 100644 index 000000000..a86a3148d --- /dev/null +++ b/examples/eagle/requirements.txt @@ -0,0 +1,6 @@ +--extra-index-url https://pypi.nvidia.com +tensorrt_llm==0.15.0.dev2024101500 +datasets~=2.14.5 +rouge_score~=0.1.2 +SentencePiece~=0.1.99 +evaluate~=0.4.1 diff --git a/examples/exaone/README.md b/examples/exaone/README.md index 030dfc49d..5b98f0884 100644 --- a/examples/exaone/README.md +++ b/examples/exaone/README.md @@ -8,14 +8,22 @@ See the LLaMA example [`examples/llama`](../llama) for details. - [EXAONE](#exaone) - [Support Matrix](#support-matrix) - [Download model checkpoints](#download-model-checkpoints) - - [TensorRT-LLM workflow](#tensorrt-llm-workflow) + - [Usage](#usage) - [Convert checkpoint and build TensorRT engine(s)](#convert-checkpoint-and-build-tensorrt-engines) + - [FP8 Post-Training Quantization](#fp8-post-training-quantization) + - [SmoothQuant](#smoothquant) + - [Groupwise quantization (AWQ)](#groupwise-quantization-awq) + - [W4A16 AWQ with FP8 GEMM (W4A8 AWQ)](#w4a16-awq-with-fp8-gemm-w4a8-awq) - [Run Engine](#run-engine) ## Support Matrix * FP16 * BF16 + * Tensor Parallel + * FP8 * INT8 & INT4 Weight-Only + * INT8 SmoothQuant + * INT4 AWQ & W4A8 AWQ ## Download model checkpoints @@ -25,12 +33,11 @@ First, download the HuggingFace FP32 checkpoints of EXAONE model. git clone https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct hf_models/exaone ``` -## TensorRT-LLM workflow -Next, we build the model with `trtllm-build`. +## Usage +The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`. ### Convert checkpoint and build TensorRT engine(s) -As written above, we will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model. ```bash # Build a single-GPU float16 engine from HF weights. @@ -48,17 +55,122 @@ trtllm-build \ # Build the EXAONE model using a single GPU and and apply INT8 weight-only quantization. python ../llama/convert_checkpoint.py \ --model_dir hf_models/exaone \ - --output_dir trt_models/exaone/fp16_wq_8/1-gpu \ + --output_dir trt_models/exaone/int8_wq/1-gpu \ --use_weight_only \ --weight_only_precision int8 \ --dtype float16 trtllm-build \ - --checkpoint_dir trt_models/exaone/fp16_wq_8/1-gpu \ - --output_dir trt_engines/exaone/fp16_wq_8/1-gpu \ + --checkpoint_dir trt_models/exaone/int8_wq/1-gpu \ + --output_dir trt_engines/exaone/int8_wq/1-gpu \ + --gemm_plugin auto + +# Build the EXAONE model using a single GPU and and apply INT4 weight-only quantization. +python ../llama/convert_checkpoint.py \ + --model_dir hf_models/exaone \ + --output_dir trt_models/exaone/int4_wq/1-gpu \ + --use_weight_only \ + --weight_only_precision int4 \ + --dtype float16 + +trtllm-build \ + --checkpoint_dir trt_models/exaone/int4_wq/1-gpu \ + --output_dir trt_engines/exaone/int4_wq/1-gpu \ + --gemm_plugin auto + +# Build the EXAONE model using using 2-way tensor parallelism and FP16. 
+python ../llama/convert_checkpoint.py \ + --model_dir hf_models/exaone \ + --output_dir trt_models/exaone/fp16/2-gpu \ + --tp_size 2 \ + --dtype float16 + +trtllm-build \ + --checkpoint_dir trt_models/exaone/fp16/2-gpu \ + --output_dir trt_engines/exaone/fp16/2-gpu \ + --gemm_plugin auto +``` +> **NOTE**: EXAONE model is not supported with `--load_by_shard`. + +### FP8 Post-Training Quantization + +The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. + +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) + +```bash +# Build the EXAONE model using a single GPU and and apply FP8 quantization. +python ../quantization/quantize.py \ + --model_dir hf_models/exaone \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir trt_models/exaone/fp8/1-gpu \ + +trtllm-build \ + --checkpoint_dir trt_models/exaone/fp8/1-gpu \ + --output_dir trt_engines/exaone/fp8/1-gpu \ + --gemm_plugin auto +``` + +### SmoothQuant + +The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. + +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) + +```bash +# Build the EXAONE model using a single GPU and and apply INT8 SmoothQuant. +python ../quantization/quantize.py \ + --model_dir hf_models/exaone \ + --dtype float16 \ + --qformat int8_sq \ + --output_dir trt_models/exaone/int8_sq/1-gpu + +trtllm-build \ + --checkpoint_dir trt_models/exaone/int8_sq/1-gpu \ + --output_dir trt_engines/exaone/int8_sq/1-gpu \ + --gemm_plugin auto +``` + +### Groupwise quantization (AWQ) + +The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process. + +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) + +```bash +# Build the EXAONE model using a single GPU and and apply INT4 AWQ. +python ../quantization/quantize.py \ + --model_dir hf_models/exaone \ + --dtype float16 \ + --qformat int4_awq \ + --output_dir trt_models/exaone/int4_awq/1-gpu + +trtllm-build \ + --checkpoint_dir trt_models/exaone/int4_awq/1-gpu \ + --output_dir trt_engines/exaone/int4_awq/1-gpu \ + --gemm_plugin auto +``` + +#### W4A16 AWQ with FP8 GEMM (W4A8 AWQ) +For Hopper GPUs, TRT-LLM also supports employing FP8 GEMM for accelerating linear layers. This mode is noted with `w4a8_awq` for Modelopt and TRT-LLM, in which both weights and activations are converted from W4A16 to FP8 for GEMM calculation. + +Please make sure your system contains a Hopper GPU before trying the commands below. + +```bash +# Build the EXAONE model using a single GPU and and apply W4A8 AWQ. +python ../quantization/quantize.py \ + --model_dir hf_models/exaone \ + --dtype float16 \ + --qformat w4a8_awq \ + --output_dir trt_models/exaone/w4a8_awq/1-gpu + +trtllm-build \ + --checkpoint_dir trt_models/exaone/w4a8_awq/1-gpu \ + --output_dir trt_engines/exaone/w4a8_awq/1-gpu \ --gemm_plugin auto ``` -> **NOTE**: EXAONE model is currently not supported with `--load_by_shard`. ### Run Engine @@ -71,6 +183,14 @@ python3 ../run.py \ --tokenizer_dir hf_models/exaone \ --engine_dir trt_engines/exaone/fp16/1-gpu +# Run with 2 GPUs +mpirun -n 2 --allow-run-as-root \ + python3 ../run.py \ + --input_text "When did the first world war end?" 
\ + --max_output_len=100 \ + --tokenizer_dir hf_models/exaone \ + --engine_dir trt_engines/exaone/fp16/2-gpu + python ../summarize.py \ --test_trt_llm \ --data_type fp16 \ diff --git a/examples/falcon/README.md b/examples/falcon/README.md index f4736f5f9..63d65e427 100644 --- a/examples/falcon/README.md +++ b/examples/falcon/README.md @@ -67,6 +67,9 @@ git clone https://huggingface.co/tiiuae/falcon-40b-instruct falcon/40b-instruct # falcon-180b git clone https://huggingface.co/tiiuae/falcon-180B falcon/180b + +# falcon-11b (Falcon 2) +git clone https://huggingface.co/tiiuae/falcon-11B falcon/11b ``` ### 2. Convert weights from HF Transformers to TensorRT-LLM format @@ -112,6 +115,11 @@ python3 convert_checkpoint.py --model_dir ./falcon/180b \ --pp_size 2 \ --load_by_shard \ --workers 8 + +# falcon-11b (Falcon 2): single gpu, dtype bfloat16 +python3 convert_checkpoint.py --model_dir ./falcon/11b \ + --dtype bfloat16 \ + --output_dir ./falcon/11b/trt_ckpt/bf16/1-gpu/ ``` Note that in order to use N-way tensor parallelism, the number of attention heads must be a multiple of N. @@ -162,6 +170,12 @@ trtllm-build --checkpoint_dir ./falcon/180b/trt_ckpt/bf16/tp4-pp2/ \ --gpt_attention_plugin bfloat16 \ --output_dir ./falcon/180b/trt_engines/bf16/tp4-pp2/ \ --workers 8 + +# falcon-11b (Falcon 2) +trtllm-build --checkpoint_dir ./falcon/11b/trt_ckpt/bf16/1-gpu/ \ + --gemm_plugin bfloat16 \ + --gpt_attention_plugin bfloat16 \ + --output_dir ./falcon/11b/trt_engines/bf16/1-gpu/ ``` If the engines are built successfully, you will see output like (falcon-rw-1b as the example): @@ -215,6 +229,11 @@ mpirun -n 8 --allow-run-as-root --oversubscribe \ python ../summarize.py --test_trt_llm \ --hf_model_dir ./falcon/180b \ --engine_dir ./falcon/180b/trt_engines/bf16/tp4-pp2/ + +# falcon-11b (Falcon 2) +python ../summarize.py --test_trt_llm \ + --hf_model_dir ./falcon/11b \ + --engine_dir ./falcon/11b/trt_engines/bf16/1-gpu/ ``` If the engines are run successfully, you will see output like (falcon-rw-1b as the example): diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index cd85a4ef2..d3f245425 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index dc52c6f39..d1fb82adc 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,7 +3,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". 
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index b1baef82c..b924ab56f 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/convert_checkpoint.py b/examples/gptj/convert_checkpoint.py index 6b1ec5b4d..a8b697a0e 100644 --- a/examples/gptj/convert_checkpoint.py +++ b/examples/gptj/convert_checkpoint.py @@ -7,7 +7,7 @@ from transformers import AutoModelForCausalLM import tensorrt_llm -from tensorrt_llm.hlapi import QuantConfig +from tensorrt_llm.llmapi import QuantConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import GPTJConfig, GPTJForCausalLM from tensorrt_llm.quantization import QuantAlgo diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index ee73d169c..e876bf8dd 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index 593f3125c..3c899267c 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index 4fa525358..7254ae638 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,6 +1,6 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 640b43eac..2f81769bc 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets==2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/jais/requirements.txt b/examples/jais/requirements.txt index b1baef82c..b924ab56f 100644 --- a/examples/jais/requirements.txt +++ b/examples/jais/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/llama/README.md b/examples/llama/README.md index d373f932a..948f13888 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -48,7 +48,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f * [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset. 
## Support Matrix - * FP16 + * BF16/FP16 * FP8 * INT8 & INT4 Weight-Only * SmoothQuant diff --git a/examples/llama/convert_checkpoint.py b/examples/llama/convert_checkpoint.py index 6443f8513..0edc18184 100644 --- a/examples/llama/convert_checkpoint.py +++ b/examples/llama/convert_checkpoint.py @@ -79,7 +79,7 @@ def parse_arguments(): type=str, nargs='?', default='int8', - choices=['int8', 'int4', 'int4_gptq', 'int4_awq'], + choices=['int8', 'int4', 'int8_gptq', 'int4_gptq', 'int4_awq'], help= 'Define the precision for the weights when using weight-only quantization.' 'You must also use --use_weight_only for that argument to have an impact.' @@ -254,15 +254,36 @@ def parse_arguments(): return args +def precision_to_config(precision, group_size, quant_config) -> QuantConfig: + '''update config dict for weight-only quantization + ''' + quant_config = QuantConfig() + precision_to_algo = { + 'int8': QuantAlgo.W8A16, + 'int4': QuantAlgo.W4A16, + 'int8_gptq': QuantAlgo.W8A16_GPTQ, + 'int4_gptq': QuantAlgo.W4A16_GPTQ, + 'int4_awq': QuantAlgo.W4A16_AWQ + } + quant_config.quant_algo = precision_to_algo.get(precision) + if precision in {'int4_gptq', 'int8_gptq'}: + quant_config.group_size = group_size + quant_config.has_zero_point = True + quant_config.pre_quant_scale = False + elif precision == 'int4_awq': + quant_config.group_size = group_size + quant_config.has_zero_point = False + quant_config.pre_quant_scale = True + return quant_config + + def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: '''return config dict with quantization info based on the command line args ''' quant_config = QuantConfig() if args.use_weight_only: - if args.weight_only_precision == 'int8': - quant_config.quant_algo = QuantAlgo.W8A16 - elif args.weight_only_precision == 'int4': - quant_config.quant_algo = QuantAlgo.W4A16 + quant_config = precision_to_config(args.weight_only_precision, + args.group_size, quant_config) elif args.use_fp8: quant_config.quant_algo = QuantAlgo.FP8 elif args.smoothquant: @@ -288,18 +309,6 @@ def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: if args.fp8_kv_cache: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - if args.weight_only_precision == 'int4_gptq': - quant_config.group_size = args.group_size - quant_config.has_zero_point = True - quant_config.pre_quant_scale = False - quant_config.quant_algo = QuantAlgo.W4A16_GPTQ - - if args.weight_only_precision == 'int4_awq': - quant_config.group_size = args.group_size - quant_config.has_zero_point = False - quant_config.pre_quant_scale = True - quant_config.quant_algo = QuantAlgo.W4A16_AWQ - return quant_config @@ -443,11 +452,13 @@ def convert_and_save_rank(args, rank): load_by_shard=load_by_shard, **override_fields, ) - print(f'Total time of reading and converting {time.time()-tik} s') + print( + f'Total time of reading and converting: {time.time()-tik:.3f} s' + ) tik = time.time() llama.save_checkpoint(args.output_dir, save_config=(rank == 0)) del llama - print(f'Total time of saving checkpoint {time.time()-tik} s') + print(f'Total time of saving checkpoint: {time.time()-tik:.3f} s') execute(args.workers, [convert_and_save_rank] * world_size, args) release_gc() @@ -505,7 +516,7 @@ def main(): assert args.model_dir is not None assert ( args.quant_ckpt_path is not None - and args.weight_only_precision == 'int4_gptq' + and args.weight_only_precision in {'int4_gptq', 'int8_gptq'} ) or args.quant_ckpt_path is None, "only gptq weights only needs this option" convert_and_save_hf(args) diff --git 
a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 44c2d95c5..a06397938 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/llm-api/README.md b/examples/llm-api/README.md index 1dbba4f28..6afcd53fd 100644 --- a/examples/llm-api/README.md +++ b/examples/llm-api/README.md @@ -1,10 +1,10 @@ -# High-level API -We are working on a Python high-level API(HLAPI) for LLM workflow, which is still in incubation and may change later. +# LLM API +We are working on a Python high-level API(LLMAPI) for LLM workflow, which is still in incubation and may change later. Here we show you a preview of how it works and how to use it. Note that the APIs are not stable and we appreciate your patience and understanding as we improve this API. -## HLAPI Supported Model +## LLM API Supported Model * LLaMA (including variants Mistral, Mixtral, InternLM) * GPT (including variants Starcoder-1/2, Santacoder) * Gemma-1/2 @@ -23,7 +23,7 @@ Please install the required packages first: pip install -r requirements.txt ``` -Here is a simple example to show how to use the HLAPI: +Here is a simple example to show how to use the LLMAPI: Firstly, import the `LLM` and `SamplingParams` from the `tensorrt_llm` package, and create an LLM object with a HuggingFace (HF) model directly. Here we use the TinyLlama model as an example, `LLM` will download the model from the HuggingFace model hub automatically. You can also specify local models, either in HF format, TensorRT-LLM engine format or TensorRT-LLM checkpoint format. @@ -65,16 +65,16 @@ The `LLM` class supports four kinds of model inputs: 1. **HuggingFace model name**: triggers a download from the HuggingFace model hub, e.g. `TinyLlama/TinyLlama-1.1B-Chat-v1.0` in the quickstart. 1. **Local HuggingFace models**: uses a locally stored HuggingFace model. -2. **Local TensorRT-LLM engine**: built by `trtllm-build` tool or saved by the HLAPI +2. **Local TensorRT-LLM engine**: built by `trtllm-build` tool or saved by the LLMAPI 3. **Local TensorRT-LLM checkpoints**: converted by `convert_checkpoint.py` script in the examples -All kinds of the model inputs can be seamlessly integrated with the HLAPI, and the `LLM(model=)` construcotr can accommodate models in any of the above formats. +All kinds of the model inputs can be seamlessly integrated with the LLMAPI, and the `LLM(model=)` construcotr can accommodate models in any of the above formats. Let's delve into the preparation of the three kinds of local model formats. ### Option 1: From HuggingFace models -Given its popularity, the TRT-LLM HLAPI chooses to support HuggingFace format as one of the start points, to use the HLAPI on LLaMA models, you need to run the following conversion script provided in [transformers/llama](https://huggingface.co/docs/transformers/main/model_doc/llama) or [transformers/llama2](https://huggingface.co/docs/transformers/main/model_doc/llama2) to convert the Meta checkpoint to HuggingFace format. 
+Given its popularity, the TRT-LLM LLMAPI chooses to support HuggingFace format as one of the start points, to use the LLMAPI on LLaMA models, you need to run the following conversion script provided in [transformers/llama](https://huggingface.co/docs/transformers/main/model_doc/llama) or [transformers/llama2](https://huggingface.co/docs/transformers/main/model_doc/llama2) to convert the Meta checkpoint to HuggingFace format. For instance, when targeting the LLaMA2 7B model, the official way to retrieve the model is to visit the [LLaMA2 model page](https://huggingface.co/docs/transformers/main/en/model_doc/llama2), normally you need to submit a request for the model file. @@ -91,13 +91,13 @@ python /src/transformers/models/llama/convert_llama_weights_to --input_dir Llama-2-7b --model_size 7B --output_dir llama-hf-7b ``` -That should produce a HuggingFace format model in `./llama-hf-7b`, which could be used by the HLAPI. +That should produce a HuggingFace format model in `./llama-hf-7b`, which could be used by the LLMAPI. ### Option 2: From TensorRT-LLM engine There are two ways to build the TensorRT-LLM engine: 1. You can build the TensorRT-LLM engine from the HuggingFace model directly with the `trtllm-build` tool, and save the engine to disk for later use. Please consult the LLaMA's [README](../llama/README.md). -2. Use the HLAPI to save one: +2. Use the LLMAPI to save one: ```python llm = LLM() @@ -108,7 +108,7 @@ llm.save() ### Option 3: From TensorRT-LLM checkpoint In each model example, there is a `convert_checkpoint.py` to convert third-party models to TensorRT-LLM checkpoint for further usage. -The HLAPI could seamlessly accept the checkpoint, and build the engine in the backend. +The LLMAPI could seamlessly accept the checkpoint, and build the engine in the backend. For step-by-step guidance on checkpoint conversion, please refer to the LLaMA's [README](../llama/README.md). @@ -156,7 +156,7 @@ By simply setting several flags in the `LLM`, TensorRT-LLM can quantize the Hugg ``` python -from tensorrt_llm.hlapi import QuantConfig, QuantAlgo +from tensorrt_llm.llmapi import QuantConfig, QuantAlgo quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ) @@ -166,17 +166,17 @@ llm = LLM(, quant_config=quant_config) ## Parallelism ### Tensor Parallelism -It is easy to enable Tensor Parallelism in the HLAPI. For example, setting `parallel_config.tp_size=2` to perform a 2-way parallelism: +It is easy to enable Tensor Parallelism in the LLMAPI. For example, setting `parallel_config.tp_size=2` to perform a 2-way parallelism: ```python -from tensorrt_llm.hlapi import LLM +from tensorrt_llm.llmapi import LLM llm = LLM(, tensor_parallel_size=2) ``` ### Pipeline Parallelism -Similar to Tensor Parallelism, you can enable Pipeline Parallelism in the HLAPI with following code: +Similar to Tensor Parallelism, you can enable Pipeline Parallelism in the LLMAPI with following code: ```python llm = LLM(, @@ -236,7 +236,7 @@ With SamplingParams, you can customize the sampling strategy, such as beam searc To enable beam search with a beam size of 4, set the `sampling_params` as follows: ```python -from tensorrt_llm.hlapi import LLM, SamplingParams, BuildConfig +from tensorrt_llm.llmapi import LLM, SamplingParams, BuildConfig build_config = BuildConfig() build_config.max_beam_width = 4 @@ -269,12 +269,23 @@ llm = LLM(, max_beam_width=4)) ``` +### Fast build +The `fast_build` is an experimental feature that speeds up engine building. It can be enabled by setting argument `fast_build` to `True`. 
For example: + +```python +llm = LLM(, + fast_build=True) +``` + +Notice that `fast_build` currently does not work with int8/int4 quantization. + + ### Runtime customization Similar to `build_config`, you can also customize the runtime configuration with the `runtime_config`, `peft_cache_config` or other arguments borrowed from the lower-level APIs. For example: ```python -from tensorrt_llm.hlapi import LLM, KvCacheConfig +from tensorrt_llm.llmapi import LLM, KvCacheConfig llm = LLM(, kv_cache_config=KvCacheConfig( @@ -317,11 +328,11 @@ RequestOutput(request_id=1, prompt=None, prompt_token_ids=[1, 15043, 29892, 590, Note that the `text` field in `CompletionOutput` is empty since the tokenizer is deactivated. ### Build caching -Although the HLAPI runs the engine building in the background, you can also cache the built engine to disk and load it in the next run to save the engine building time. +Although the LLMAPI runs the engine building in the background, you can also cache the built engine to disk and load it in the next run to save the engine building time. To enable the build cache, there are two ways to do it: -1. Use the environment variable: `export TLLM_HLAPI_BUILD_CACHE=1` to enable the build cache globally, and optionally export `TLLM_HLAPI_BUILD_CACHE_ROOT` to specify the cache root directory. +1. Use the environment variable: `export TLLM_LLMAPI_BUILD_CACHE=1` to enable the build cache globally, and optionally export `TLLM_LLMAPI_BUILD_CACHE_ROOT` to specify the cache root directory. 2. Pass the `enable_build_cache` to the `LLM` constructor The build cache will reuse the built engine if all the building settings are the same, or it will rebuild the engine. diff --git a/examples/llm-api/llm_logits_processor.py b/examples/llm-api/llm_logits_processor.py new file mode 100644 index 000000000..33e46657a --- /dev/null +++ b/examples/llm-api/llm_logits_processor.py @@ -0,0 +1,51 @@ +### Control generated text using logits post processor +import typing as tp + +import torch + +from tensorrt_llm import LLM, SamplingParams + + +# Define the logits post-processor callback. This simple callback will output +# a specific token at each step irrespective of prompt. +# Refer to ../bindings/executor/example_logits_processor.py for a more +# sophisticated callback that generates JSON structured output. 
+def logits_post_processor(req_id: int, logits: torch.Tensor, + ids: tp.List[tp.List[int]], stream_ptr: int, + client_id: tp.Optional[int]): + target_token_id = 42 + with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)): + logits[:] = float("-inf") + logits[..., target_token_id] = 0 + + +# Several callbacks can be specified when initializing LLM +llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + logits_post_processor_map={"my_logits_pp": logits_post_processor}) + +# Sample prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", +] + +# Generate text +for prompt_id, prompt in enumerate(prompts): + # We will use logits post processor callback only for odd-numbered prompts + if prompt_id % 2 == 0: + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + else: + # Each prompt can use one callback from the choices that were provided to LLM + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + logits_post_processor_name='my_logits_pp') + + for output in llm.generate([prompt], sampling_params): + print( + f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}" + ) + +# Got output like +# Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming' +# Prompt: 'The president of the United States is', Generated text: "''''''''''''''''''''''''''''''''" diff --git a/examples/llm-api/llm_quantization.py b/examples/llm-api/llm_quantization.py index e746f0ef1..9f048bc3e 100644 --- a/examples/llm-api/llm_quantization.py +++ b/examples/llm-api/llm_quantization.py @@ -4,7 +4,7 @@ import torch from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.hlapi import QuantAlgo, QuantConfig +from tensorrt_llm.llmapi import QuantAlgo, QuantConfig major, minor = torch.cuda.get_device_capability() post_ada = major > 8 or (major == 8 and minor >= 9) diff --git a/examples/llm-api/requirements.txt b/examples/llm-api/requirements.txt index 7ac907aa0..0b26e914d 100644 --- a/examples/llm-api/requirements.txt +++ b/examples/llm-api/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 diff --git a/examples/mamba/convert_checkpoint.py b/examples/mamba/convert_checkpoint.py index d0de466e2..0e48c97fb 100644 --- a/examples/mamba/convert_checkpoint.py +++ b/examples/mamba/convert_checkpoint.py @@ -1,336 +1,95 @@ import argparse -import copy import json import os -import re import time -from dataclasses import dataclass, field -from enum import Enum +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -from typing import List, Union - -import safetensors.torch -import torch -from transformers import AutoConfig, AutoModelForCausalLM import tensorrt_llm from tensorrt_llm import logger -from tensorrt_llm.models.convert_utils import (iterate_shard_files, - load_state_dict) - - -class CheckpointType(str, Enum): - mistral_inference = "mistral_inference" - state_spaces = "state_spaces" - hf = "hf" +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import MambaForCausalLM +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization import QuantAlgo def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument("--ckpt_type", - type=CheckpointType, - choices=list(CheckpointType), - default=CheckpointType.hf, - 
help='Checkpoint type') + parser.add_argument('--model_dir', type=Path, default=None) parser.add_argument("--world_size", type=int, default=1, help="world size, only support tensor parallelism now") + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') parser.add_argument('--dtype', type=str, default='float16', choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) parser.add_argument( '--output_dir', type=Path, default='mamba_tllm_checkpoint', help='The path to save the mamba TensorRT-LLM checkpoint') parser.add_argument('--log_level', type=str, default='info') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') args = parser.parse_args() return args -def get_weight(config, prefix, dtype): - return config[prefix + '.weight'].to(dtype).detach() - - -def get_bias(config, prefix, dtype): - if (prefix + '.bias') in config: - return config[prefix + '.bias'].to(dtype).detach() - return None - - -def get_weight_and_bias(config, prefix, dtype_w, dtype_b): - return get_weight(config, prefix, - dtype_w), get_bias(config, prefix, dtype_b) - - -def get_tllm_linear_weight(weight, prefix, bias=None): - results = {} - results[prefix + 'weight'] = weight.contiguous() - if bias is not None: - results[prefix + 'bias'] = bias - return results - - -def split(v, tp_size, idx, dim=0): - assert v.shape[dim] % tp_size == 0 - split_size = v.shape[dim] // tp_size - if tp_size == 1: - return v - return torch.split(v, split_size, dim=dim)[idx] - - -def convert_hf_mamba(hf_mamba, - rank=0, - dtype='float32', - mamba_version: str = 'Mamba1'): - weights = {} - tik = time.time() - - model_params = dict(hf_mamba.named_parameters()) - dtype = getattr(torch, dtype) - - # Parameter names in mamba block - for l in range(hf_mamba.config.num_hidden_layers): - # ssm layer - prefix = f'backbone.layers.{l}.mixer.' - tllm_prex = f'backbone.layers.{l}.ssm.' 
- for layer in ['conv1d', 'x_proj', 'dt_proj', 'out_proj']: - dtype_b = torch.float32 if layer == 'dt_proj' else dtype - weight, bias = get_weight_and_bias(model_params, prefix + layer, - dtype, dtype_b) - if layer == 'conv1d': - weight = weight.unsqueeze(3) - tllm_weight_name = tllm_prex + layer + '.weight' - tllm_bias_name = tllm_prex + ('dt_bias' if layer == 'dt_proj' else - layer + '.bias') - weights[tllm_weight_name] = weight - if bias is not None: - weights[tllm_bias_name] = bias - # in_proj - weight, bias = get_weight_and_bias(model_params, prefix + 'in_proj', - dtype, dtype) - in_proj_weights = torch.split(weight, weight.size(0) // 2, dim=0) - tllm_weight_name = tllm_prex + 'in_proj.weight' - weights[tllm_weight_name.replace('proj', 'proj_x')] = in_proj_weights[0] - weights[tllm_weight_name.replace('proj', 'proj_z')] = in_proj_weights[1] - if bias is not None: - in_proj_biases = torch.split(bias, bias.size(0) // 2, dim=0) - tllm_bias_name = tllm_prex + 'in_proj.bias' - weights[tllm_bias_name.replace('proj', - 'proj_x')] = in_proj_biases[0] - weights[tllm_bias_name.replace('proj', - 'proj_x')] = in_proj_biases[1] - - # A and D - Aparam = model_params[prefix + 'A_log'].float().detach() - Aparam = Aparam.permute(1, 0).contiguous() - weights[tllm_prex + 'A'] = -torch.exp(Aparam) - weights[tllm_prex + 'D'] = model_params[prefix + 'D'].float().detach() - # norm - prefix = f'backbone.layers.{l}.norm' - tllm_prex = f'backbone.layers.{l}.input_layernorm.' - weight, bias = get_weight_and_bias(model_params, prefix, dtype, dtype) - weights[tllm_prex + 'weight'] = weight - if bias is not None: - weights[tllm_prex + 'bias'] = bias - - # others - for layer in ['backbone.embeddings', 'backbone.norm_f']: - weight, bias = get_weight_and_bias(model_params, layer, dtype, dtype) - layer = layer.replace('embeddings', 'vocab_embedding') - layer = layer.replace('norm_f', 'ln_f') - weights[layer + '.weight'] = weight - if bias is not None: - weights[layer + '.bias'] = bias - weights['lm_head.weight'], _ = get_weight_and_bias(model_params, - 'backbone.embeddings', - dtype, dtype) - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - print(f'Weights loaded. Total time: {t}') - return weights - - -def rename_hf_to_tllm(name: str): - """ Rename a HF parameter name by the corresponding TRT-LLM style name. """ - # remove model - if 'model.' in name: - name = name.replace('model.', '') - - # change layer name - if 'embeddings.' in name: - name = name.replace('embeddings', 'vocab_embedding') - elif 'embedding.' in name: - name = name.replace('embedding', 'vocab_embedding') - norm_pattern = r'\d\.norm\.' - if 'mixer.' in name: - name = name.replace('mixer.', 'ssm.') - elif re.search(norm_pattern, name): - name = name.replace('norm.', 'input_layernorm.') - elif 'norm_f.' 
in name: - name = name.replace('norm_f.', 'ln_f.') - - # Parameter names in ssm layers - if 'A_log' in name: - name = name.replace('A_log', 'A') - elif 'dt_proj.bias' in name: - name = name.replace('dt_proj.bias', 'dt_bias') - return name - - -def convert_from_hf_checkpoint(mamba_config: dict, - model_dir: Union[str, Path], - rank=0, - dtype: Union[str, torch.dtype] = torch.float32, - mamba_version: str = 'Mamba1'): - logger.info('Loading weights from HF Mamba...') - tik = time.time() - - tp_rank = rank - tp_size = mamba_config['mapping']['tp_size'] - d_inner = mamba_config['rnn_hidden_size'] - d_state = mamba_config['state_size'] - weights = {} - if isinstance(dtype, str): - dtype = tensorrt_llm.str_dtype_to_torch(dtype) - - for model_file in iterate_shard_files(model_dir, 0): - logger.debug(f'Loading file {str(model_file)}...') - model_params = load_state_dict(model_file, dtype=dtype) - for name, param in model_params.items(): - logger.debug(f'Converting weight {name}...') - tllm_name = rename_hf_to_tllm(name) - param = param.detach().cpu() - if 'A_log' in name: - param = -torch.exp(param.float()) - if mamba_version == 'Mamba1': - param = param.permute(1, 0).contiguous() - elif 'D' in name: - param = param.float() - elif 'dt_proj.bias' in name: - param = param.float() - elif 'dt_bias' in name: - param = param.float() - elif 'conv1d.weight' in name: - param = param.unsqueeze(3) - - # split in_proj in Mamba1 - if 'in_proj' in name and mamba_version == 'Mamba1': - in_proj_params = torch.split(param, param.size(0) // 2, dim=0) - weights[tllm_name.replace('proj', 'proj_x')] = in_proj_params[0] - weights[tllm_name.replace('proj', 'proj_z')] = in_proj_params[1] - elif 'in_proj' in name and mamba_version == 'Mamba2': - nheads = d_inner // mamba_config['rnn_head_size'] - ngroups = mamba_config['ngroups'] - in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt = torch.split( - param, [ - d_inner, d_inner, ngroups * d_state, ngroups * d_state, - nheads - ], - dim=0) - in_proj_z = split(in_proj_z, tp_size, tp_rank, dim=0) - in_proj_x = split(in_proj_x, tp_size, tp_rank, dim=0) - in_proj_b = split(in_proj_b, tp_size, tp_rank, dim=0) - in_proj_c = split(in_proj_c, tp_size, tp_rank, dim=0) - in_proj_dt = split(in_proj_dt, tp_size, tp_rank, dim=0) - in_proj = torch.concat( - [in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt]) - weights[tllm_name] = in_proj.contiguous() - elif 'conv1d' in name and mamba_version == 'Mamba2': - ngroups = mamba_config['ngroups'] - conv_x, conv_b, conv_c = torch.split( - param, [d_inner, ngroups * d_state, ngroups * d_state], - dim=0) - conv_x = split(conv_x, tp_size, tp_rank, dim=0) - conv_b = split(conv_b, tp_size, tp_rank, dim=0) - conv_c = split(conv_c, tp_size, tp_rank, dim=0) - conv = torch.concat([conv_x, conv_b, conv_c]) - weights[tllm_name] = conv.contiguous() - elif any(keyword in name for keyword in ( - 'mixer.norm.weight', - 'A_log', - 'D', - 'dt_proj.bias', - 'dt_bias', - )) and mamba_version == 'Mamba2': - weights[tllm_name] = split(param, tp_size, tp_rank, dim=0) - elif 'out_proj' in name and mamba_version == 'Mamba2': - weights[tllm_name] = split(param, tp_size, tp_rank, - dim=1).contiguous() - else: - weights[tllm_name] = param - del model_params - - # lm_head - emb = weights['backbone.vocab_embedding.weight'] - if 'lm_head.weight' not in weights or weights['lm_head.weight'].data_ptr( - ) == emb.data_ptr(): - weights['lm_head.weight'] = copy.deepcopy(emb) - if mamba_version == 'Mamba2': - weights['lm_head.weight'] = split(weights['lm_head.weight'], - 
tp_size, - tp_rank, - dim=0) - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') - return weights - - -def do_convert_from_ckpt(args): - return args.model_dir.exists() - - -def convert(worker_rank, args, convert_args): - convert_from_ckpt = do_convert_from_ckpt(args) - for rank in range(worker_rank, args.world_size): - if convert_from_ckpt: - weights = convert_from_hf_checkpoint(rank=rank, **convert_args) - else: - weights = convert_hf_mamba(rank=rank, **convert_args) - safetensors.torch.save_file(weights, - args.output_dir / f'rank{rank}.safetensors') - - -@dataclass -class MambaConfig: - - architectures: List[str] = field( - default_factory=lambda: ['MambaForCausalLM']) - d_intermediate: int = 0 - vocab_size: int = 50277 - attn_layer_idx: list = field(default_factory=list) - attn_cfg: dict = field(default_factory=dict) - rms_norm: bool = True - residual_in_fp32: bool = True - pad_vocab_size_multiple: int = 8 - hidden_size: int = 2560 - num_hidden_layers: int = 64 - intermediate_size: int = 0 - state_size: int = 128 - conv_kernel: int = 4 - use_bias: bool = False - head_dim: int = 64 - n_groups: int = 1 - chunk_size: int = 256 - ssm_rmsnorm: bool = True - - def update(self, data_dict): - self.__dict__.update(data_dict) - - -def load_config_hf(model_name, ckpt_type): +def load_config_hf(model_name, ckpt_type, dtype, mapping, quant_config, + output_dir): if ckpt_type == CheckpointType.hf: # transformer compatible models - hf_config = AutoConfig.from_pretrained(model_name, - trust_remote_code=True) - mamba_version = 'Mamba2' if hf_config.model_type == 'mamba2' else 'Mamba1' + override_fields = {} + mamba = MambaForCausalLM.from_hugging_face( + model_name, + dtype, + mapping=mapping, + quant_config=quant_config, + **override_fields, + ) + mamba.save_checkpoint(output_dir, save_config=True) + elif ckpt_type == CheckpointType.state_spaces: # state-spaces/mamba models config = json.load(open(os.path.join(model_name, 'config.json'))) + override_fields = {} + mamba = MambaForCausalLM.from_hugging_face( + model_name, + dtype, + mapping=mapping, + quant_config=quant_config, + **override_fields, + ) + mamba.save_checkpoint(output_dir, save_config=True) + ssm_cfg = config.pop('ssm_cfg') cfg_to_mamba_cfg = { 'd_model': 'hidden_size', @@ -355,6 +114,7 @@ def load_config_hf(model_name, ckpt_type): for k in ssm_cfg_to_mamba_cfg: if k in ssm_cfg and ssm_cfg_to_mamba_cfg[k] is not None: config[ssm_cfg_to_mamba_cfg[k]] = ssm_cfg[k] + hf_config = MambaConfig(**config) if 'expand' in ssm_cfg: expand = ssm_cfg['expand'] @@ -362,6 +122,7 @@ def load_config_hf(model_name, ckpt_type): else: hf_config.intermediate_size = 2 * hf_config.hidden_size mamba_version = ssm_cfg.pop("layer", "Mamba1") + elif ckpt_type == CheckpointType.mistral_inference: # mistral inference format config = json.load(open(os.path.join(model_name, 'params.json'))) cfg_to_mamba_cfg = { @@ -384,90 +145,71 @@ def load_config_hf(model_name, ckpt_type): else: hf_config.intermediate_size = 2 * hf_config.hidden_size mamba_version = 'Mamba2' + return hf_config, mamba_version +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + 
assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." + + +def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: + '''return config dict with quantization info based on the command line args + ''' + quant_config = QuantConfig() + if args.use_weight_only: + if args.weight_only_precision == 'int8': + quant_config.quant_algo = QuantAlgo.W8A16 + elif args.weight_only_precision == 'int4': + quant_config.quant_algo = QuantAlgo.W4A16 + + return quant_config + + def main(): print(tensorrt_llm.__version__) args = parse_arguments() logger.set_level(args.log_level) tik = time.time() + assert args.pp_size == 1, "Pipeline parallelism is not supported." + world_size = args.tp_size * args.pp_size args.output_dir.mkdir(exist_ok=True, parents=True) - hf_config, mamba_version = load_config_hf(args.model_dir, args.ckpt_type) - - vocab_size = hf_config.vocab_size - pad_vocab_size_multiple = getattr(hf_config, "pad_vocab_size_multiple", 1) - if vocab_size % pad_vocab_size_multiple != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % - pad_vocab_size_multiple) - - config = { - 'architecture': 'MambaForCausalLM', - 'dtype': args.dtype, - 'logits_dtype': 'float32', - 'hidden_size': hf_config.hidden_size, - 'num_hidden_layers': hf_config.num_hidden_layers, - 'layer_types': ['recurrent'], - 'vocab_size': vocab_size, - 'rms_norm': hf_config.rms_norm, - 'residual_in_fp32': hf_config.residual_in_fp32, - 'pad_vocab_size_multiple': pad_vocab_size_multiple, - 'hidden_act': 'silu', - 'num_attention_heads': args.world_size, - 'rnn_hidden_size': hf_config.intermediate_size, - 'rnn_conv_dim_size': hf_config.intermediate_size, - 'state_size': hf_config.state_size, - 'conv_kernel': hf_config.conv_kernel, - 'use_bias': hf_config.use_bias, - 'mamba_version': mamba_version, - 'mapping': { - 'world_size': args.world_size, - 'tp_size': args.world_size, - 'pp_size': 1 - }, - } - if mamba_version == 'Mamba2': - conv_dim = hf_config.intermediate_size + 2 * hf_config.n_groups * hf_config.state_size - ssm_rmsnorm = getattr(hf_config, "ssm_rmsnorm", hf_config.rms_norm) - mamba2_cfg = { - 'rnn_head_size': hf_config.head_dim, - 'rnn_conv_dim_size': conv_dim, - 'ngroups': hf_config.n_groups, - 'chunk_size': hf_config.chunk_size, - 'ssm_rmsnorm': ssm_rmsnorm, - } - config.update(mamba2_cfg) - - with (args.output_dir / 'config.json').open('w') as f: - json.dump(config, f, indent=4) - - convert_from_ckpt = do_convert_from_ckpt(args) - # TODO: Add convert_hf_mamba support for Mamba2 when transformers can support Mamba2 models - assert convert_from_ckpt or mamba_version == 'Mamba2', "Mamba2 can only support convert from checkpoints." - assert args.world_size == 1 or mamba_version == 'Mamba2', "Mamba1 can not support tensor parallelism." 
- if not convert_from_ckpt: - logger.info(f'Convert by using model') - hf_mamba = AutoModelForCausalLM.from_pretrained(args.model_dir, - device_map="auto", - torch_dtype="auto", - trust_remote_code=True) - else: - logger.info(f'Convert by using checkpoint') - hf_mamba = None + quant_config = args_to_quant_config(args) - convert_args = dict(dtype=args.dtype, ) + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) - if convert_from_ckpt: - convert_args['model_dir'] = args.model_dir - else: - convert_args['hf_mamba'] = hf_mamba - convert_args['mamba_version'] = mamba_version - convert_args['mamba_config'] = config + mamba = MambaForCausalLM.from_hugging_face( + args.model_dir, + args.dtype, + mapping=mapping, + quant_config=quant_config, + ) + mamba.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del mamba - convert(0, args, convert_args) + execute(args.workers, [convert_and_save_rank] * world_size, args) tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 7adcda04f..3991da794 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 transformers>=4.39.0 datasets~=2.14.5 evaluate diff --git a/examples/medusa/README.md b/examples/medusa/README.md index 1ef2d4377..bb100a5cf 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -152,3 +152,7 @@ mpirun -np 4 --allow-run-as-root --oversubscribe \ --temperature 1.0 \ --batch_size 1 ``` + +### Medusa with Qwen2 + +To use Medusa with Qwen2 models, pass `--model_type qwen2` to `convert_checkpoint.py`. You have to provide a Qwen2 model checkpoint and the Medusa heads. After the TRT-LLM checkpoint is generated, `trtllm-build` and `../run.py` use the same arguments as for LLaMA models. diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index 9c517f991..94278260e 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -8,12 +8,14 @@ import safetensors import torch -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer +from transformers import (LlamaConfig, LlamaForCausalLM, LlamaTokenizer, + Qwen2Config) import tensorrt_llm +from tensorrt_llm._utils import numpy_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import PretrainedConfig +from tensorrt_llm.models import PretrainedConfig, QWenForCausalLM from tensorrt_llm.models.convert_utils import load_calib_dataset from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard from tensorrt_llm.models.medusa.weight import (capture_activation_range, @@ -174,6 +176,7 @@ def parse_arguments(): parser.add_argument('--max_medusa_token_len', type=int, default=63) parser.add_argument('--medusa_hidden_act', type=str, default="silu") parser.add_argument('--medusa_model_dir', type=str, default=None) + parser.add_argument('--model_type', type=str, default="llama") args = parser.parse_args() return args @@ -186,15 +189,17 @@ def parse_arguments(): # the op with PyTorch.
print(tensorrt_llm.__version__) args = parse_arguments() + assert args.model_type in ["llama", "mixtral", + "qwen2"], "Invalid model type" world_size = args.tp_size * args.pp_size tik = time.time() if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - hf_config = None if args.model_dir is not None: - hf_config = LlamaConfig.from_pretrained(args.model_dir) + config_cls = Qwen2Config if args.model_type == "qwen2" else LlamaConfig + hf_config = config_cls.from_pretrained(args.model_dir) args.model_type = hf_config.model_type args.n_head = hf_config.num_attention_heads @@ -205,6 +210,7 @@ def parse_arguments(): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.n_positions = hf_config.max_position_embeddings + args.rotary_base = hf_config.rope_theta elif args.meta_ckpt_dir is not None: @@ -266,8 +272,11 @@ def parse_arguments(): 'share_embedding_table': args.use_embedding_sharing, 'max_draft_len': args.max_medusa_token_len, 'num_medusa_heads': args.num_medusa_heads, - 'num_medusa_layers': args.num_medusa_layers + 'num_medusa_layers': args.num_medusa_layers, + 'model_type': args.model_type, } + if args.model_type == "qwen2": + config['qwen_type'] = args.model_type if args.use_weight_only: if args.weight_only_precision == 'int8': @@ -315,13 +324,16 @@ def parse_arguments(): llama_smoother = {} model = None if args.model_dir is not None: - hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM - - model = hf_model.from_pretrained( - args.model_dir, - torch_dtype='auto', - device_map='auto' if not args.load_model_on_cpu else 'cpu', - trust_remote_code=True) + if args.model_type == "qwen2": + model = QWenForCausalLM.from_hugging_face(args.model_dir, + args.dtype) + else: + hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM + model = hf_model.from_pretrained( + args.model_dir, + torch_dtype='auto', + device_map='auto' if not args.load_model_on_cpu else 'cpu', + trust_remote_code=True) if args.smoothquant is not None or args.int8_kv_cache: os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( @@ -360,23 +372,31 @@ def covert_and_save(rank, convert_args): args.model_dir, PretrainedConfig.from_dict(config)) else: - weights = convert_hf_llama( - convert_args['hf_model'], - mapping, - rank, - dtype=args.dtype, - use_weight_only=args.use_weight_only, - plugin_weight_only_quant_type=plugin_weight_only_quant_type, - use_parallel_embedding=args.use_parallel_embedding, - sharding_dim=args.embedding_sharding_dim, - share_embedding_table=args.use_embedding_sharing, - use_smooth_quant=args.smoothquant, - per_channel=args.per_channel, - per_token=args.per_token, - int8_kv_cache=args.int8_kv_cache, - act_range=convert_args['act_range'], - qkv_para=convert_args['llama_qkv_para'], - smoother=convert_args['llama_smoother']) + if args.model_type == "qwen2": + weights = { + name: numpy_to_torch(param.raw_value) + for name, param in + convert_args['hf_model'].named_parameters() + } + else: + weights = convert_hf_llama( + convert_args['hf_model'], + mapping, + rank, + dtype=args.dtype, + use_weight_only=args.use_weight_only, + plugin_weight_only_quant_type= + plugin_weight_only_quant_type, + use_parallel_embedding=args.use_parallel_embedding, + sharding_dim=args.embedding_sharding_dim, + share_embedding_table=args.use_embedding_sharing, + use_smooth_quant=args.smoothquant, + per_channel=args.per_channel, + per_token=args.per_token, + int8_kv_cache=args.int8_kv_cache, + act_range=convert_args['act_range'], + 
qkv_para=convert_args['llama_qkv_para'], + smoother=convert_args['llama_smoother']) if args.medusa_model_dir is not None: config_file = Path(args.medusa_model_dir) / "config.json" diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index 5ac8a0ffb..aebdd4207 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 7c4024262..8bf182455 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/model_api/llama.py b/examples/model_api/llama.py index 5ca11b98a..5ca107e20 100644 --- a/examples/model_api/llama.py +++ b/examples/model_api/llama.py @@ -7,7 +7,7 @@ import tensorrt_llm from tensorrt_llm import BuildConfig, build from tensorrt_llm.executor import GenerationExecutor -from tensorrt_llm.hlapi import SamplingParams +from tensorrt_llm.llmapi import SamplingParams from tensorrt_llm.models import LLaMAForCausalLM diff --git a/examples/model_api/llama_quantize.py b/examples/model_api/llama_quantize.py index 699229181..e716843ce 100644 --- a/examples/model_api/llama_quantize.py +++ b/examples/model_api/llama_quantize.py @@ -7,7 +7,7 @@ import tensorrt_llm from tensorrt_llm import BuildConfig, build from tensorrt_llm.executor import GenerationExecutor -from tensorrt_llm.hlapi import SamplingParams +from tensorrt_llm.llmapi import SamplingParams from tensorrt_llm.models import LLaMAForCausalLM from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization import QuantAlgo diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index ee73d169c..e876bf8dd 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index 79128ee0d..7219b68e8 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 nemo-toolkit[all]==2.0.0rc1 megatron-core==0.8.0 datasets~=2.14.5 diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index ee73d169c..e876bf8dd 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index d145002d2..74da63c53 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt 
index 34dfb19dd..57f140ee5 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 46dbe9896..5507f6c6d 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 9f6671eb3..980ac2840 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index d589b27fe..ad2fbc00f 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 git+https://github.com/google-deepmind/recurrentgemma.git flax>=0.8.2 jax~=0.4.23 diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index 5ac8a0ffb..aebdd4207 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index 065e4c4f4..df00a9f71 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index 44c2d95c5..a06397938 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/whisper/README.md b/examples/whisper/README.md index 80bed9915..81f653bf4 100755 --- a/examples/whisper/README.md +++ b/examples/whisper/README.md @@ -116,6 +116,24 @@ For pure C++ runtime, there is no example given yet. Please check the [`Executor For pure Python runtime, you can simply add the `--use_py_session` option. +#### Advanced Usage + +`--padding_strategy` +OpenAI's official Whisper models accept WAV files of up to 30 seconds in length. For files shorter than 30 seconds, padding is required to reach the 30-second mark, which may not be efficient. Currently, three padding strategies are supported: + +1. **max (default)**: Pads to 30 seconds. +2. **longest**: Pads according to the longest duration in the current batch. +3. **zero**: No padding is applied. You will need to fine-tune the Whisper model to maintain accuracy. 
See [examples](https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/whisper/whisper_encoder_forward_monkey_patch.py#L15). + +`--text_prefix` +You can modify the input prompt for the Whisper decoder. For example, use `<|startoftranscript|><|en|><|zh|><|transcribe|><|notimestamps|>` to perform code-switching ASR between Chinese and English. + +`--compute_cer` +Calculates the character error rate (CER) instead of the word error rate (WER) for languages such as Chinese and Japanese. + +`--dataset`, `--dataset_name`, and `--dataset_split` +These options allow you to select different decoding audio datasets from Hugging Face. + ### Distil-Whisper TensorRT-LLM also supports using [distil-whisper's](https://github.com/huggingface/distil-whisper) different models by first converting their params and weights from huggingface's naming format to [openai whisper](https://github.com/openai/whisper) naming format. You can do so by running the script [distil_whisper/convert_from_distil_whisper.py](./convert_from_distil_whisper.py) as follows: diff --git a/examples/whisper/convert_checkpoint.py b/examples/whisper/convert_checkpoint.py index 31ae15cf9..78b571c04 100644 --- a/examples/whisper/convert_checkpoint.py +++ b/examples/whisper/convert_checkpoint.py @@ -35,6 +35,7 @@ def parse_arguments(): type=str, default="large-v3", choices=[ + "large-v3-turbo", "large-v3", "large-v2", "medium", @@ -94,8 +95,9 @@ def get_encoder_config(model_metadata: dict, dtype: str, 'num_hidden_layers': model_metadata['n_audio_layer'], 'num_attention_heads': model_metadata['n_audio_head'], 'hidden_size': model_metadata['n_audio_state'], + 'max_position_embeddings': model_metadata['n_audio_ctx'], + 'has_position_embedding': True, 'n_mels': model_metadata['n_mels'], - 'n_audio_ctx': model_metadata['n_audio_ctx'], 'vocab_size': model_metadata['n_vocab'], 'hidden_act': "gelu", 'num_languages': num_languages, @@ -167,7 +169,7 @@ def sinusoids(length, channels, max_timescale=10000): torch.cos(scaled_time)], dim=1) - weights['positional_embedding'] = sinusoids( + weights['position_embedding.weight'] = sinusoids( model_metadata['n_audio_ctx'], model_metadata['n_audio_state']).contiguous() @@ -393,6 +395,8 @@ def convert_openai_whisper_decoder(model_metadata: dict, print(f"Loaded model from {model_path}") model_metadata = model['dims'] model_state_dict = model['model_state_dict'] + for param_tensor in model_state_dict: + model_state_dict[param_tensor] = model_state_dict[param_tensor].half() def convert_and_save(component: str = "encoder"): # call get_encoder_config or get_decoder_config according to component diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index 4b3670a10..6f76c44b8 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100800 +tensorrt_llm==0.15.0.dev2024101500 tiktoken datasets kaldialign diff --git a/examples/whisper/run.py b/examples/whisper/run.py index 567b18a5f..0f657a2a0 100755 --- a/examples/whisper/run.py +++ b/examples/whisper/run.py @@ -14,6 +14,7 @@ # limitations under the License. 
import argparse import json +import math import re import time from collections import OrderedDict @@ -25,8 +26,8 @@ from tokenizer import get_tokenizer from torch.utils.data import DataLoader from whisper.normalizers import EnglishTextNormalizer -from whisper_utils import (N_SAMPLES, log_mel_spectrogram, pad_or_trim, - store_transcripts, write_error_stats) +from whisper_utils import (log_mel_spectrogram, store_transcripts, + write_error_stats) import tensorrt_llm import tensorrt_llm.logger as logger @@ -50,6 +51,14 @@ def parse_arguments(): parser.add_argument('--dataset', type=str, default="hf-internal-testing/librispeech_asr_dummy") + parser.add_argument( + '--dataset_name', + type=str, + default="clean", + help= + "dataset configuration name in the dataset, see https://huggingface.co/docs/datasets/v3.0.0/en/package_reference/loading_methods#datasets.load_dataset" + ) + parser.add_argument('--dataset_split', type=str, default="validation") parser.add_argument('--name', type=str, default="librispeech_dummy_benchmark") @@ -67,11 +76,37 @@ def parse_arguments(): parser.add_argument('--use_py_session', action='store_true', help="use python session or cpp session") + parser.add_argument( + "--compute_cer", + action="store_true", + default=False, + help="""True to compute character error rate (CER), e.g., for Chinese. + False to compute word error rate (WER), e.g., for English words. + """, + ) + parser.add_argument( + "--text_prefix", + default="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + help="""Text prefix to be used for decoding. Default is for English ASR. + """, + ) + parser.add_argument( + "--padding_strategy", + default="max", + help= + """1. max: pad to the 30s, using the option if the model is trained with max padding e.g. openai official models, + 2. longest: pad to the longest sequence in the batch, + 3. 
zero: no padding, only works with cpp session, + """, + ) return parser.parse_args() -def remove_tensor_padding(input_tensor, input_tensor_lengths=None, pad_value=0): - if input_tensor.dim() == 2: +def remove_tensor_padding(input_tensor, + input_tensor_lengths=None, + pad_value=None): + if pad_value: + assert input_tensor_lengths is None, "input_tensor_lengths should be None when pad_value is provided" # Text tensor case: batch, seq_len assert torch.all( input_tensor[:, 0] != pad_value @@ -84,24 +119,20 @@ def remove_tensor_padding(input_tensor, input_tensor_lengths=None, pad_value=0): # Apply the mask to input_tensor to remove pad tokens output_tensor = input_tensor[mask].view(1, -1) - elif input_tensor.dim() == 3: + else: # Audio tensor case: batch, seq_len, feature_len + # position_ids case: batch, seq_len assert input_tensor_lengths is not None, "input_tensor_lengths must be provided for 3D input_tensor" - batch_size, seq_len, feature_len = input_tensor.shape # Initialize a list to collect valid sequences valid_sequences = [] - for i in range(batch_size): + for i in range(input_tensor.shape[0]): valid_length = input_tensor_lengths[i] - valid_sequences.append(input_tensor[i, :valid_length, :]) + valid_sequences.append(input_tensor[i, :valid_length]) # Concatenate all valid sequences along the batch dimension output_tensor = torch.cat(valid_sequences, dim=0) - - else: - raise ValueError("Input tensor must have 2 or 3 dimensions") - return output_tensor @@ -135,20 +166,31 @@ def get_audio_features(self, mel, mel_input_lengths, encoder_downsampling_factor=2): + if isinstance(mel, list): + mel = torch.cat(mel, dim=0).type(str_dtype_to_torch("float16")) + bsz, seq_len = mel.shape[0], mel.shape[2] + position_ids = torch.arange( + math.ceil(seq_len / encoder_downsampling_factor), + dtype=torch.int32, + device=mel.device).expand(bsz, -1).contiguous() if self.encoder_config['plugin_config']['remove_input_padding']: # mel B,D,T -> B,T,D -> BxT, D mel = mel.transpose(1, 2) mel = remove_tensor_padding(mel, mel_input_lengths) - + position_ids = remove_tensor_padding(position_ids, + mel_input_lengths) inputs = OrderedDict() inputs['input_features'] = mel inputs['input_lengths'] = mel_input_lengths + inputs['position_ids'] = position_ids output_list = [ TensorInfo('input_features', str_dtype_to_trt(self.dtype), mel.shape), TensorInfo('input_lengths', str_dtype_to_trt('int32'), - mel_input_lengths.shape) + mel_input_lengths.shape), + TensorInfo('position_ids', str_dtype_to_trt('int32'), + inputs['position_ids'].shape) ] output_info = (self.session).infer_shapes(output_list) @@ -168,7 +210,6 @@ def get_audio_features(self, stream.synchronize() encoder_output = outputs['encoder_output'] encoder_output_lengths = mel_input_lengths // encoder_downsampling_factor - return encoder_output, encoder_output_lengths @@ -282,6 +323,7 @@ def __init__(self, engine_dir, debug_mode=False, assets_dir=None, + batch_size=64, use_py_session=False): world_size = 1 runtime_rank = tensorrt_llm.mpi_rank() @@ -318,10 +360,10 @@ def __init__(self, assert json_config.model_config.supports_inflight_batching runner_kwargs = dict(engine_dir=engine_dir, is_enc_dec=True, - max_batch_size=16, + max_batch_size=batch_size, max_input_len=3000, max_output_len=96, - max_beam_width=4, + max_beam_width=1, debug_mode=debug_mode, kv_cache_free_gpu_memory_fraction=0.9) self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs) @@ -337,7 +379,7 @@ def process_batch( prompt_id = self.tokenizer.encode( text_prefix, 
allowed_special=self.tokenizer.special_tokens_set) prompt_id = torch.tensor(prompt_id) - batch_size = mel.shape[0] + batch_size = len(mel) decoder_input_ids = prompt_id.repeat(batch_size, 1) if self.use_py_session: encoder_output, encoder_output_lengths = self.encoder.get_audio_features( @@ -352,9 +394,17 @@ def process_batch( num_beams=num_beams) else: with torch.no_grad(): + if isinstance(mel, list): + mel = [ + m.transpose(1, 2).type( + str_dtype_to_torch("float16")).squeeze(0) + for m in mel + ] + else: + mel = mel.transpose(1, 2) outputs = self.model_runner_cpp.generate( batch_input_ids=decoder_input_ids, - encoder_input_features=mel.transpose(1, 2), + encoder_input_features=mel, encoder_output_lengths=mel_input_lengths // 2, max_new_tokens=max_new_tokens, end_id=self.eot_id, @@ -379,7 +429,8 @@ def decode_wav_file( batch_size=1, num_beams=1, normalizer=None, - mel_filters_dir=None): + mel_filters_dir=None, + padding_strategy="longest"): mel, total_duration = log_mel_spectrogram(input_file_path, model.n_mels, device='cuda', @@ -389,12 +440,15 @@ def decode_wav_file( mel = mel.unsqueeze(0) # repeat the mel spectrogram to match the batch size mel = mel.repeat(batch_size, 1, 1) - # TODO: use the actual input_lengths rather than padded input_lengths - feature_input_lengths = torch.full((mel.shape[0], ), - mel.shape[2], - dtype=torch.int32, - device=mel.device) - predictions = model.process_batch(mel, feature_input_lengths, text_prefix, + if padding_strategy == "longest": + pass + else: + mel = torch.nn.functional.pad(mel, (0, 3000 - mel.shape[2])) + features_input_lengths = torch.full((mel.shape[0], ), + mel.shape[2], + dtype=torch.int32, + device=mel.device) + predictions = model.process_batch(mel, features_input_lengths, text_prefix, num_beams) prediction = predictions[0] @@ -412,13 +466,15 @@ def collate_wrapper(batch): for item in batch: speech = item["audio"]["array"] duration = speech.shape[-1] - speech = pad_or_trim(speech, N_SAMPLES) speech = speech.astype(np.float32) speech = torch.from_numpy(speech) speeches.append(speech) durations.append(duration) labels.append(item["text"]) - ids.append(item["id"]) + if 'id' in item: + ids.append(item["id"]) + else: + ids.append(item["segment_id"]) return speeches, durations, labels, ids @@ -431,10 +487,10 @@ def decode_dataset( num_beams=1, normalizer=None, sample_rate=16000, - mel_filters_dir=None): - librispeech_dummy = load_dataset(dataset, "clean", split="validation") - - data_loader = DataLoader(librispeech_dummy, + mel_filters_dir=None, + compute_cer=False, + padding_strategy="longest"): + data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=4, pin_memory=True, @@ -448,63 +504,92 @@ def decode_dataset( for wave in waveforms: assert wave.is_pinned() + if padding_strategy == "longest": + longest_duration = max(durations) + elif padding_strategy == "zero": + longest_duration = 0 + else: + longest_duration = int(16000 * 30) + features = [ log_mel_spectrogram(wave, model.n_mels, + padding=longest_duration - wave.shape[-1], device='cuda', mel_filters_dir=mel_filters_dir).unsqueeze(0) for wave in waveforms ] - features = torch.cat(features, dim=0).type(str_dtype_to_torch(dtype)) - # TODO: use the actual input_lengths rather than padded input_lengths - feature_input_lengths = torch.full((features.shape[0], ), - features.shape[2], - dtype=torch.int32, - device=features.device) - predictions = model.process_batch(features, feature_input_lengths, + + # pad to the even number of features, for remove_padding option, conv layer 
padding corner case + for i, feature in enumerate(features): + if feature.shape[2] % 2: + features[i] = torch.nn.functional.pad(feature, (0, 1)) + + features_input_lengths = torch.tensor([f.shape[2] for f in features], + dtype=torch.int32, + device='cuda') + + predictions = model.process_batch(features, features_input_lengths, text_prefix, num_beams) for wav_id, label, prediction in zip(ids, texts, predictions): # remove all special tokens in the prediction prediction = re.sub(r'<\|.*?\|>', '', prediction) if normalizer: prediction, label = normalizer(prediction), normalizer(label) + label = label.split() + prediction = prediction.split() + if compute_cer: + label = list("".join(label)) + prediction = list("".join(prediction)) print(f"wav_id: {wav_id}, label: {label}, prediction: {prediction}") - results.append((wav_id, label.split(), prediction.split())) + results.append((wav_id, label, prediction)) return results, total_duration if __name__ == '__main__': args = parse_arguments() tensorrt_llm.logger.set_level(args.log_level) + if args.padding_strategy == "zero": + assert not args.use_py_session, "zero padding strategy only works with cpp session" model = WhisperTRTLLM(args.engine_dir, args.debug, args.assets_dir, - args.use_py_session) + args.batch_size, args.use_py_session) normalizer = EnglishTextNormalizer() + dataset = load_dataset(args.dataset, + args.dataset_name, + split=args.dataset_split) if args.enable_warmup: results, total_duration = decode_dataset( model, - "hf-internal-testing/librispeech_asr_dummy", + dataset, batch_size=args.batch_size, num_beams=args.num_beams, normalizer=normalizer, - mel_filters_dir=args.assets_dir) + mel_filters_dir=args.assets_dir, + padding_strategy=args.padding_strategy) + start_time = time.time() if args.input_file: results, total_duration = decode_wav_file( args.input_file, model, + text_prefix=args.text_prefix, dtype=args.dtype, batch_size=args.batch_size, num_beams=args.num_beams, - mel_filters_dir=args.assets_dir) + mel_filters_dir=args.assets_dir, + padding_strategy=args.padding_strategy) else: results, total_duration = decode_dataset( model, - args.dataset, + dataset, + text_prefix=args.text_prefix, dtype=args.dtype, batch_size=args.batch_size, num_beams=args.num_beams, normalizer=normalizer, - mel_filters_dir=args.assets_dir) + mel_filters_dir=args.assets_dir, + compute_cer=args.compute_cer, + padding_strategy=args.padding_strategy) elapsed = time.time() - start_time results = sorted(results) diff --git a/examples/whisper/whisper_utils.py b/examples/whisper/whisper_utils.py index f65f44d40..932e6f2b1 100644 --- a/examples/whisper/whisper_utils.py +++ b/examples/whisper/whisper_utils.py @@ -167,13 +167,13 @@ def log_mel_spectrogram( assert isinstance(audio, np.ndarray), f"Unsupported audio type: {type(audio)}" duration = audio.shape[-1] / SAMPLE_RATE - audio = pad_or_trim(audio, N_SAMPLES) audio = audio.astype(np.float32) audio = torch.from_numpy(audio) if device is not None: audio = audio.to(device) if padding > 0: + # pad to N_SAMPLES audio = F.pad(audio, (0, padding)) window = torch.hann_window(N_FFT).to(audio.device) stft = torch.stft(audio, diff --git a/requirements-windows.txt b/requirements-windows.txt index e3fc56308..c7970a46b 100644 --- a/requirements-windows.txt +++ b/requirements-windows.txt @@ -20,7 +20,7 @@ tensorrt~=10.4.0 tokenizers>=0.14 # Default torch is CPU-only on Windows, so need to specify a torch version with GPU support torch==2.4.0+cu124 -nvidia-modelopt~=0.15.0 +nvidia-modelopt[torch]~=0.17.0 transformers>=4.38.2 
wheel optimum diff --git a/requirements.txt b/requirements.txt index 698662167..ef84cd4da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,10 +18,9 @@ h5py==3.10.0 StrEnum sentencepiece>=0.1.99 tensorrt~=10.4.0 -# https://github.com/pytorch/pytorch/blob/v2.4.0/version.txt uses 2.4.0a0. # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 uses 2.4.0a0. torch>=2.4.0a0,<=2.4.0 -nvidia-modelopt~=0.15.0 +nvidia-modelopt[torch]~=0.17.0 transformers>=4.38.2,<=4.42.4 pillow==10.3.0 wheel diff --git a/setup.py b/setup.py index a3db33e18..6330450f7 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ def has_ext_modules(self): 'trtllm-bench=tensorrt_llm.commands.bench:main', ], }, - scripts=['tensorrt_llm/hlapi/trtllm-hlapi-launch'], + scripts=['tensorrt_llm/llmapi/trtllm-llmapi-launch'], extras_require={ "devel": devel_deps, "benchmarking": [ diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py index 8b2bd324d..56a5ef20e 100644 --- a/tensorrt_llm/__init__.py +++ b/tensorrt_llm/__init__.py @@ -45,7 +45,7 @@ def _add_trt_llm_dll_directory(): from .auto_parallel import AutoParallelConfig, auto_parallel from .builder import BuildConfig, Builder, BuilderConfig, build from .functional import Tensor, constant -from .hlapi.llm import LLM, LlmArgs, SamplingParams +from .llmapi.llm import LLM, LlmArgs, SamplingParams from .logger import logger from .mapping import Mapping from .models.automodel import AutoConfig, AutoModelForCausalLM diff --git a/tensorrt_llm/bench/build/build.py b/tensorrt_llm/bench/build/build.py index 9e7719adc..956a84939 100644 --- a/tensorrt_llm/bench/build/build.py +++ b/tensorrt_llm/bench/build/build.py @@ -13,8 +13,8 @@ from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer from tensorrt_llm.bench.utils import (VALID_QUANT_ALGOS, VALID_COMPUTE_DTYPES) from tensorrt_llm.builder import BuildConfig -from tensorrt_llm.hlapi import LLM -from tensorrt_llm.hlapi.llm_utils import QuantConfig +from tensorrt_llm.llmapi import LLM +from tensorrt_llm.llmapi.llm_utils import QuantConfig from tensorrt_llm.logger import logger from tensorrt_llm.quantization.mode import QuantAlgo @@ -250,7 +250,7 @@ def build_command( f"Quantization:\t\t\t{quantization}\n" "===========================================================\n") - # Build the LLM engine with the HLAPI. + # Build the LLM engine with the LLMAPI. logger.set_level("error") llm = LLM(bench_env.model, tokenizer, @@ -258,7 +258,8 @@ def build_command( tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, build_config=build_config, - quant_config=quant_config) + quant_config=quant_config, + workspace=bench_env.workspace) # Save the engine. 
llm.save(engine_dir) llm._shutdown() diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index fe951e9a6..f53504406 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -26,7 +26,8 @@ import tensorrt as trt from ._common import _is_building, check_max_num_tokens, serialize_engine -from ._utils import np_bfloat16, np_float8, str_dtype_to_trt, to_json_file +from ._utils import (np_bfloat16, np_float8, str_dtype_to_trt, to_json_file, + trt_gte) from .auto_parallel import auto_parallel from .auto_parallel.config import AutoParallelConfig from .bindings import KVCacheType @@ -224,6 +225,13 @@ def create_builder_config(self, if weight_sparsity: config.set_flag(trt.BuilderFlag.SPARSE_WEIGHTS) + # TODO(Junyi): remove this constraint after trt 10.6 is integrated + if trt_gte(10, 6): + # set monitor memory + monitor_memory = kwargs.get("monitor_memory", False) + if monitor_memory: + config.set_flag(trt.BuilderFlag.MONITOR_MEMORY) + return BuilderConfig()._init(config, precision=precision, tensor_parallel=tensor_parallel, @@ -494,6 +502,7 @@ class BuildConfig: use_fused_mlp: bool = False dry_run: bool = False visualize_network: bool = False + monitor_memory: bool = False # Since we have some overlapping between kv_cache_type, paged_kv_cache, and paged_state (later two will be deprecated in the future), # we need to handle it given model architecture. @@ -578,7 +587,7 @@ def from_dict(cls, config, plugin_config=None): config.get('auto_parallel_config', {})) max_encoder_input_len = config.pop('max_encoder_input_len', 1024) weight_streaming = config.pop('weight_streaming', False) - + use_fused_mlp = config.pop('use_fused_mlp', True) use_strip_plan = config.pop('use_strip_plan', False) if plugin_config is None: @@ -588,6 +597,7 @@ def from_dict(cls, config, plugin_config=None): dry_run = config.pop('dry_run', False) visualize_network = config.pop('visualize_network', False) + monitor_memory = config.pop('monitor_memory', False) return cls( max_input_len=max_input_len, @@ -616,9 +626,11 @@ def from_dict(cls, config, plugin_config=None): max_encoder_input_len=max_encoder_input_len, weight_sparsity=weight_sparsity, weight_streaming=weight_streaming, + use_fused_mlp=use_fused_mlp, plugin_config=plugin_config, dry_run=dry_run, - visualize_network=visualize_network) + visualize_network=visualize_network, + monitor_memory=monitor_memory) @classmethod def from_json_file(cls, config_file, plugin_config=None): @@ -682,13 +694,11 @@ def __init__( self, config: EngineConfig, engine: Union[trt.IHostMemory, None], - managed_weights: dict[str, np.ndarray] = None, + managed_weights: dict[str, np.ndarray] = {}, ): self.config = config self.engine = engine self.managed_weights = managed_weights - - def regularize_managed_weights(self): if self.managed_weights is None: self.managed_weights = {} for name, value in self.managed_weights.items(): @@ -1081,6 +1091,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: use_strip_plan=build_config.use_strip_plan, weight_sparsity=build_config.weight_sparsity, weight_streaming=build_config.weight_streaming, + monitor_memory=build_config.monitor_memory, ) network = builder.create_network() @@ -1099,7 +1110,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: network.plugin_config.weight_only_groupwise_quant_matmul_plugin = model.config.dtype else: network.plugin_config.weight_only_quant_matmul_plugin = model.config.dtype - if use_smooth_quant and model.config.quantization.use_plugin_sq: + if use_smooth_quant 
and model.config.quantization.use_plugin_sq and build_config.plugin_config.smooth_quant_plugins: network.plugin_config.set_smooth_quant_plugins(model.config.dtype) if use_fp8_rowwise: network.plugin_config.set_fp8_rowwise_quant_plugins(model.config.dtype) @@ -1154,7 +1165,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: "max_batch_size": build_config.max_batch_size, } - if build_config.speculative_decoding_mode == SpeculativeDecodingMode.LOOKAHEAD_DECODING: + if build_config.speculative_decoding_mode == SpeculativeDecodingMode.LOOKAHEAD_DECODING or build_config.speculative_decoding_mode == SpeculativeDecodingMode.EAGLE: prepare_input_args[ "spec_decoding_is_generation_length_variable"] = True diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index 3a7772ecc..b8652b9b2 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -160,11 +160,6 @@ def parse_arguments(): help= "The profiling verbosity for the generated TensorRT engine. Setting to detailed allows inspecting tactic choices and kernel parameters." ) - parser.add_argument( - '--builder_force_num_profiles', - type=int, - default=None, - help="If specified, force to use the number of profiles.") parser.add_argument( '--strip_plan', default=False, @@ -217,6 +212,10 @@ def parse_arguments(): help= "Run through the build process except the actual Engine build for debugging." ) + parser.add_argument('--monitor_memory', + default=False, + action='store_true', + help="Enable memory monitor during Engine build.") logits_parser = parser.add_argument_group("Logits arguments") logits_parser.add_argument('--logits_dtype', @@ -273,10 +272,8 @@ def parse_arguments(): spec_parser.add_argument('--speculative_decoding_mode', default=None, choices=[ - "draft_tokens_external", - "lookahead_decoding", - "medusa", - "explicit_draft_tokens", + "draft_tokens_external", "lookahead_decoding", + "medusa", "explicit_draft_tokens", "eagle" ], help="Mode of speculative decoding.") spec_parser.add_argument( @@ -334,6 +331,8 @@ def build_model( architecture = model_config.architecture assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \ "StreamingLLM is only supported in the llama model." + assert not build_config.plugin_config.pp_reduce_scatter or architecture == "MixtralForCausalLM", \ + "PP reduce scatter is only supported in the mixtral model." real_rank = rank model_config.mapping.gpus_per_node = build_config.auto_parallel_config.gpus_per_node @@ -517,6 +516,18 @@ def main(): else: cluster_config = infer_cluster_config() + # This should only be used for debugging. + # The env var BUILDER_FORCE_NUM_PROFILES should override the number of + # optimization profiles during TRT build. + # BUILDER_FORCE_NUM_PROFILES must be less than or equal to the number of + # optimization profiles set by model's prepare_inputs(). + force_num_profiles_from_env = os.environ.get( + "BUILDER_FORCE_NUM_PROFILES", None) + if force_num_profiles_from_env is not None: + logger.warning( + f"Overriding # of builder profiles <= {force_num_profiles_from_env}." 
+ ) + build_config = BuildConfig.from_dict( { 'max_input_len': args.max_input_len, @@ -530,7 +541,7 @@ def main(): 'gather_context_logits': args.gather_context_logits, 'gather_generation_logits': args.gather_generation_logits, 'strongly_typed': True, - 'force_num_profiles': args.builder_force_num_profiles, + 'force_num_profiles': force_num_profiles_from_env, 'weight_sparsity': args.weight_sparsity, 'profiling_verbosity': args.profiling_verbosity, 'enable_debug_output': args.enable_debug_output, @@ -556,6 +567,7 @@ def main(): 'visualize_network': args.visualize_network, 'max_encoder_input_len': args.max_encoder_input_len, 'weight_streaming': args.weight_streaming, + 'monitor_memory': args.monitor_memory, }, plugin_config=plugin_config) diff --git a/tensorrt_llm/executor.py b/tensorrt_llm/executor.py index eaafdb5a3..e52014624 100644 --- a/tensorrt_llm/executor.py +++ b/tensorrt_llm/executor.py @@ -1,9 +1,11 @@ import asyncio import atexit import concurrent.futures +import copy import datetime import io import json +import os import secrets import time import traceback @@ -22,11 +24,12 @@ from ._utils import mpi_rank, mpi_world_size from .bindings import executor as tllm from .builder import ConfigEncoder, Engine, EngineConfig -from .hlapi.mpi_session import (MpiPoolSession, MpiSession, - external_mpi_comm_available, find_free_port, - need_spawn_mpi_workers) -from .hlapi.utils import ManagedThread, SamplingParams +from .llmapi.mpi_session import (MpiPoolSession, MpiSession, + external_mpi_comm_available, find_free_port, + need_spawn_mpi_workers) +from .llmapi.utils import ManagedThread, SamplingParams from .lora_manager import LoraManager +from .prompt_adapter_manager import PromptAdapterManager from .runtime import ModelConfig from .runtime.model_runner import _engine_config_to_model_config @@ -46,7 +49,8 @@ class LoRARequest: lora_path: str = "" def __post_init__(self): - assert self.lora_path, "lora_path cannot be empty" + if not os.path.exists(self.lora_path): + raise RuntimeError(f"lora_path ({self.lora_path}) does not exist.") @property def adapter_id(self): @@ -61,6 +65,34 @@ def path(self): return self.lora_path +@dataclass(slots=True) +class PromptAdapterRequest: + """ + Request for a Prompt adapter. + """ + prompt_adapter_name: str + prompt_adapter_id: int + prompt_adapter_local_path: str = "" + + def __post_init__(self): + if not os.path.exists(self.prompt_adapter_local_path): + raise RuntimeError( + f"prompt_adapter_local_path ({self.prompt_adapter_local_path}) does not exist." 
+ ) + + @property + def adapter_id(self): + return self.prompt_adapter_id + + @property + def name(self): + return self.prompt_adapter_name + + @property + def local_path(self): + return self.prompt_adapter_local_path + + class GenerationRequest: def __init__( @@ -68,6 +100,7 @@ def __init__( prompt_token_ids: Union[torch.Tensor, np.ndarray, list], sampling_params: SamplingParams, lora_request: Optional[LoRARequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, streaming: bool = False, ): if isinstance(prompt_token_ids, list): @@ -81,6 +114,7 @@ def __init__( self.sampling_params = sampling_params self.lora_request = lora_request + self.prompt_adapter_request = prompt_adapter_request self.streaming = streaming self.id = -1 @@ -445,6 +479,7 @@ def generate_async( prompt_token_ids: List[int], sampling_params: SamplingParams, lora_request: Optional[LoRARequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, streaming: bool = False, ) -> GenerationResult: """Generate output for the given prompt token ids in the asynchronous mode. @@ -456,6 +491,7 @@ def generate_async( GenerationRequest(prompt_token_ids, sampling_params=sampling_params, lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, streaming=streaming)) return result @@ -464,6 +500,8 @@ def generate( prompt_token_ids: Union[List[int], List[List[int]]], sampling_params: Union[SamplingParams, List[SamplingParams]], lora_request: Optional[Union[LoRARequest, List[LoRARequest]]] = None, + prompt_adapter_request: Optional[Union[ + PromptAdapterRequest, List[PromptAdapterRequest]]] = None, ) -> Union[GenerationResult, List[GenerationResult]]: """Generate output for the given prompt token ids in the synchronous mode. Synchronous generation accepts either single prompt or batched prompts. 
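Usage sketch (reviewer annotation, not part of the patch): the new `PromptAdapterRequest` mirrors `LoRARequest`, and its `__post_init__` now validates that the adapter path exists on disk. A minimal example of constructing one and reading its convenience properties; the adapter name is a placeholder and a throwaway directory stands in for a real checkpoint:

```python
import tempfile

from tensorrt_llm.executor import PromptAdapterRequest

# A temporary directory satisfies the existence check in __post_init__;
# a missing path would raise RuntimeError instead.
with tempfile.TemporaryDirectory() as adapter_dir:
    pa_request = PromptAdapterRequest(
        prompt_adapter_name="demo-adapter",      # placeholder name
        prompt_adapter_id=1,
        prompt_adapter_local_path=adapter_dir,   # stand-in for a real adapter checkpoint
    )
    assert pa_request.adapter_id == 1
    assert pa_request.name == "demo-adapter"
    assert pa_request.local_path == adapter_dir
```

The request is then threaded through `generate()` / `generate_async()` via the new `prompt_adapter_request` argument, following the same pattern as the existing LoRA path.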
@@ -483,9 +521,14 @@ def generate( lora_req = lora_request[i] else: lora_req = lora_request + if isinstance(prompt_adapter_request, list): + pa_req = prompt_adapter_request[i] + else: + pa_req = prompt_adapter_request future = self.generate_async(p, sampling_params=sp, lora_request=lora_req, + prompt_adapter_request=pa_req, streaming=False) futures.append(future) @@ -640,20 +683,19 @@ def __init__( engine = engine[self.rank] if isinstance(engine, Engine): - engine.regularize_managed_weights() self.engine = tllm.Executor(engine.engine, json.dumps(engine.config.to_dict(), cls=ConfigEncoder), tllm.ModelType.DECODER_ONLY, executor_config=executor_config, - managed_weights=engine.managed_weights - or {}) + managed_weights=engine.managed_weights) else: self.engine = tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, executor_config=executor_config) self._lora_manager: Optional[LoraManager] = None + self._prompt_adapter_manager: Optional[PromptAdapterManager] = None self._runtime_model_config: Optional[ModelConfig] = None if self.rank == 0: if isinstance(engine, Engine): @@ -661,10 +703,12 @@ def __init__( else: engine_config = EngineConfig.from_json_file( f"{engine}/config.json") + self._runtime_model_config = _engine_config_to_model_config( + engine_config) if engine_config.build_config.plugin_config.lora_plugin: - self._runtime_model_config = _engine_config_to_model_config( - engine_config) self._lora_manager = LoraManager() + if engine_config.build_config.max_prompt_embedding_table_size > 0: + self._prompt_adapter_manager = PromptAdapterManager() self.await_response_thread = ManagedThread( self.await_response_task, @@ -790,11 +834,18 @@ def start(self): def _load_lora_adapter(self, lora_request: LoRARequest): self._lora_manager.load_from_ckpt( - [lora_request.lora_path], + [lora_request.path], model_config=self._runtime_model_config, runtime_mapping=None, uids=[str(lora_request.adapter_id)]) + def _load_prompt_adapter(self, + prompt_adapter_request: PromptAdapterRequest): + self._prompt_adapter_manager.load_from_ckpt( + [prompt_adapter_request.local_path], + model_config=self._runtime_model_config, + uids=[str(prompt_adapter_request.adapter_id)]) + def _enqueue_request(self, request: GenerationRequest) -> int: if self._lora_manager is not None and request.lora_request is not None: self._load_lora_adapter(request.lora_request) @@ -806,8 +857,21 @@ def _enqueue_request(self, request: GenerationRequest) -> int: else: lora_config = None + prompt_token_ids = copy.deepcopy(request.prompt_token_ids) + if request.prompt_adapter_request is not None: + self._load_prompt_adapter(request.prompt_adapter_request) + uid = str(request.prompt_adapter_request.adapter_id) + prompt_tuning_config = tllm.PromptTuningConfig( + self._prompt_adapter_manager.uid_to_weights[uid]) + vocab_size = self._runtime_model_config.vocab_size + pa_length = prompt_tuning_config.embedding_table.size(0) + prompt_token_ids = list(range( + vocab_size, vocab_size + pa_length)) + prompt_token_ids + else: + prompt_tuning_config = None + executor_request = tllm.Request( - input_token_ids=request.prompt_token_ids, + input_token_ids=prompt_token_ids, max_tokens=request.sampling_params.max_tokens, max_new_tokens=request.sampling_params.max_new_tokens, streaming=request.streaming, @@ -820,8 +884,8 @@ def _enqueue_request(self, request: GenerationRequest) -> int: embedding_bias=request.sampling_params.embedding_bias, external_draft_tokens_config=request.sampling_params. 
external_draft_tokens_config, - prompt_tuning_config=request.sampling_params.prompt_tuning_config, lora_config=lora_config, + prompt_tuning_config=prompt_tuning_config, logits_post_processor_name=request.sampling_params. logits_post_processor_name, ) diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 335f85d7c..bd8303847 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -3920,6 +3920,29 @@ def allgather(tensor: Tensor, group: List[int], gather_dim: int = 0) -> Tensor: return x +def reduce_scatter(tensor: Tensor, group: List[int]) -> Tensor: + + plg_creater = trt.get_plugin_registry().get_plugin_creator( + 'ReduceScatter', '1', TRT_LLM_PLUGIN_NAMESPACE) + assert plg_creater is not None + + p_dtype = default_net().plugin_config.nccl_plugin + pf_type = trt.PluginField( + "type_id", np.array([int(str_dtype_to_trt(p_dtype))], np.int32), + trt.PluginFieldType.INT32) + group = trt.PluginField("group", np.array(group, dtype=np.int32), + trt.PluginFieldType.INT32) + pfc = trt.PluginFieldCollection([group, pf_type]) + + reduce_scatter_plug = plg_creater.create_plugin("reduce_scatter", pfc) + plug_inputs = [tensor.cast(p_dtype).trt_tensor] + + layer = default_trtnet().add_plugin_v2(plug_inputs, reduce_scatter_plug) + _add_plugin_info(layer, plg_creater, "reduce_scatter", pfc) + + return _create_tensor(layer.get_output(0), layer).cast(tensor.dtype) + + def send(tensor: Tensor, tgt: int) -> Tensor: ''' Add an operation that performs a send from a rank to another. diff --git a/tensorrt_llm/layers/attention.py b/tensorrt_llm/layers/attention.py index b7ad79cff..652cac581 100644 --- a/tensorrt_llm/layers/attention.py +++ b/tensorrt_llm/layers/attention.py @@ -323,6 +323,7 @@ def __init__(self, attention_head_size=None, qk_layernorm=False, layernorm_type=LayerNormType.LayerNorm, + layernorm_share=True, inner_layernorm=False, eps=1e-05, attention_mask_type=AttentionMaskType.padding, @@ -474,16 +475,38 @@ def __init__(self, self.rel_attn_table = Parameter(shape=(num_attention_heads // tp_size, num_buckets), dtype=dtype) + + # qk layernorm self.qk_layernorm = qk_layernorm self.layernorm_type = layernorm_type + self.layernorm_share = layernorm_share ln_type = layernorm_map[layernorm_type] if self.qk_layernorm: - self.q_layernorm = ln_type(self.attention_head_size, - eps=eps, - dtype=dtype) - self.k_layernorm = ln_type(self.attention_head_size, - eps=eps, - dtype=dtype) + # layernorm_share indicates whether all the QK head in one layer shares the same norm parameters or not + if layernorm_share: + self.q_layernorm = ln_type(self.attention_head_size, + eps=eps, + dtype=dtype) + self.k_layernorm = ln_type(self.attention_head_size, + eps=eps, + dtype=dtype) + else: + assert ln_type == LayerNorm + self.q_layernorm = ln_type( + (self.num_attention_heads, self.attention_head_size), + eps=eps, + dtype=dtype, + bias=False, + tp_size=tp_size, + tp_dim=0) + self.k_layernorm = ln_type( + (self.num_attention_kv_heads, self.attention_head_size), + eps=eps, + dtype=dtype, + bias=False, + tp_size=tp_size, + tp_dim=0) + self.inner_layernorm = ln_type(self.hidden_size, dtype=dtype, eps=eps) if inner_layernorm else None if clip_qkv is not None: @@ -741,25 +764,38 @@ def forward(self, if self.qk_layernorm: base_shape = shape(qkv, 0) if qkv.ndim() == 2 else concat( [shape(qkv, 0), shape(qkv, 1)]) - # here we assume that q, k and v have the same number of attention heads - # TODO: allow different number of attention heads for q, k and v. 
- qkv = qkv.view( - concat([ - base_shape, self.num_attention_heads, 3, + qkv_sections = [ + self.num_attention_heads, self.num_attention_kv_heads, + self.num_attention_kv_heads + ] + total_heads = sum(qkv_sections) + if self.num_attention_heads != self.num_attention_kv_heads: + qkv = qkv.view( + concat([base_shape, total_heads, self.attention_head_size])) + query, key, value = split(qkv, qkv_sections, dim=qkv.ndim() - 2) + else: + qkv = qkv.view( + concat([ + base_shape, self.num_attention_heads, 3, + self.attention_head_size + ])) + query, key, value = split(qkv, 1, dim=qkv.ndim() - 2) + q_shape = concat([ + base_shape, self.num_attention_heads, self.attention_head_size - ])) - query, key, value = split(qkv, 1, dim=qkv.ndim() - 2) - q_shape = concat([ - base_shape, self.num_attention_heads, self.attention_head_size - ]) - query = query.view(q_shape) - key = key.view(q_shape) - value = value.view(q_shape) - - query = self.q_layernorm(query) - key = self.k_layernorm(key) + ]) + query = query.view(q_shape) + key = key.view(q_shape) + value = value.view(q_shape) + + normalized_shape = None + if not self.layernorm_share: + normalized_shape = self.attention_head_size + query = self.q_layernorm(query, normalized_shape=normalized_shape) + key = self.k_layernorm(key, normalized_shape=normalized_shape) qkv = concat([query, key, value], dim=query.ndim() - 2) - qkv = qkv.view(concat([base_shape, self.attention_hidden_size * 3])) + qkv = qkv.view( + concat([base_shape, total_heads * self.attention_head_size])) if self.position_embedding_type == PositionEmbeddingType.chatglm: qkv = RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm( qkv, diff --git a/tensorrt_llm/layers/linear.py b/tensorrt_llm/layers/linear.py index 2608708ac..b039a0f5e 100644 --- a/tensorrt_llm/layers/linear.py +++ b/tensorrt_llm/layers/linear.py @@ -358,24 +358,23 @@ def postprocess(self, tllm_key, weights, **kwargs): config = kwargs.get("config", None) if self.is_qkv: if isinstance(weights, list): - if hasattr(config, "remove_duplicated_kv_heads"): - if config.remove_duplicated_kv_heads: - head_size = config.hidden_size // config.num_attention_heads if config.head_size is None else config.head_size - k, v = weights[1:] - k = k.reshape([ - k.shape[0] // head_size // 2, 2, head_size, - self.in_features - ]) - v = v.reshape([ - v.shape[0] // head_size // 2, 2, head_size, - self.in_features - ]) - assert (k[:, 0] == k[:, 1]).all() - assert (v[:, 0] == v[:, 1]).all() - k = k[:, 0].reshape([-1, self.in_features]) - v = v[:, 0].reshape([-1, self.in_features]) - weights[1] = k - weights[2] = v + if getattr(config, "remove_duplicated_kv_heads", False): + head_size = config.hidden_size // config.num_attention_heads if config.head_size is None else config.head_size + k, v = weights[1:] + k = k.reshape([ + k.shape[0] // head_size // 2, 2, head_size, + self.in_features + ]) + v = v.reshape([ + v.shape[0] // head_size // 2, 2, head_size, + self.in_features + ]) + assert (k[:, 0] == k[:, 1]).all() + assert (v[:, 0] == v[:, 1]).all() + k = k[:, 0].reshape([-1, self.in_features]) + v = v[:, 0].reshape([-1, self.in_features]) + weights[1] = k + weights[2] = v weights = torch.cat(weights) if using_head_as_leading_dim: # Reorder [n_head, 3, head_dim, ...] into [3, n_head, head_dim, ...] 
diff --git a/tensorrt_llm/layers/moe.py b/tensorrt_llm/layers/moe.py old mode 100644 new mode 100755 index e05ea6de3..5dff02ada --- a/tensorrt_llm/layers/moe.py +++ b/tensorrt_llm/layers/moe.py @@ -29,8 +29,9 @@ from ..functional import (AllReduceFusionParams, _add_plugin_info, _create_tensor, allreduce, cast, concat, constant, div, expand, gather_nd, is_gated_activation, - non_gated_version, nonzero, repeat_interleave, - scatter_nd, shape, softmax, split, sum, topk) + non_gated_version, nonzero, reduce_scatter, + repeat_interleave, scatter_nd, shape, softmax, split, + sum, topk) from ..layers import MLP, GatedMLP from ..mapping import Mapping from ..module import Module, ModuleList @@ -503,7 +504,8 @@ def forward(self, hidden_states, finished=None, lora_layer_params=None, - reduce_fusion_params: Optional[AllReduceFusionParams] = None): + reduce_fusion_params: Optional[AllReduceFusionParams] = None, + last_local_layer_residual=None): moe_router_lora_params = None if lora_layer_params is not None: moe_router_lora_params = lora_layer_params.get_runtime_params( @@ -513,7 +515,8 @@ def forward(self, output = self.forward_experts(hidden_states, routing, finished, lora_layer_params) if self.use_all_reduce: - output = self.forward_allreduce(output, reduce_fusion_params) + output = self.forward_allreduce(output, reduce_fusion_params, + last_local_layer_residual) return output def forward_experts(self, hidden_states, routing, finished, @@ -593,9 +596,25 @@ def forward_experts(self, hidden_states, routing, finished, return output - def forward_allreduce( - self, output, - reduce_fusion_params: Optional[AllReduceFusionParams]): + def forward_allreduce(self, + output, + reduce_fusion_params: Optional[AllReduceFusionParams], + last_local_layer_residual=None): + + if last_local_layer_residual is not None: + if self.mapping.tp_rank == 0: + output = output + last_local_layer_residual + else: + # we need to add this line here to minimize the numerical difference + output = output + 0 + # reshape to (-1) + output = output.view(concat([-1])) + if self.tp_size > 1 and self.tp_group is not None: + output = reduce_scatter(output, self.tp_group) + # reshape to (-1, hidden_size // tp_size) + output = output.view(concat([-1, self.hidden_size // self.tp_size])) + return output + if self.tp_size > 1 and self.tp_group is not None: output = allreduce(output, self.tp_group, @@ -852,6 +871,7 @@ def __init__(self, mapping: Mapping = Mapping(), bias: bool = True, dtype=None, + quant_mode=QuantMode(0), **kwargs): super().__init__() @@ -862,6 +882,7 @@ def __init__(self, self.mapping = mapping self.bias = bias self.dtype = dtype + self.quant_mode = quant_mode self.moe = MOE(hidden_size=self.hidden_size, moe_config=self.moe_config, @@ -871,7 +892,8 @@ def __init__(self, dtype=self.dtype, bias=False, tp_group=self.mapping.tp_group, - tp_size=self.mapping.tp_size) + tp_size=self.mapping.tp_size, + quant_mode=self.quant_mode) ClsMLP = GatedMLP if is_gated_activation(self.hidden_act) else MLP self.shared_experts = ClsMLP( hidden_size=self.hidden_size, @@ -880,7 +902,8 @@ def __init__(self, bias=False, dtype=self.dtype, tp_group=self.mapping.tp_group, - tp_size=self.mapping.tp_size) + tp_size=self.mapping.tp_size, + quant_mode=self.quant_mode) def forward(self, hidden_states): if self.moe_config.num_shared_experts > 0: diff --git a/tensorrt_llm/layers/normalization.py b/tensorrt_llm/layers/normalization.py index 3c8f82913..3d81f69dc 100644 --- a/tensorrt_llm/layers/normalization.py +++ b/tensorrt_llm/layers/normalization.py @@ 
-24,7 +24,9 @@ def __init__(self, eps=1e-05, elementwise_affine=True, bias=True, - dtype=None): + dtype=None, + tp_size=1, + tp_dim=-1): super().__init__() if isinstance(normalized_shape, int): normalized_shape = (normalized_shape, ) @@ -42,11 +44,15 @@ def __init__(self, self.eps = eps self.dtype = dtype + self.tp_size = tp_size + self.tp_dim = tp_dim - def forward(self, x): + def forward(self, x, normalized_shape=None): weight = 1. if self.weight is None else self.weight.value bias = 0. if self.bias is None else self.bias.value - return layer_norm(x, self.normalized_shape, weight, bias, self.eps) + if normalized_shape is None: + normalized_shape = self.normalized_shape + return layer_norm(x, normalized_shape, weight, bias, self.eps) class RmsNorm(Module): diff --git a/tensorrt_llm/hlapi/__init__.py b/tensorrt_llm/llmapi/__init__.py similarity index 100% rename from tensorrt_llm/hlapi/__init__.py rename to tensorrt_llm/llmapi/__init__.py diff --git a/tensorrt_llm/hlapi/_perf_evaluator.py b/tensorrt_llm/llmapi/_perf_evaluator.py similarity index 100% rename from tensorrt_llm/hlapi/_perf_evaluator.py rename to tensorrt_llm/llmapi/_perf_evaluator.py diff --git a/tensorrt_llm/hlapi/build_cache.py b/tensorrt_llm/llmapi/build_cache.py similarity index 98% rename from tensorrt_llm/hlapi/build_cache.py rename to tensorrt_llm/llmapi/build_cache.py index 30fcfffba..9d4a4fe8a 100644 --- a/tensorrt_llm/hlapi/build_cache.py +++ b/tensorrt_llm/llmapi/build_cache.py @@ -20,10 +20,10 @@ def get_build_cache_config_from_env() -> tuple[bool, str]: """ Get the build cache configuration from the environment variables """ - build_cache_enabled = os.environ.get('TLLM_HLAPI_BUILD_CACHE') == '1' + build_cache_enabled = os.environ.get('TLLM_LLMAPI_BUILD_CACHE') == '1' build_cache_root = os.environ.get( - 'TLLM_HLAPI_BUILD_CACHE_ROOT', - '/tmp/.cache/tensorrt_llm/hlapi/') # nosec B108 + 'TLLM_LLMAPI_BUILD_CACHE_ROOT', + '/tmp/.cache/tensorrt_llm/llmapi/') # nosec B108 return build_cache_enabled, build_cache_root diff --git a/tensorrt_llm/hlapi/llm.py b/tensorrt_llm/llmapi/llm.py similarity index 93% rename from tensorrt_llm/hlapi/llm.py rename to tensorrt_llm/llmapi/llm.py index 023e55db2..2744120aa 100644 --- a/tensorrt_llm/hlapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -10,7 +10,8 @@ from .. import bindings as tllm from ..bindings import executor as tllm from ..builder import EngineConfig -from ..executor import GenerationExecutor, GenerationResult, LoRARequest +from ..executor import (GenerationExecutor, GenerationResult, LoRARequest, + PromptAdapterRequest) from ..logger import logger from .llm_utils import (LLMARGS_REMAINING_ARGS_DOCSTRING, CachedModelLoader, LlmArgs, LlmBuildStats, ModelLoader, @@ -85,6 +86,8 @@ class LLM: revision(Optional[str]): The revision of the model. tokenzier_revision(Optional[str]): The revision of the tokenizer. + + workspace(Optional[str]): The directory to store intermediate files. 
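Sketch of the new `workspace` argument (annotation only; the paths below are placeholders). The knob simply chooses where the temporary `*-llm-workspace` directory is created instead of the system default temp location:

```python
from tensorrt_llm.llmapi import LLM

# Both paths are placeholders. Intermediate files now land in a
# TemporaryDirectory(suffix="-llm-workspace", dir=workspace) under the
# given directory rather than under the default tmp root.
llm = LLM(
    "/path/to/hf_model_or_engine",
    workspace="/scratch/trtllm",
)
```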
''' def __init__(self, @@ -97,6 +100,7 @@ def __init__(self, trust_remote_code: bool = False, revision: Optional[str] = None, tokenizer_revision: Optional[str] = None, + workspace: Optional[str] = None, **kwargs: Any): self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor) @@ -139,7 +143,9 @@ def __init__(self, # Due to the Executor can only accept a engine path, we need to save the engine to a directory self._engine_dir: Optional[Path] = None self._executor: Optional[GenerationExecutor] = None - self._workspace = tempfile.TemporaryDirectory("llm-workspace") + + self._workspace = tempfile.TemporaryDirectory( + suffix="-llm-workspace", dir=workspace) self.runtime_context: Optional[_ModelRuntimeContext] = None self.llm_build_stats = LlmBuildStats() @@ -165,6 +171,8 @@ def generate( use_tqdm: bool = True, lora_request: Optional[Union[LoRARequest, Sequence[LoRARequest]]] = None, + prompt_adapter_request: Optional[Union[ + PromptAdapterRequest, Sequence[PromptAdapterRequest]]] = None, ) -> Union[RequestOutput, List[RequestOutput]]: ''' Generate output for the given prompts in the synchronous mode. Synchronous generation accepts either single prompt or batched prompts. @@ -176,6 +184,7 @@ def generate( generation, a default one will be used if not provided. use_tqdm (bool): Whether to use tqdm to display the progress bar. lora_request (Optional[Union[LoRARequest, Sequence[LoRARequest]]]): LoRA request to use for generation, if any. + prompt_adapter_request (Optional[Union[PromptAdapterRequest, Sequence[PromptAdapterRequest]]]): Prompt Adapter request to use for generation, if any. Returns: Union[RequestOutput, List[RequestOutput]]: The output data of the completion request to the LLM. @@ -198,9 +207,14 @@ def generate( lora_req = lora_request[i] else: lora_req = lora_request + if isinstance(prompt_adapter_request, list): + pa_req = prompt_adapter_request[i] + else: + pa_req = prompt_adapter_request future = self.generate_async(request_inputs, sampling_params=sp, lora_request=lora_req, + prompt_adapter_request=pa_req, streaming=False) futures.append(future) @@ -220,6 +234,7 @@ def generate_async( inputs: PromptInputs, sampling_params: Optional[SamplingParams] = None, lora_request: Optional[LoRARequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, streaming: bool = False, ) -> RequestOutput: ''' Generate output for the given prompt in the asynchronous mode. @@ -230,6 +245,7 @@ def generate_async( sampling_params (Optional[SamplingParams]): The sampling params for the generation, a default one will be used if not provided. lora_request (Optional[LoRARequest]): LoRA request to use for generation, if any. + prompt_adapter_request (Optional[PromptAdapterRequest]): Prompt Adapter request to use for generation, if any. streaming (bool): Whether to use the streaming mode for the generation. 
Returns: @@ -254,6 +270,7 @@ def generate_async( prompt_token_ids, sampling_params=sampling_params, lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, streaming=streaming, ) return RequestOutput(result, prompt, self.tokenizer) diff --git a/tensorrt_llm/hlapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py similarity index 97% rename from tensorrt_llm/hlapi/llm_utils.py rename to tensorrt_llm/llmapi/llm_utils.py index a19ac6e0a..d425fc624 100644 --- a/tensorrt_llm/hlapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -33,7 +33,6 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union -import tensorrt as trt import torch from tqdm import tqdm from transformers import PreTrainedTokenizerBase @@ -232,6 +231,10 @@ def from_module(cls, module: Module): max_cpu_loras (int, default=4): Maximum number of LoRA adapters to be stored in CPU memory. + enable_prompt_adapter (bool, default=False): Enable prompt adapters. + + max_prompt_adapter_token (int, default=0): Maximum number of prompt adapter tokens. + build_config (BuildConfig, default=BuildConfig()): The build configuration for the model. Default is an empty BuildConfig instance. @@ -310,9 +313,16 @@ class LlmArgs: max_cpu_loras: int = 4 + # Prompt adapter arguments + enable_prompt_adapter: bool = False + + max_prompt_adapter_token: int = 0 + # BuildConfig is introduced to give users a familiar interface to configure the model building. build_config: Optional[BuildConfig] = None + fast_build: Optional[bool] = False + quant_config: QuantConfig = field(default_factory=QuantConfig) calib_config: CalibConfig = field(default_factory=CalibConfig) @@ -356,7 +366,7 @@ def __post_init__(self): # The underlying implementation might disable it if it is not supported. self.enable_chunked_context: bool = False # TODO[chunweiy]: Enable this option in the future - # Currently we want HLAPI to be consistent with the lower APIs in the model building, thus disable this to avoid + # Currently we want LLMAPI to be consistent with the lower APIs in the model building, thus disable this to avoid # magics. self.perform_config_arbitration = False @@ -371,8 +381,6 @@ def __post_init__(self): if self.dtype == 'bfloat16': raise RuntimeError("Pre SM 80 GPUs do not support bfloat16") - self._engine_config: Optional[EngineConfig] = None - self.auto_parallel_config = AutoParallelConfig( sharded_io_allowlist=[ "past_key_value_\\d+", @@ -462,11 +470,19 @@ def setup(self): self.build_config = self.build_config or BuildConfig() + # TODO(xiweny): remove the checker when manage weights support all data types + if self.fast_build and (self.quant_config.quant_algo is QuantAlgo.FP8 + or self.quant_config.quant_algo is None): + self._update_plugin_config("manage_weights", True) + if self.enable_lora: self.build_config.plugin_config.lora_plugin = 'auto' if self.max_lora_rank is not None: self.build_config.lora_config.max_lora_rank = self.max_lora_rank + if self.enable_prompt_adapter: + self.build_config.max_prompt_embedding_table_size = self.max_prompt_adapter_token * self.build_config.max_batch_size + if self.perform_config_arbitration: self._perform_config_arbitration() @@ -813,24 +829,17 @@ class _ModelRuntimeContext: ''' _ModelRuntimeContext holds the minimum runtime resources for running a model. It could be a runtime cache in MPI nodes. 
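To make the new `LlmArgs` knobs concrete, a hedged sketch follows (assuming, as elsewhere in the LLMAPI, that extra keyword arguments to `LLM(...)` are forwarded to `LlmArgs`). Note how `setup()` derives the prompt-embedding table size from `max_prompt_adapter_token * max_batch_size`:

```python
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM

build_config = BuildConfig(max_batch_size=8)

llm = LLM(
    "/path/to/hf_model",            # placeholder model path
    build_config=build_config,
    enable_prompt_adapter=True,
    max_prompt_adapter_token=64,    # setup(): 64 * 8 = 512 == max_prompt_embedding_table_size
    fast_build=True,                # currently only flips on manage_weights for FP8 or unquantized models
)
```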
''' - engine_buffer: Optional[trt.IHostMemory] = None - # engine_config is only used for saving the engine to disk - engine_config: Optional[Union[dict, EngineConfig]] = None + engine: Optional[Engine] = None mapping: Optional[Mapping] = None model_info: Optional[_ModelInfo] = None # This is only used when build-cache is enabled engine_path: Optional[str] = None - @property - def engine(self) -> trt.IHostMemory: - assert self.engine_buffer is not None - return self.engine_buffer - @property def model_arch(self) -> str: # "LlaMACausalForLM" or "OPTForCausalLM" and so on - return self.engine_config.pretrained_config['architecture'] + return self.engine.config.pretrained_config['architecture'] class ModelLoader: @@ -969,6 +978,8 @@ def step_forward(self): # execute the step start_time = time.time() self.step_handlers[self.counter]() + # release resource after each step + release_gc() if self.progress_bar: self.progress_bar.update(1) @@ -999,16 +1010,10 @@ def __call__(self, engine_dir: Optional[Path] = None) -> Path: ) pipeline() - if not hasattr(self, '_engine_config'): - raise RuntimeError("config is not loaded.") - - config = self._engine_config - assert engine_dir runtime_context = _ModelRuntimeContext( - engine_buffer=self._engine_buffer, - engine_config=config, + engine=self._engine, mapping=self.mapping, model_info=self._model_info, ) @@ -1061,8 +1066,7 @@ def copy_hf_tokenizer_data_to_engine_dir(): else: shutil.copy2(src, dst) - engine = Engine(config=model.engine_config, engine=model.engine) - engine.save(engine_dir) + model.engine.save(engine_dir) if rank == 0: copy_hf_tokenizer_data_to_engine_dir() @@ -1201,10 +1205,7 @@ def _build_engine(self): self.model.config.mapping.rank = self.rank assert self.model is not None, "model is loaded yet." - engine = build(self.model, copied_build_config) - - self._engine_buffer = engine.engine - self._engine_config = engine.config + self._engine = build(self.model, copied_build_config) self.mapping = self.model.config.mapping # delete the model explicitly to free all the build-time resources @@ -1225,9 +1226,7 @@ def _save_engine_for_runtime(self): def _load_engine_buffer(self): # Load engine buffer from disk - engine = Engine.from_dir(self._model_dir) - self._engine_buffer = engine.engine - self._engine_config = engine.config + self._engine = Engine.from_dir(self._model_dir) @staticmethod def load_extra_build_configs_from_engine( diff --git a/tensorrt_llm/hlapi/mgmn_worker_node.py b/tensorrt_llm/llmapi/mgmn_worker_node.py similarity index 100% rename from tensorrt_llm/hlapi/mgmn_worker_node.py rename to tensorrt_llm/llmapi/mgmn_worker_node.py diff --git a/tensorrt_llm/hlapi/mpi_session.py b/tensorrt_llm/llmapi/mpi_session.py similarity index 100% rename from tensorrt_llm/hlapi/mpi_session.py rename to tensorrt_llm/llmapi/mpi_session.py diff --git a/tensorrt_llm/hlapi/openai_protocol.py b/tensorrt_llm/llmapi/openai_protocol.py similarity index 99% rename from tensorrt_llm/hlapi/openai_protocol.py rename to tensorrt_llm/llmapi/openai_protocol.py index 21d6e1f41..ca41b65fc 100644 --- a/tensorrt_llm/hlapi/openai_protocol.py +++ b/tensorrt_llm/llmapi/openai_protocol.py @@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated, Required, TypedDict -from tensorrt_llm.hlapi import SamplingParams +from tensorrt_llm.llmapi import SamplingParams class OpenAIBaseModel(BaseModel): diff --git a/tensorrt_llm/hlapi/tokenizer.py b/tensorrt_llm/llmapi/tokenizer.py similarity index 100% rename from 
tensorrt_llm/hlapi/tokenizer.py rename to tensorrt_llm/llmapi/tokenizer.py diff --git a/tensorrt_llm/hlapi/trtllm-hlapi-launch b/tensorrt_llm/llmapi/trtllm-llmapi-launch similarity index 89% rename from tensorrt_llm/hlapi/trtllm-hlapi-launch rename to tensorrt_llm/llmapi/trtllm-llmapi-launch index 71048c196..f71d0a0fb 100755 --- a/tensorrt_llm/hlapi/trtllm-hlapi-launch +++ b/tensorrt_llm/llmapi/trtllm-llmapi-launch @@ -12,5 +12,5 @@ if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then $task_with_command else echo "${mpi_rank} launch worker ..." >> /dev/stderr - python3 -m tensorrt_llm.hlapi.mgmn_worker_node + python3 -m tensorrt_llm.llmapi.mgmn_worker_node fi diff --git a/tensorrt_llm/hlapi/utils.py b/tensorrt_llm/llmapi/utils.py similarity index 98% rename from tensorrt_llm/hlapi/utils.py rename to tensorrt_llm/llmapi/utils.py index 21776591b..09ae9b056 100644 --- a/tensorrt_llm/hlapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -52,7 +52,6 @@ class SamplingParams: include_stop_str_in_output (bool): Whether to include the stop strings in output text. Defaults to False. embedding_bias (torch.Tensor): The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. external_draft_tokens_config (ExternalDraftTokensConfig): The speculative decoding configuration. - prompt_tuning_config (PromptTuningConfig): The prompt tuning configuration. logits_post_processor_name (str): The logits postprocessor name. Must correspond to one of the logits postprocessor name provided to the ExecutorConfig. beam_width (int): The beam width. Default is 1 which disables beam search. @@ -82,7 +81,7 @@ class SamplingParams: add_special_tokens (bool): Whether to add special tokens to the prompt. """ - # [TO DEVELOPER] This class provides an interface to HLAPI users. + # [TO DEVELOPER] This class provides an interface to LLMAPI users. 
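For completeness, the renamed build-cache environment variables can be exercised as below (annotation only; this assumes `get_build_cache_config_from_env` is importable at module level, as the hunk context suggests, and the values mirror the defaults shown above):

```python
import os

# The old TLLM_HLAPI_* names are no longer consulted after the rename.
os.environ["TLLM_LLMAPI_BUILD_CACHE"] = "1"
os.environ["TLLM_LLMAPI_BUILD_CACHE_ROOT"] = "/tmp/.cache/tensorrt_llm/llmapi/"

from tensorrt_llm.llmapi.build_cache import get_build_cache_config_from_env

enabled, cache_root = get_build_cache_config_from_env()
assert enabled is True
assert cache_root == "/tmp/.cache/tensorrt_llm/llmapi/"
```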
# Internally, it manages and dispatches fields to Python bindings of C++ objects, currently including: # (1) all fields of tllme.SamplingConfig; # (2) all fields of tllme.OutputConfig; @@ -111,7 +110,6 @@ class SamplingParams: embedding_bias: Optional[torch.Tensor] = None external_draft_tokens_config: Optional[ tllme.ExternalDraftTokensConfig] = None - prompt_tuning_config: Optional[tllme.PromptTuningConfig] = None logits_post_processor_name: Optional[str] = None # Keep the below fields in sync with tllme.SamplingConfig diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index 7f3a66df9..a32635ce0 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -20,10 +20,12 @@ from .chatglm.model import ChatGLMForCausalLM, ChatGLMModel from .cogvlm.config import CogVLMConfig from .cogvlm.model import CogVLMForCausalLM +from .commandr.model import CohereForCausalLM from .dbrx.config import DbrxConfig from .dbrx.model import DbrxForCausalLM from .deepseek_v1.model import DeepseekForCausalLM from .dit.model import DiT +from .eagle.model import EagleForCausalLM from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder from .falcon.config import FalconConfig from .falcon.model import FalconForCausalLM, FalconModel @@ -97,6 +99,7 @@ 'PretrainedModel', 'WhisperEncoder', 'MambaForCausalLM', + 'MambaConfig', 'MPTForCausalLM', 'MPTModel', 'SkyworkForCausalLM', @@ -107,7 +110,9 @@ 'RecurrentGemmaForCausalLM', 'CogVLMConfig', 'CogVLMForCausalLM', + 'EagleForCausalLM', 'SpeculativeDecodingMode', + 'CohereForCausalLM', ] MODEL_MAP = { @@ -163,4 +168,6 @@ 'DiT': DiT, 'DeepseekForCausalLM': DeepseekForCausalLM, 'DeciLMForCausalLM': DeciLMForCausalLM, + 'EagleForCausalLM': EagleForCausalLM, + 'CohereForCausalLM': CohereForCausalLM, } diff --git a/tensorrt_llm/models/automodel.py b/tensorrt_llm/models/automodel.py index a65781a88..9e382d9df 100644 --- a/tensorrt_llm/models/automodel.py +++ b/tensorrt_llm/models/automodel.py @@ -17,7 +17,14 @@ def from_hugging_face(hf_model_or_dir, hf_config = transformers.AutoConfig.from_pretrained( hf_model_or_dir, trust_remote_code=True) - hf_arch = hf_config.architectures[0] + + if hasattr(hf_config, + 'architectures') and hf_config.architectures is not None: + hf_arch = hf_config.architectures[0] + elif hasattr(hf_config, + 'model_type') and hf_config.model_type.find('mamba') != -1: + hf_arch = 'MambaForCausalLM' + trtllm_model_cls = MODEL_MAP.get(hf_arch, None) if trtllm_model_cls is None: raise NotImplementedError( @@ -47,7 +54,14 @@ def get_trtllm_model_class(hf_model_or_dir, trust_remote_code=False): hf_config = transformers.AutoConfig.from_pretrained( hf_model_or_dir, trust_remote_code=trust_remote_code) - hf_arch = hf_config.architectures[0] + + if hasattr(hf_config, + 'architectures') and hf_config.architectures is not None: + hf_arch = hf_config.architectures[0] + elif hasattr(hf_config, + 'model_type') and hf_config.model_type.find('mamba') != -1: + hf_arch = 'MambaForCausalLM' + trtllm_model_cls = MODEL_MAP.get(hf_arch, None) if trtllm_model_cls is None: diff --git a/tensorrt_llm/models/commandr/__init__.py b/tensorrt_llm/models/commandr/__init__.py new file mode 100644 index 000000000..71bf6d298 --- /dev/null +++ b/tensorrt_llm/models/commandr/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/models/commandr/config.py b/tensorrt_llm/models/commandr/config.py new file mode 100644 index 000000000..d82ac9c9e --- /dev/null +++ b/tensorrt_llm/models/commandr/config.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Union + +import torch +import transformers + +from ..._utils import torch_dtype_to_str +from ...mapping import Mapping +from ..modeling_utils import PretrainedConfig, QuantConfig + + +class CohereConfig(PretrainedConfig): + + def __init__(self, + *, + output_multiplier_scale: float = 0.0625, + rotary_base: float = 10000.0, + attn_bias: bool = False, + **kwargs): + self.output_multiplier_scale = output_multiplier_scale + self.rotary_base = rotary_base + self.attn_bias = attn_bias + super().__init__(**kwargs) + + def to_dict(self): + output = super().to_dict() + # Serialize the fields added in CohereConfig + output['output_multiplier_scale'] = self.output_multiplier_scale + output['rotary_base'] = self.rotary_base + output['attn_bias'] = self.attn_bias + return output + + @classmethod + def from_hugging_face( + cls, + hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'], + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + quant_config: Optional[QuantConfig] = None, + **kwargs): + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_or_dir, trust_remote_code=True) + + head_size = hf_config.hidden_size // hf_config.num_attention_heads + + if dtype == 'auto': + dtype = getattr(hf_config, 'torch_dtype', None) + if dtype is None: + dtype = 'float16' + if isinstance(dtype, torch.dtype): + dtype = torch_dtype_to_str(dtype) + if dtype == 'float32': + dtype = 'float16' + + if hf_config.tie_word_embeddings: + kwargs['share_embedding_table'] = True + kwargs['use_parallel_embedding'] = True + kwargs['embedding_sharding_dim'] = 0 + + return CohereConfig( + architecture=hf_config.architectures[0], + dtype=dtype, + num_hidden_layers=hf_config.num_hidden_layers, + num_attention_heads=hf_config.num_attention_heads, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + num_key_value_heads=hf_config.num_key_value_heads, + 
head_size=head_size, + vocab_size=hf_config.vocab_size, + position_embedding_type='rope_gptj', # different rope type + max_position_embeddings=hf_config.max_position_embeddings, + hidden_act=hf_config.hidden_act, + norm_epsilon=hf_config.layer_norm_eps, + output_multiplier_scale=hf_config.logit_scale, + rotary_base=hf_config.rope_theta, + attn_bias=hf_config.attention_bias, + qk_layernorm=hf_config.use_qk_norm, + mapping=mapping, + quantization=quant_config, + **kwargs) diff --git a/tensorrt_llm/models/commandr/model.py b/tensorrt_llm/models/commandr/model.py new file mode 100644 index 000000000..283af0385 --- /dev/null +++ b/tensorrt_llm/models/commandr/model.py @@ -0,0 +1,196 @@ +from typing import Optional + +from ..._common import default_net +from ..._utils import pad_vocab_size +from ...functional import recv, send +from ...layers import (Attention, AttentionMaskType, ColumnLinear, Embedding, + GatedMLP, LayerNorm, PositionEmbeddingType) +from ...mapping import Mapping +from ...module import Module +from ..model_weights_loader import ModelWeightsLoader +from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, + QuantConfig) +from .config import CohereConfig + + +class CohereDecoderLayer(Module): + + def __init__(self, config: CohereConfig, layer_idx: int): + super().__init__() + self.layer_idx = layer_idx + self.config = config + + self.input_layernorm = LayerNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + bias=False, + dtype=config.dtype) + + layers_range = config.mapping.pp_layers(config.num_hidden_layers) + self.local_layer_idx = layer_idx - layers_range[0] + self.attention = Attention( + local_layer_idx=self.local_layer_idx, + hidden_size=config.hidden_size, + attention_head_size=config.head_size, + num_attention_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + max_position_embeddings=config.max_position_embeddings, + dtype=config.dtype, + attention_mask_type=AttentionMaskType.causal, + bias=config.attn_bias, + position_embedding_type=PositionEmbeddingType.rope_gptj, + rotary_embedding_base=config.rotary_base, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + tp_rank=config.mapping.tp_rank, + qk_layernorm=config.qk_layernorm, + layernorm_share=False, + eps=config.norm_epsilon, + quant_mode=config.quant_mode) + + self.mlp = GatedMLP(hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + hidden_act=config.hidden_act, + dtype=config.dtype, + bias=False, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + quant_mode=config.quant_mode) + + def forward(self, + hidden_states, + attention_mask=None, + use_cache=False, + spec_decoding_params=None, + kv_cache_params=None, + attention_params=None): + assert not ( + default_net().plugin_config.reduce_fusion + ), "Custom all reduce and residual mlp can't be enabled at the same time." 
+ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + attention_output = self.attention( + hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + spec_decoding_params=spec_decoding_params, + kv_cache_params=kv_cache_params, + attention_params=attention_params) + + if use_cache: + attention_output, presents = attention_output + + mlp_output = self.mlp(hidden_states) + hidden_states = residual + attention_output + mlp_output + + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class CohereModel(Module): + + def __init__(self, config: CohereConfig) -> None: + super().__init__() + + self.mapping = config.mapping + if self.mapping.is_first_pp_rank(): + self.vocab_embedding = Embedding(config.vocab_size, + config.hidden_size, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + tp_rank=config.mapping.tp_rank) + + self.layers = DecoderLayerList(CohereDecoderLayer, config) + + if self.mapping.is_last_pp_rank(): + self.ln_f = LayerNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + bias=False, + dtype=config.dtype) + + def forward( + self, + input_ids=None, + position_ids=None, + use_cache=False, + attention_mask=None, + spec_decoding_params=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + ): + if self.mapping.is_first_pp_rank(): + hidden_states = self.vocab_embedding(input_ids) + else: + hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) + + hidden_states = self.layers.forward( + hidden_states, + use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + spec_decoding_params=spec_decoding_params) + + if use_cache: + hidden_states, presents = hidden_states + + if self.mapping.is_last_pp_rank(): + hidden_states = self.ln_f(hidden_states) + else: + hidden_states = send(hidden_states, self.mapping.next_pp_rank()) + + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class CohereForCausalLM(DecoderModelForCausalLM): + config_class = CohereConfig + + def __init__(self, config: CohereConfig): + transformer = CohereModel(config) + vocab_size_padded = pad_vocab_size(config.vocab_size, + config.mapping.tp_size) + if config.mapping.is_last_pp_rank(): + lm_head = ColumnLinear(config.hidden_size, + vocab_size_padded, + bias=False, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + gather_output=True) + else: + lm_head = None + self.quant_mode = config.quant_mode + self.mapping = config.mapping + super().__init__(config, transformer, lm_head) + + @classmethod + def from_hugging_face(cls, + hf_model_or_dir: str, + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + quant_config: Optional[QuantConfig] = None, + **kwargs): + ''' Create a CohereForCausalLM object from give parameters + ''' + + config = CohereConfig.from_hugging_face(hf_model_or_dir, + dtype=dtype, + mapping=mapping, + quant_config=quant_config, + **kwargs) + model = cls(config) + custom_dict = { + 'q_layernorm': 'q_norm', + 'k_layernorm': 'k_norm', + } + loader = ModelWeightsLoader(hf_model_or_dir, custom_dict) + loader.check_share_embedding(config) + loader.generate_tllm_weights(model) + + return model diff --git a/tensorrt_llm/models/deepseek_v1/convert.py b/tensorrt_llm/models/deepseek_v1/convert.py old mode 100644 new mode 100755 index 0e7edb796..8f54d5496 --- a/tensorrt_llm/models/deepseek_v1/convert.py +++ 
b/tensorrt_llm/models/deepseek_v1/convert.py @@ -67,11 +67,13 @@ def create_trt_config_from_hf(model_dir, 'rotary_base': rotary_base, 'norm_epsilon': rms_norm_eps, 'rotary_scaling': rotary_scaling, - 'moe_num_experts': moe_num_experts, - 'moe_top_k': moe_top_k, - 'moe_renorm_mode': moe_renorm_mode, - 'moe_num_shared_experts': moe_num_shared_experts, - 'moe_inter_size': moe_inter_size, + 'moe': { + 'num_experts': moe_num_experts, + 'top_k': moe_top_k, + 'normalization_mode': moe_renorm_mode, + 'num_shared_experts': moe_num_shared_experts, + 'moe_intermediate_size': moe_inter_size, + }, 'mapping': { 'world_size': mapping.tp_size * mapping.pp_size, 'tp_size': mapping.tp_size, @@ -82,11 +84,7 @@ def create_trt_config_from_hf(model_dir, } config.update(override_fields) - moe_config = MoeConfig(num_experts=config['moe_num_experts'], - moe_intermediate_size=config['moe_inter_size'], - num_shared_experts=config['moe_num_shared_experts'], - top_k=config['moe_top_k'], - normalization_mode=config['moe_renorm_mode']) + moe_config = MoeConfig.from_dict(config['moe']) moe_config.validate() return config @@ -151,11 +149,7 @@ def convert_deepseek(hf_model, mapping.tp_size model_params = dict(hf_model.named_parameters()) dtype = getattr(torch, dtype) - moe_config = MoeConfig(num_experts=config['moe_num_experts'], - moe_intermediate_size=config['moe_inter_size'], - num_shared_experts=config['moe_num_shared_experts'], - top_k=config['moe_top_k'], - normalization_mode=config['moe_renorm_mode']) + moe_config = MoeConfig.from_dict(config['moe']) layers_range = mapping.pp_layers(config['num_hidden_layers']) diff --git a/tensorrt_llm/models/deepseek_v1/model.py b/tensorrt_llm/models/deepseek_v1/model.py old mode 100644 new mode 100755 index ff6dcc18d..ea66f9879 --- a/tensorrt_llm/models/deepseek_v1/model.py +++ b/tensorrt_llm/models/deepseek_v1/model.py @@ -60,18 +60,14 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): rotary_embedding_scaling=config.rotary_scaling, tp_group=config.mapping.tp_group, tp_size=config.mapping.tp_size, - tp_rank=config.mapping.tp_rank) + tp_rank=config.mapping.tp_rank, + quant_mode=config.quant_mode) ClsMLP = GatedMLP - - moe_config = MoeConfig(num_experts=config.moe_num_experts, - moe_intermediate_size=config.moe_inter_size, - num_shared_experts=config.moe_num_shared_experts, - top_k=config.moe_top_k, - normalization_mode=config.moe_renorm_mode) + moe_config = MoeConfig.from_dict(config.moe) mlp_kwargs = {} - if config.moe_num_experts > 0 and layer_idx > 0: + if moe_config.num_experts > 0 and layer_idx > 0: mlp_hidden_size = moe_config.num_shared_experts * moe_config.moe_intermediate_size hidden_act = config.hidden_act ClsMLP = SharedMoE @@ -89,6 +85,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): bias=False, tp_group=config.mapping.tp_group, tp_size=config.mapping.tp_size, + quant_mode=config.quant_mode, **mlp_kwargs) ### Pose layernorm in Deepseek v1 is same as Llama ) diff --git a/tensorrt_llm/models/eagle/__init__.py b/tensorrt_llm/models/eagle/__init__.py new file mode 100644 index 000000000..a08b2c204 --- /dev/null +++ b/tensorrt_llm/models/eagle/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/models/eagle/config.py b/tensorrt_llm/models/eagle/config.py new file mode 100644 index 000000000..1092b92a2 --- /dev/null +++ b/tensorrt_llm/models/eagle/config.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..llama.config import LLaMAConfig + + +class EagleConfig(LLaMAConfig): + + def __init__(self, + *, + num_eagle_layers: int = 1, + max_draft_len: int = 63, + **kwargs): + self.num_eagle_layers = num_eagle_layers + self.max_draft_len = max_draft_len + self.eagle_net_config = LLaMAConfig.from_dict( + kwargs["eagle_net_config"]) + del kwargs["eagle_net_config"] + super().__init__(**kwargs) + + def to_dict(self): + output = super().to_dict() + # Serialize the fields added in EagleConfig + output['num_eagle_layers'] = self.num_eagle_layers + output['max_draft_len'] = self.max_draft_len + output['eagle_net_config'] = self.eagle_net_config.to_dict() + return output diff --git a/tensorrt_llm/models/eagle/model.py b/tensorrt_llm/models/eagle/model.py new file mode 100644 index 000000000..28cc21c43 --- /dev/null +++ b/tensorrt_llm/models/eagle/model.py @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import OrderedDict + +import numpy as np +import tensorrt as trt + +from tensorrt_llm.models.generation_mixin import GenerationMixin +from tensorrt_llm.models.llama.model import LLaMAForCausalLM, LLaMAModel + +from ..._common import default_net, default_trtnet +from ..._utils import pad_vocab_size +from ...bindings import KVCacheType +from ...functional import Tensor, _create_tensor, concat, index_select, shape +from ...layers import (AttentionParams, ColumnLinear, KeyValueCacheParams, + SpecDecodingParams) +from ...module import Module, ModuleList +from ...plugin import TRT_LLM_PLUGIN_NAMESPACE +from .config import EagleConfig + + +class TreeParams(object): + + def __init__(self, paths: Tensor = None): + self.paths = paths + + +def eagle_sample_and_accept_draft_plugin(lm_logits: Tensor = None, + draft_tokens: Tensor = None, + draft_lens: Tensor = None, + eagle_temperature: Tensor = None, + rand_data_validation: Tensor = None, + tree_params: TreeParams = None, + greedy_sampling: bool = True): + # TODO + ''' + Parameters: + lm_logits : Tensor (On GPU) + + draft_tokens: Tensor + + draft_lens: Tensor + + eagle_temperature: Tensor + + rand_data_validation: Tensor + + tree_params : TreeParams + + greedy_sampling : bool + + Return: + accepted_tokens, num_accepted_tokens, accepted_paths, last_accepted_tokens, + cum_sum_last_accepted_indices, next_draft_tokens, next_draft_lens + + ''' + + plg_creator = trt.get_plugin_registry().get_plugin_creator( + 'EagleSampleAndAcceptDraftTokens', '1', TRT_LLM_PLUGIN_NAMESPACE) + assert plg_creator is not None + + pf_type = trt.PluginField("type_id", + np.array([int(lm_logits.dtype)], np.int32), + trt.PluginFieldType.INT32) + + greedy_sampling = 1 if greedy_sampling else 0 + greedy_sampling = trt.PluginField("greedy_sampling", + np.array(greedy_sampling, dtype=np.int32), + trt.PluginFieldType.INT32) + + pfc = trt.PluginFieldCollection([pf_type, greedy_sampling]) + plugin = plg_creator.create_plugin("eagle_sample_and_accept_draft_plugin", + pfc) + + plug_inputs = [ + lm_logits, draft_tokens, draft_lens, eagle_temperature, + rand_data_validation, tree_params.paths + ] + + plug_inputs = [i.trt_tensor for i in plug_inputs] + layer = default_trtnet().add_plugin_v2(plug_inputs, plugin) + + accepted_tokens = _create_tensor(layer.get_output(0), layer) + num_accepted_tokens = _create_tensor(layer.get_output(1), layer) + accepted_paths = _create_tensor(layer.get_output(2), layer) + last_accepted_tokens = _create_tensor(layer.get_output(3), layer) + cum_sum_last_accepted_indices = _create_tensor(layer.get_output(4), layer) + next_draft_tokens = _create_tensor(layer.get_output(5), layer) + next_draft_lens = _create_tensor(layer.get_output(6), layer) + return tuple([ + accepted_tokens, num_accepted_tokens, accepted_paths, + last_accepted_tokens, cum_sum_last_accepted_indices, next_draft_tokens, + next_draft_lens + ]) + + +def eagle_draft_decoder_plugin(layer_idx: int, logits: Tensor, + next_draft_tokens: Tensor, + next_draft_lens: Tensor, + rand_data_sample: Tensor, + tree_params: TreeParams): + # TODO + ''' + Parameters: + layer_idx : int + + logits : Tensor + + next_draft_tokens : Tensor + + next_draft_lens : Tensor + + rand_data_sample : Tensor + + tree_params : TreeParams + + Return: + output_next_draft_tokens, output_next_draft_lens + + ''' + + plg_creator = trt.get_plugin_registry().get_plugin_creator( + 'EagleDecodeDraftTokens', '1', TRT_LLM_PLUGIN_NAMESPACE) + assert plg_creator is not None + + pf_type = trt.PluginField("type_id", 
np.array([int(logits.dtype)], + np.int32), + trt.PluginFieldType.INT32) + + layer_idx = trt.PluginField("layer_idx", np.array(layer_idx, + dtype=np.int32), + trt.PluginFieldType.INT32) + + pfc = trt.PluginFieldCollection([pf_type, layer_idx]) + plugin = plg_creator.create_plugin("eagle_draft_decoder_plugin", pfc) + + plug_inputs = [ + logits, next_draft_tokens, next_draft_lens, rand_data_sample, + tree_params.paths + ] + + plug_inputs = [i.trt_tensor for i in plug_inputs] + layer = default_trtnet().add_plugin_v2(plug_inputs, plugin) + + output_next_draft_tokens = _create_tensor(layer.get_output(0), layer) + output_next_draft_lens = _create_tensor(layer.get_output(1), layer) + return tuple([output_next_draft_tokens, output_next_draft_lens]) + + +def eagle_prepare_drafter_inputs_plugin(layer_idx: int, + attention_params: AttentionParams, + kv_cache_params: KeyValueCacheParams, + tree_params: TreeParams, + hidden_states: Tensor): + # TODO + ''' + Parameters: + layer_idx : int + + attention_params : AttentionParams + + kv_cache_params : KeyValueCacheParams + + tree_params : TreeParams + + hidden_states: Tensor + + Return: + sequence_length, host_request_types, host_past_key_value_lengths, + past_key_value_length, spec_decoding_generation_lengths, spec_decoding_position_offsets, + spec_decoding_packed_mask, input_ids, position_ids, hidden_states + + ''' + + plg_creator = trt.get_plugin_registry().get_plugin_creator( + 'EaglePrepareDrafterInputs', '1', TRT_LLM_PLUGIN_NAMESPACE) + assert plg_creator is not None + + pf_type = trt.PluginField("type_id", + np.array([int(hidden_states.dtype)], np.int32), + trt.PluginFieldType.INT32) + + layer_idx = trt.PluginField("layer_idx", np.array(layer_idx, + dtype=np.int32), + trt.PluginFieldType.INT32) + + pfc = trt.PluginFieldCollection([pf_type, layer_idx]) + plugin = plg_creator.create_plugin("eagle_prepare_drafter_inputs_plugin", + pfc) + + plug_inputs = [ + attention_params.sequence_length, attention_params.host_request_types, + kv_cache_params.host_past_key_value_lengths, + kv_cache_params.host_kv_cache_pool_pointers, + kv_cache_params.kv_cache_block_offsets, tree_params.paths, hidden_states + ] + + plug_inputs = [i.trt_tensor for i in plug_inputs] + layer = default_trtnet().add_plugin_v2(plug_inputs, plugin) + + sequence_length = _create_tensor(layer.get_output(0), layer) + host_request_types = _create_tensor(layer.get_output(1), layer) + host_past_key_value_lengths = _create_tensor(layer.get_output(2), layer) + spec_decoding_generation_lengths = _create_tensor(layer.get_output(3), + layer) + spec_decoding_position_offsets = _create_tensor(layer.get_output(4), layer) + spec_decoding_packed_mask = _create_tensor(layer.get_output(5), layer) + input_ids = _create_tensor(layer.get_output(6), layer) + position_ids = _create_tensor(layer.get_output(7), layer) + hidden_states = _create_tensor(layer.get_output(8), layer) + return tuple([ + sequence_length, host_request_types, host_past_key_value_lengths, + spec_decoding_generation_lengths, spec_decoding_position_offsets, + spec_decoding_packed_mask, input_ids, position_ids, hidden_states + ]) + + +class EagleNet(Module): + + def __init__( + self, + config, + ): + super().__init__() + self.drafter = LLaMAModel(config) + + vocab_size_padded = pad_vocab_size(config.vocab_size, + config.mapping.tp_size) + if config.mapping.is_last_pp_rank(): + self.lm_head = ColumnLinear(config.hidden_size, + vocab_size_padded, + bias=False, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + 
tp_size=config.mapping.tp_size, + gather_output=True) + else: + self.lm_head = None + + def forward(self, hidden_states, input_ids, position_ids, + spec_decoding_params, kv_cache_params, attention_params): + hidden_states, cache = self.drafter( + input_ids, + position_ids=position_ids, + use_cache=True, + spec_decoding_params=spec_decoding_params, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + hidden_states_for_embed=hidden_states) + + return self.lm_head(hidden_states), hidden_states, cache + + +class EagleForCausalLM(LLaMAForCausalLM): + config_class = EagleConfig + + def __init__(self, config: EagleConfig): + + super().__init__(config) + self.num_eagle_layers = config.num_eagle_layers + self.hidden_size = config.hidden_size + self.vocab_size = config.vocab_size + vocab_size_padded = pad_vocab_size(self.vocab_size, + config.mapping.tp_size) + eagle_net_config = config.eagle_net_config + eagle_net_config.fc_after_embed = True + eagle_net_config.use_input_layernorm_in_first_layer = False + self.eagle_nets = ModuleList([ + EagleNet(config=eagle_net_config) + for _ in range(self.num_eagle_layers) + ]) + self.max_draft_len = config.max_draft_len + + def _prepare_drafter_inputs(self, layer_idx, input_attention_params, + input_kv_cache_params, input_tree_params, + hidden_states): + + drafter_inputs = eagle_prepare_drafter_inputs_plugin( + layer_idx, input_attention_params, input_kv_cache_params, + input_tree_params, hidden_states) + + sequence_length, host_request_types, host_past_key_value_lengths, \ + spec_decoding_generation_lengths, spec_decoding_position_offsets, \ + spec_decoding_packed_mask, input_ids, position_ids, hidden_states = drafter_inputs + + attention_params = input_attention_params + attention_params.sequence_length = sequence_length + attention_params.host_request_types = host_request_types + + kv_cache_params = input_kv_cache_params + kv_cache_params.host_past_key_value_lengths = host_past_key_value_lengths + + spec_decoding_params = SpecDecodingParams( + True, self.max_draft_len, spec_decoding_generation_lengths, + spec_decoding_position_offsets, spec_decoding_packed_mask) + + eagle_net_inputs = {} + eagle_net_inputs["input_ids"] = input_ids + eagle_net_inputs["position_ids"] = position_ids + eagle_net_inputs["attention_params"] = attention_params + eagle_net_inputs["kv_cache_params"] = kv_cache_params + eagle_net_inputs["spec_decoding_params"] = spec_decoding_params + eagle_net_inputs["hidden_states"] = hidden_states + return eagle_net_inputs + + def _slice_hidden_states(self, hidden_states, last_ids): + hidden_states = index_select(hidden_states, 0, + last_ids - 1) # [seq_len, hidden] + + hidden_states = hidden_states.view( + concat([shape(last_ids, 0), + shape(hidden_states, 1)])) + return hidden_states + + def _eagle_fwd_helper(self, lm_logits, hidden_states, *args, **kwargs): + # TODO what to do for context + + # FIXME Once it is clear what we need from Tree, either get them from runtime or assemble dynamically in the TRT + # Most likely the latter, as EAGLE-2 gets dynamic tree. 
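[Reviewer note] As a plain-PyTorch illustration (not TRT network code) of what _slice_hidden_states above computes: given packed hidden states and the cumulative index of each request's last accepted token, it gathers one hidden vector per request. All sizes below are made up.

import torch

hidden_states = torch.randn(10, 8)            # [total_tokens, hidden_size]
cum_last_accepted = torch.tensor([3, 7, 10])  # 1-based cumulative last-token positions
per_request = hidden_states.index_select(0, cum_last_accepted - 1)  # [batch, hidden]
assert per_request.shape == (3, 8)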
+ input_tree_params = kwargs["tree_params"] + + draft_tokens = kwargs['draft_tokens'] + draft_lens = kwargs['draft_lens'] + eagle_temperature = kwargs['eagle_temperature'] + rand_data_validation = kwargs['rand_data_validation'] + rand_data_sample = kwargs['rand_data_sample'] + + # Sample target tokens and accept them + # next_draft_tokens, next_draft_lens, next_paths are outputted here just to + # reserve the tensor with max size, which eagle_draft_decoder_plugin is going to directly write to + output = eagle_sample_and_accept_draft_plugin(lm_logits, draft_tokens, + draft_lens, + eagle_temperature, + rand_data_validation, + input_tree_params) + accepted_tokens, num_accepted_tokens, accepted_paths, \ + last_accepted_tokens, cum_sum_last_accepted_indices, next_draft_tokens, next_draft_lens = output + + # Get hidden states for accepted ids + hidden_states = self._slice_hidden_states( + hidden_states, cum_sum_last_accepted_indices) + + attention_params = kwargs["attention_params"] + kv_cache_params = kwargs["kv_cache_params"] + + # Run EAGLE nets + for li in range(self.num_eagle_layers): + + # FIXME: what to do with appending KV cache in the decoder + # We won't append more than max_draft_len + max_path_len + # TODO handle context requests in a special way + # TODO rewind KV cache + # For the 1st layer, rewind kv cache for the accepted tokens, prepare EAGLE Net inputs + eagle_net_inputs = self._prepare_drafter_inputs( + li, attention_params, kv_cache_params, input_tree_params, + hidden_states) + + # Run EAGLE Net + logits, hidden_states, _ = self.eagle_nets[li](**eagle_net_inputs) + + # Decode draft tokens + next_draft_tokens, next_draft_lens = eagle_draft_decoder_plugin( + li, logits, next_draft_tokens, next_draft_lens, + rand_data_sample, input_tree_params) + + # Update params + attention_params = eagle_net_inputs["attention_params"] + kv_cache_params = eagle_net_inputs["kv_cache_params"] + + # Mark tensors as output + accepted_tokens.mark_output('accepted_tokens') + num_accepted_tokens.mark_output('num_accepted_tokens') + accepted_paths.mark_output('accepted_paths') + next_draft_tokens.mark_output('next_draft_tokens') + next_draft_lens.mark_output('next_draft_lens') + + return next_draft_tokens + + def forward(self, *args, **kwargs): + extra_args = [ + "draft_tokens", "draft_lens", "eagle_temperature", + "rand_data_validation", "rand_data_sample", "tree_params" + ] + + base_kwargs = {k: v for k, v in kwargs.items() if k not in extra_args} + + # Base model forward + hidden_states = super().forward(*args, **base_kwargs) + + assert kwargs['use_cache'] and default_net( + ).plugin_config.paged_kv_cache + + lm_logits, hidden_states = hidden_states + + if self.mapping.is_last_pp_rank(): + # Call eagle logic to accept prev draft tokens and predict next draft tokens + next_draft_tokens = self._eagle_fwd_helper(lm_logits, hidden_states, + *args, **kwargs) + else: + hidden_states.mark_output('hidden_states_output', self.config.dtype) + + if self.mapping.is_last_pp_rank(): + return next_draft_tokens + return hidden_states + + def prepare_inputs(self, *args, **kwargs): + """ + Inputs needed: + device_request_types: [bs] + draft_tokens: [bs, max_draft_len] + draft_lens: [bs] + spec_decoding_generation_lengths: [bs] + spec_decoding_position_offsets: [bs, max_gen_tokens] + spec_decoding_packed_mask: [bs, max_draft_len, packed_length] ** + eagle_temperature: [bs] + rand_data_sample: [bs] + rand_data_validation: [bs, max_draft_tokens] + + ** The mask is tricky since the boolean mask will need to be + packed 
in runtime. So, the last dim will be: + packed_length = ceil(max_draft_tokens/32) + """ + default_range = GenerationMixin.default_range + remove_input_padding = default_net().plugin_config.remove_input_padding + use_gpt_attention_plugin = default_net( + ).plugin_config.gpt_attention_plugin + use_gemm_plugin = default_net().plugin_config.gemm_plugin + paged_kv_cache = default_net().plugin_config.paged_kv_cache + max_batch_size = kwargs['max_batch_size'] + assert max_batch_size is not None + bb_range = default_range(max_batch_size) + bb0_range = default_range(max_batch_size, min_range=0, opt_offset=1) + + kwargs['speculative_decoding_draft_tokens_external'] = False + kwargs['max_draft_len'] = self.max_draft_len + kwargs['spec_decoding_is_generation_length_variable'] = True + + # Call base class prepare inputs + inputs = super().prepare_inputs(*args, **kwargs) + + assert inputs['spec_decoding_params'] is not None + + enable_two_optimization_profiles = GenerationMixin.has_ctx_gen_opt_profiles( + use_gpt_attention_plugin=use_gpt_attention_plugin, + use_gemm_plugin=use_gemm_plugin, + remove_input_padding=remove_input_padding, + kv_cache_type=KVCacheType.PAGED + if paged_kv_cache else KVCacheType.CONTINUOUS) + if enable_two_optimization_profiles: + bb_range = [bb_range, bb_range] + bb0_range = [bb0_range, bb0_range] + draft_len_range = [self.max_draft_len] + path_len_range = [self.num_eagle_layers + 1] + else: + bb_range = [bb_range] + bb0_range = [bb0_range] + draft_len_range = [self.max_draft_len] + path_len_range = [self.num_eagle_layers + 1] + + draft_tokens = Tensor(name='draft_tokens', + dtype=trt.int32, + shape=[-1, self.max_draft_len], + dim_range=OrderedDict([ + ('batch_size_wt0', bb0_range), + ('draft_len', draft_len_range), + ])) + draft_lens = Tensor(name='draft_lens', + dtype=trt.int32, + shape=[-1], + dim_range=OrderedDict([ + ('batch_size_wt0', bb0_range), + ])) + eagle_temperature = Tensor(name='eagle_temperature', + dtype=trt.float32, + shape=[-1], + dim_range=OrderedDict([ + ("batch_size", bb_range), + ])) + rand_data_validation = Tensor(name='rand_data_validation', + dtype=trt.float32, + shape=[-1, self.max_draft_len], + dim_range=OrderedDict([ + ('batch_size_wt0', bb0_range), + ('draft_len', draft_len_range), + ])) + rand_data_sample = Tensor(name='rand_data_sample', + dtype=trt.float32, + shape=[-1], + dim_range=OrderedDict([ + ('batch_size', bb_range), + ])) + tree_paths = Tensor( + name='tree_paths', + dtype=trt.int32, + # FIXME max_accepted len is not necessary self.num_eagle_layers + 1. 
Only True for EAGLE-1 + shape=[-1, self.max_draft_len, self.num_eagle_layers + 1], + dim_range=OrderedDict([ + ('batch_size', bb_range), + ('draft_len', draft_len_range), + ('path_len', path_len_range), + ])) + + tree_params = TreeParams(paths=tree_paths) + + inputs['draft_tokens'] = draft_tokens + inputs['draft_lens'] = draft_lens + inputs['eagle_temperature'] = eagle_temperature + inputs['rand_data_validation'] = rand_data_validation + inputs['rand_data_sample'] = rand_data_sample + inputs['tree_params'] = tree_params + return inputs diff --git a/tensorrt_llm/models/eagle/weight.py b/tensorrt_llm/models/eagle/weight.py new file mode 100644 index 000000000..d0e114549 --- /dev/null +++ b/tensorrt_llm/models/eagle/weight.py @@ -0,0 +1,613 @@ +import copy +import functools +import time +from collections import defaultdict +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from tqdm import tqdm +from transformers.models.llama.modeling_llama import LlamaDecoderLayer +from transformers.pytorch_utils import Conv1D + +from tensorrt_llm import logger +from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models.convert_utils import (dup_kv_weight, generate_int8, + smooth_gemm, + smooth_gemm_fc1_gate, split, + split_matrix_tp, split_qkv_tp) + + +def get_tllm_linear_weight(weight, + prefix, + bias=None, + use_weight_only=False, + plugin_weight_only_quant_type=torch.int8, + postfix='weight'): + results = {} + if use_weight_only: + v = weight.t().contiguous().cpu() + processed_torch_weights, torch_weight_scales = \ + torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix( + v, plugin_weight_only_quant_type) + results[prefix + postfix] = processed_torch_weights + results[prefix + 'per_channel_scale'] = torch_weight_scales + else: + results[prefix + postfix] = weight.contiguous() + + if bias is not None: + results[prefix + 'bias'] = bias + + return results + + +@torch.no_grad() +def smooth_llama_model(model, scales, alpha, llama_qkv_para, llama_smoother): + # Smooth the activation and weights with smoother = $\diag{s}$ + for name, module in model.named_modules(): + if not isinstance(module, LlamaDecoderLayer): + continue + # qkv_proj + layer_name_q = name + ".self_attn.q_proj" + layer_name_k = name + ".self_attn.k_proj" + layer_name_v = name + ".self_attn.v_proj" + layer_name_qkv = name + ".self_attn.qkv_proj" + + weight = torch.cat([ + module.self_attn.q_proj.weight, module.self_attn.k_proj.weight, + module.self_attn.v_proj.weight + ], + dim=0) + + smoother = smooth_gemm(weight, scales[layer_name_q]["x"], + module.input_layernorm.weight, None, alpha) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] / smoother + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + # see transpose_weights function + llama_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + # ================================================================= + layer_name = name + ".self_attn.o_proj" + smoother = smooth_gemm(module.self_attn.o_proj.weight, + scales[layer_name]["x"], None, None, alpha) + llama_smoother[layer_name] = smoother.float() + + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.self_attn.o_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + fc1_layer_name 
= name + ".mlp.gate_proj" + gate_layer_name = name + ".mlp.up_proj" + + smoother = smooth_gemm_fc1_gate(module.mlp.gate_proj.weight, + module.mlp.up_proj.weight, + scales[fc1_layer_name]["x"], + module.post_attention_layernorm.weight, + None, alpha) + + scales[fc1_layer_name]["x"] = scales[fc1_layer_name]["x"] / smoother + scales[fc1_layer_name]["w"] = module.mlp.gate_proj.weight.abs().max( + dim=1)[0] + + scales[gate_layer_name]["x"] = scales[gate_layer_name]["x"] / smoother + scales[gate_layer_name]["w"] = module.mlp.up_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + layer_name = name + ".mlp.down_proj" + smoother = smooth_gemm(module.mlp.down_proj.weight, + scales[layer_name]["x"], None, None, alpha) + llama_smoother[layer_name] = smoother.float() + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.mlp.down_proj.weight.abs().max( + dim=1)[0] + + +@torch.no_grad() +def capture_activation_range(model, + tokenizer, + dataset, + num_samples=512, + seq_len=512): + model.eval() + device = next(model.parameters()).device + act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + + tokenizer.pad_token = tokenizer.eos_token + + def stat_tensor(name, tensor, act_scales, key): + hidden_dim = tensor.shape[-1] + tensor = tensor.view(-1, hidden_dim).abs().detach() + comming_max = torch.max(tensor, dim=0)[0].float() + + if act_scales[name][key] is None: + act_scales[name][key] = comming_max + else: + act_scales[name][key] = torch.max(act_scales[name][key], + comming_max) + + def stat_input_hook(m, x, y, name): + if isinstance(x, tuple): + x = x[0] + stat_tensor(name, x, act_scales, "x") + stat_tensor(name, y, act_scales, "y") + + if act_scales[name]["w"] is None: + act_scales[name]["w"] = m.weight.abs().clip(1e-8, + None).max(dim=1)[0] + + hooks = [] + for name, m in model.named_modules(): + if isinstance(m, nn.Linear) or isinstance(m, Conv1D): + hooks.append( + m.register_forward_hook( + functools.partial(stat_input_hook, name=name))) + + for i in tqdm(range(num_samples), desc="calibrating model"): + datapoint = dataset[i:i + 1] + line = copy.copy(datapoint) + line[0] = line[0] + ' TL;DR: ' + line[0] = line[0].strip() + line[0] = line[0].replace(" n't", "n't") + input_ids = tokenizer(line, + return_tensors="pt", + max_length=seq_len, + padding=True, + truncation=True).input_ids.to(device) + model(input_ids) + for h in hooks: + h.remove() + return act_scales + + +def get_weight(config, prefix, dtype): + if config[prefix + '.weight'].dtype != dtype: + config[prefix + '.weight'].data = config[prefix + '.weight'].to(dtype) + return config[prefix + '.weight'] + + +def get_bias(config, prefix, dtype): + if config[prefix + '.bias'].dtype != dtype: + config[prefix + '.bias'].data = config[prefix + '.bias'].to(dtype) + return config[prefix + '.bias'] + + +def get_weight_and_bias(config, prefix, dtype): + return get_weight(config, prefix, dtype), get_bias(config, prefix, dtype) + + +def get_tllm_linear_sq_weight(vals, + prefix, + shape, + tensor_parallel, + is_qkv=False, + per_token=False, + per_channel=False, + last_prefix=None, + bias=None, + smoother_value=None, + smoother_shape=None, + rank=0, + cat_dim=0, + multi_query_mode=False): + results = {} + + def multi_query_split(data, local_dim, head_size, tp_size, cur_rank): + q, k, v = np.split(data, [local_dim, local_dim + head_size], axis=-1) + q_split = np.split(q, tp_size, axis=-1) + k_split = np.split(k, tp_size, axis=-1) + v_split = 
np.split(v, tp_size, axis=-1) + return [ + np.concatenate((q_split[ii], k_split[ii], v_split[ii]), axis=-1) + for ii in range(tp_size) + ][cur_rank] + + col_shape = shape if (is_qkv or per_channel) else [1, 1] + + if per_token: + original_weights = vals["weight.int8.col"] + + local_dim = original_weights.shape[0] + head_size = (original_weights.shape[1] - local_dim) // 2 + if multi_query_mode: + cur_weights = multi_query_split(original_weights, local_dim, + head_size, tensor_parallel, rank) + else: + cur_weights = np.split(original_weights, + tensor_parallel, + axis=cat_dim)[rank] + if is_qkv: + hidden_dim = cur_weights.shape[0] + cur_weights = cur_weights.reshape(hidden_dim, -1) + results[prefix + + 'weight'] = torch.from_numpy(cur_weights).t().contiguous() + if smoother_value is None: + results[last_prefix] = torch.from_numpy( + np.array([1.0], dtype=np.float32)) + + if smoother_value is None: + if multi_query_mode: + cur_per_channel_value = multi_query_split( + vals["scale_w_quant_orig.col"], local_dim, head_size, + tensor_parallel, rank) + else: + cur_per_channel_value = np.split(vals["scale_w_quant_orig.col"], + tensor_parallel, + axis=cat_dim)[rank] + else: + cur_per_channel_value = vals["scale_w_quant_orig.col"] + results[prefix + 'per_channel_scale'] = torch.from_numpy( + np.array(cur_per_channel_value, + dtype=np.float32).reshape(col_shape)).contiguous() + else: + original_weights = np.array(vals["weight.int8"]) + cur_weights = np.split(original_weights, tensor_parallel, + axis=cat_dim)[rank] + + if is_qkv: + hidden_dim = cur_weights.shape[0] + cur_weights = cur_weights.reshape(hidden_dim, -1) + results[prefix + + 'weight'] = torch.from_numpy(cur_weights).t().contiguous() + + cur_per_channel_value = vals["scale_y_accum_quant"] + + results[prefix + 'per_channel_scale'] = torch.from_numpy( + np.array([cur_per_channel_value], + dtype=np.float32).reshape(col_shape)).contiguous() + + results[last_prefix] = torch.from_numpy( + np.array([vals['scale_x_orig_quant']], + dtype=np.float32)).contiguous() + + results[prefix + 'act_scale'] = torch.from_numpy( + np.array([[vals["scale_y_quant_orig"]]], + dtype=np.float32)).contiguous() + + if smoother_value is not None: + cur_smoother_value = np.split(smoother_value, + tensor_parallel, + axis=cat_dim)[rank] + results[prefix + 'smoother'] = cur_smoother_value.reshape( + smoother_shape).contiguous().to(torch.float32) + + if bias is not None: + results[prefix + 'bias'] = bias + + return results + + +def convert_hf_llama(hf_model, + mapping, + rank=0, + dtype='float32', + use_parallel_embedding=False, + sharding_dim=0, + use_weight_only=False, + share_embedding_table=False, + plugin_weight_only_quant_type=torch.int8, + use_smooth_quant=False, + per_channel=False, + per_token=False, + int8_kv_cache=False, + act_range=[], + qkv_para=[], + smoother=[], + lora_config=None): + + weights = {} + tik = time.time() + tensor_parallel = mapping.tp_size + model_params = dict(hf_model.named_parameters()) + dtype = getattr(torch, dtype) + num_attention_heads = hf_model.config.num_attention_heads + hidden_size = hf_model.config.hidden_size + intermediate_size = hf_model.config.intermediate_size + num_key_value_heads = hf_model.config.num_key_value_heads + mha_mode = (num_key_value_heads == num_attention_heads) + + num_hidden_layers = hf_model.config.num_hidden_layers + layers_range = mapping.pp_layers(num_hidden_layers) + for l in layers_range: + layer_idx = l - layers_range[0] + prefix = f'model.layers.{l}.' + tllm_prex = f'transformer.layers.{layer_idx}.' 
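[Reviewer note] The conversion loop starting here renames HuggingFace layer weights to TensorRT-LLM names, offsetting by the first layer owned by this pipeline-parallel rank. A self-contained sketch of the index arithmetic (the split below is hypothetical):

num_hidden_layers, pp_size, pp_rank = 8, 2, 1          # made-up PP split
per_rank = num_hidden_layers // pp_size
layers_range = list(range(pp_rank * per_rank, (pp_rank + 1) * per_rank))  # [4..7]

for l in layers_range:
    layer_idx = l - layers_range[0]                    # local index on this PP rank
    print(f"model.layers.{l}.  ->  transformer.layers.{layer_idx}.")
# model.layers.4.  ->  transformer.layers.0.
# ...
# model.layers.7.  ->  transformer.layers.3.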
+ + q_weight = get_weight(model_params, prefix + 'self_attn.q_proj', dtype) + k_weight = get_weight(model_params, prefix + 'self_attn.k_proj', dtype) + v_weight = get_weight(model_params, prefix + 'self_attn.v_proj', dtype) + + if not mha_mode: + head_size = hidden_size // num_attention_heads + if num_key_value_heads < tensor_parallel: + # duplicate the KV heads up to tensor_parallel + k_weight = dup_kv_weight(k_weight, num_key_value_heads, + tensor_parallel) + v_weight = dup_kv_weight(v_weight, num_key_value_heads, + tensor_parallel) + assert (k_weight.shape[0] % (mapping.tp_size * head_size)) == 0 + assert (v_weight.shape[0] % (mapping.tp_size * head_size)) == 0 + + wq = split(q_weight, mapping.tp_size, mapping.tp_rank) + wk = split(k_weight, mapping.tp_size, mapping.tp_rank) + wv = split(v_weight, mapping.tp_size, mapping.tp_rank) + + split_v = torch.concat((wq, wk, wv)) + + else: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + + split_v = split_qkv_tp(qkv_weight, num_attention_heads, hidden_size, + tensor_parallel, mapping.tp_rank) + if use_smooth_quant: + qkv_weight = qkv_para[prefix + 'self_attn.qkv_proj'] + + if not mha_mode: + hidden_size = qkv_weight.shape[0] + local_dim = hidden_size + head_size = (qkv_weight.shape[-1] - local_dim) // 2 + qkv_weight = qkv_weight.reshape(hidden_size, + local_dim + 2 * head_size) + else: + qkv_weight = qkv_weight.reshape(hidden_size, 3, hidden_size) + + int8_weights = generate_int8(qkv_weight, + act_range.get(prefix + + 'self_attn.qkv_proj'), + is_qkv=True, + multi_query_mode=bool(not mha_mode)) + + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'attention.qkv.', [ + 1, 3 * hidden_size // tensor_parallel + if mha_mode else hidden_size // tensor_parallel + + (hidden_size // num_key_value_heads) // + tensor_parallel * 2 + ], + tensor_parallel, + is_qkv=True, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + 'input_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1, + multi_query_mode=bool(not mha_mode))) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'attention.qkv.', + None, use_weight_only, + plugin_weight_only_quant_type)) + + if int8_kv_cache: + qkv_y = torch.cat([ + act_range.get(prefix + 'self_attn.q_proj')["y"], + act_range.get(prefix + 'self_attn.k_proj')["y"], + act_range.get(prefix + 'self_attn.v_proj')["y"] + ], + dim=0) + + int8_kv_scales = qkv_y.max() / 127. 
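[Reviewer note] A quick numeric sketch of the INT8 KV-cache scale computed just above: the per-layer scaling factor is the largest calibrated |q/k/v| value divided by 127, and activations quantize by dividing by that scale. The calibration numbers below are invented.

import torch

qkv_y = torch.tensor([0.2, 3.5, 1.1])               # made-up calibration maxima
int8_kv_scale = qkv_y.max() / 127.0                  # stored as kv_cache_scaling_factor
q = torch.clamp(torch.round(torch.tensor(3.5) / int8_kv_scale), -128, 127)
assert q.item() == 127.0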
+ + kv_cache_weights = {} + + kv_cache_weights[ + tllm_prex + + 'attention.kv_cache_scaling_factor'] = int8_kv_scales.reshape( + [1]) + + attn_dense_weight = get_weight(model_params, + prefix + 'self_attn.o_proj', dtype) + split_v = split_matrix_tp(attn_dense_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + if use_smooth_quant: + attn_dense_weight = attn_dense_weight.t() + int8_weights = generate_int8( + attn_dense_weight, act_range.get(prefix + 'self_attn.o_proj')) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'attention.dense.', [1, hidden_size], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + + 'attention.quantization_scaling_factor', + smoother_value=smoother[(prefix + 'self_attn.o_proj')], + smoother_shape=[1, hidden_size // tensor_parallel], + rank=mapping.tp_rank, + cat_dim=0)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'attention.dense.', + None, use_weight_only, + plugin_weight_only_quant_type)) + + mlp_gate_weight = get_weight(model_params, prefix + 'mlp.up_proj', + dtype) + split_v = split_matrix_tp(mlp_gate_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + if use_smooth_quant: + mlp_gate_weight = mlp_gate_weight.t() + int8_weights = generate_int8(mlp_gate_weight, + act_range.get(prefix + 'mlp.up_proj')) + + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'mlp.gate.', + [1, intermediate_size // tensor_parallel], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + 'post_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.gate.', None, + use_weight_only, + plugin_weight_only_quant_type)) + + mlp_fc_weight = get_weight(model_params, prefix + 'mlp.gate_proj', + dtype) + split_v = split_matrix_tp(mlp_fc_weight, + tensor_parallel, + mapping.tp_rank, + dim=0) + + if use_smooth_quant: + mlp_fc_weight = mlp_fc_weight.t() #verified + int8_weights = generate_int8( + mlp_fc_weight, act_range.get(prefix + 'mlp.gate_proj')) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'mlp.fc.', + [1, intermediate_size // tensor_parallel], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + 'post_layernorm.scale_to_int', + smoother_value=None, + smoother_shape=None, + rank=mapping.tp_rank, + cat_dim=-1)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.fc.', None, + use_weight_only, + plugin_weight_only_quant_type)) + + mlp_proj_weight = get_weight(model_params, prefix + 'mlp.down_proj', + dtype) + split_v = split_matrix_tp(mlp_proj_weight, + tensor_parallel, + mapping.tp_rank, + dim=1) + + if use_smooth_quant: + mlp_proj_weight = mlp_proj_weight.t() + int8_weights = generate_int8( + mlp_proj_weight, act_range.get(prefix + 'mlp.down_proj')) + weights.update( + get_tllm_linear_sq_weight( + int8_weights, + tllm_prex + 'mlp.proj.', [1, hidden_size], + tensor_parallel, + is_qkv=False, + per_token=per_token, + per_channel=per_channel, + last_prefix=tllm_prex + 'mlp.quantization_scaling_factor', + smoother_value=smoother[prefix + 'mlp.down_proj'], + smoother_shape=[1, intermediate_size // tensor_parallel], + rank=mapping.tp_rank, + cat_dim=0)) + else: + weights.update( + get_tllm_linear_weight(split_v, tllm_prex + 'mlp.proj.', None, + 
use_weight_only, + plugin_weight_only_quant_type)) + # Layer norms do not use tensor parallelism + input_ln_weight = get_weight(model_params, prefix + 'input_layernorm', + dtype) + weights[tllm_prex + 'input_layernorm.weight'] = input_ln_weight + + post_ln_weight = get_weight(model_params, + prefix + 'post_attention_layernorm', dtype) + weights[tllm_prex + 'post_layernorm.weight'] = post_ln_weight + + v = get_weight(model_params, 'model.embed_tokens', dtype) + + if hf_model.config.tie_word_embeddings: + # lm_head.weight has the same weights as embedding + if mapping.is_last_pp_rank(): + weights['lm_head.weight'] = split(v, mapping.tp_size, + mapping.tp_rank) + + if use_parallel_embedding: + v = split_matrix_tp(v, + mapping.tp_size, + mapping.tp_rank, + dim=sharding_dim) + + if mapping.is_first_pp_rank(): + weights['transformer.vocab_embedding.weight'] = v + + lm_head_weights = get_weight(model_params, 'lm_head', dtype) + + if mapping.is_last_pp_rank(): + weights['lm_head.weight'] = split_matrix_tp(lm_head_weights, + tensor_parallel, + mapping.tp_rank, + dim=0) + + ln_f_w = get_weight(model_params, 'model.norm', dtype) + weights['transformer.ln_f.weight'] = ln_f_w + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Weights loaded. Total time: {t}') + return weights + + +def load_eagle_hf(eagle_model_dir, + eagle_model, + base_model, + num_eagle_layers: int, + mapping=Mapping(), + rank=0, + dtype='float32'): + logger.info("Loading EAGLE weights ...") + + renamed_weights = {} + weights = convert_hf_llama(eagle_model, mapping, rank, dtype=dtype) + + eagle_ckpt_file = Path(eagle_model_dir) / "pytorch_model.bin" + eagle_state_dict = torch.load(eagle_ckpt_file, map_location="cpu") + base_model_params = dict(base_model.named_parameters()) + torch_dtype = str_dtype_to_torch(dtype) + + for di in range(num_eagle_layers): + renamed_weights[ + f"eagle_nets.{di}.drafter.fc.weight"] = eagle_state_dict[ + "fc.weight"].clone().to(torch_dtype) + renamed_weights[f"eagle_nets.{di}.drafter.fc.bias"] = eagle_state_dict[ + "fc.bias"].clone().to(torch_dtype) + # Use main model to duplicate lm_head + renamed_weights[f"eagle_nets.{di}.lm_head.weight"] = base_model_params[ + "lm_head.weight"].clone().to(torch_dtype) + + for di in range(num_eagle_layers): + for name, param in weights.items(): + name = name.replace("transformer.", "") + if "input_layernorm" in name: + continue + if "lm_head" in name: + continue + # FIXME tensors are duplicated on disk + renamed_weights[f"eagle_nets.{di}.drafter.{name}"] = param.clone() + + return renamed_weights diff --git a/tensorrt_llm/models/enc_dec/model.py b/tensorrt_llm/models/enc_dec/model.py index 52a013d16..0e1051adb 100644 --- a/tensorrt_llm/models/enc_dec/model.py +++ b/tensorrt_llm/models/enc_dec/model.py @@ -25,7 +25,7 @@ MLPType, PositionEmbeddingType, Tensor, assertion, cast, gather_last_token_logits, gelu, maximum, minimum, recv, send, shape, - slice, transpose) + transpose, unsqueeze) from tensorrt_llm.layers import (MLP, Attention, AttentionMaskType, AttentionParams, BertAttention, ColumnLinear, Conv1d, Embedding, FusedGatedMLP, GatedMLP, @@ -1887,11 +1887,9 @@ def __init__(self, config: PretrainedConfig): stride=2, padding=1, dtype=self._conv_dtype) - self.downsample_factor = 2 - - self.positional_embedding = Parameter(shape=(config.n_audio_ctx, - config.hidden_size), - dtype=self._dtype) + self.position_embedding = Embedding(self.config.max_position_embeddings, + self.config.hidden_size, + dtype=self.config.dtype) 
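[Reviewer note] The Whisper encoder now looks positions up through a learned position_embedding instead of slicing a fixed parameter, so the runtime must supply position_ids matching the conv-downsampled feature length. A plain-PyTorch sketch of one plausible way to build them under remove_input_padding (the exact packing is the runtime's responsibility; lengths below are invented):

import torch

downsample_factor = 2
feature_lens = [3000, 1000]                     # mel frames per request (made up)
position_ids = torch.cat(
    [torch.arange(l // downsample_factor, dtype=torch.int32) for l in feature_lens])
assert position_ids.shape[0] == sum(feature_lens) // downsample_factor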
self.encoder_layers = ModuleList([ EncoderLayer( hidden_size=config.hidden_size, @@ -1899,7 +1897,6 @@ def __init__(self, config: PretrainedConfig): num_attention_heads=config.num_attention_heads, num_kv_heads=config.num_attention_heads, head_size=config.hidden_size // config.num_attention_heads, - max_position_embeddings=3000, q_scaling=1.0, has_attention_qkvo_bias=True, has_mlp_bias=True, @@ -1909,14 +1906,15 @@ def __init__(self, config: PretrainedConfig): self.ln_post = LayerNorm(config.hidden_size, dtype=self._dtype) self.max_audio_feature_seq_len = 3000 + self.downsample_factor = 2 - def forward(self, input_features: Tensor, input_lengths=None): + def forward(self, + input_features: Tensor, + input_lengths=None, + position_ids=None): if default_net().plugin_config.remove_input_padding: - # BXT,D -> B,T,D -> B,D,T - input_features = input_features.view([ - input_lengths.shape[0], self.max_audio_feature_seq_len, - self.config.n_mels - ]) + # BXT,D -> 1,BxT,D -> 1,D,BxT + input_features = unsqueeze(input_features, 0) input_features = transpose(input_features, 1, 2) # Encoder conv needs to run in fp32 on Volta/Turing x_type = input_features.dtype @@ -1927,14 +1925,8 @@ def forward(self, input_features: Tensor, input_lengths=None): x = cast(x, x_type) x = gelu(x) x = transpose(x, 2, 1) - x = x + cast( - slice(input=self.positional_embedding.value, - starts=[0, 0], - sizes=[ - self.max_audio_feature_seq_len // self.downsample_factor, - self.positional_embedding.shape[1] - ], - strides=[1, 1]), x.dtype) + x = x + cast(self.position_embedding(position_ids), x.dtype) + if default_net().plugin_config.remove_input_padding: #B,T,D -> BxT,D x = x.view([-1, self.config.hidden_size]) @@ -1952,23 +1944,44 @@ def forward(self, input_features: Tensor, input_lengths=None): def prepare_inputs(self, max_batch_size=16): bs_range = [1, (max_batch_size + 1) // 2, max_batch_size] - # You may change max_audio_feature_seq_len here for distill-whisper models. 
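[Reviewer note] Under remove_input_padding the encoder no longer reshapes back to [B, T, D]; the change above keeps the packed layout and only adds a unit batch dimension before the conv stem. A plain-PyTorch equivalent of that reshape, with made-up sizes:

import torch

packed_features = torch.randn(4000, 80)   # [sum(T_i), n_mels]
x = packed_features.unsqueeze(0)           # [1, sum(T_i), n_mels]
x = x.transpose(1, 2)                      # [1, n_mels, sum(T_i)] for the Conv1d stem
assert x.shape == (1, 80, 4000)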
- max_audio_feature_seq_len = self.max_audio_feature_seq_len + min_feat_len, optimal_feat_len = 10, 1000 # 100ms, 10s + inlen_range = [ + min_feat_len, optimal_feat_len, self.max_audio_feature_seq_len + ] + inlen_range_after_downsample = [ + min_feat_len // self.downsample_factor, + optimal_feat_len // self.downsample_factor, + self.max_audio_feature_seq_len // self.downsample_factor + ] if not default_net().plugin_config.remove_input_padding: - x = Tensor( - name="input_features", - dtype=self._dtype, - shape=[-1, self.config.n_mels, max_audio_feature_seq_len], - dim_range=OrderedDict([ - ("batch_size", [bs_range]), - ("feature_dim", [self.config.n_mels]), - ("feature_len_range", [max_audio_feature_seq_len]), - ])) + x = Tensor(name="input_features", + dtype=self._dtype, + shape=[-1, self.config.n_mels, -1], + dim_range=OrderedDict([ + ("batch_size", [bs_range]), + ("feature_dim", [self.config.n_mels]), + ("feature_len_range", [inlen_range]), + ])) + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[-1, -1], + dim_range=OrderedDict([('batch_size', [bs_range]), + ('feature_len_downsample_range', + [inlen_range_after_downsample])]), + ) else: batch_seqlen_range = [ 1, - (max_audio_feature_seq_len * max_batch_size + 1) // 2, - max_audio_feature_seq_len * max_batch_size, + (self.max_audio_feature_seq_len * max_batch_size + 1) // 2, + self.max_audio_feature_seq_len * max_batch_size, + ] + batch_seqlen_downsample_range = [ + 1, + (self.max_audio_feature_seq_len // self.downsample_factor * + max_batch_size + 1) // 2, + self.max_audio_feature_seq_len // self.downsample_factor * + max_batch_size, ] x = Tensor(name="input_features", dtype=self._dtype, @@ -1977,6 +1990,13 @@ def prepare_inputs(self, max_batch_size=16): ("batch_seqlen_range", [batch_seqlen_range]), ("feature_dim", [self.config.n_mels]), ])) + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[-1], + dim_range=OrderedDict([('batch_seqlen_downsample_range', + [batch_seqlen_downsample_range])]), + ) input_lengths = Tensor( name="input_lengths", dtype=trt.int32, @@ -1984,7 +2004,11 @@ def prepare_inputs(self, max_batch_size=16): dim_range=OrderedDict([("batch_size", [bs_range])]), ) - return {'input_features': x, 'input_lengths': input_lengths} + return { + 'input_features': x, + 'input_lengths': input_lengths, + 'position_ids': position_ids + } def precompute_relative_attention_bias(self, build_config): pass diff --git a/tensorrt_llm/models/falcon/config.py b/tensorrt_llm/models/falcon/config.py index 79af97dff..5ee2ee99b 100644 --- a/tensorrt_llm/models/falcon/config.py +++ b/tensorrt_llm/models/falcon/config.py @@ -28,12 +28,15 @@ def __init__(self, *, bias: bool = False, parallel_attention: bool = False, + num_ln_in_parallel_attn: int | None = None, new_decoder_architecture: bool = False, + rotary_base: float = 10000.0, **kwargs): self.bias = bias self.parallel_attention = parallel_attention + self.num_ln_in_parallel_attn = num_ln_in_parallel_attn self.new_decoder_architecture = new_decoder_architecture - + self.rotary_base = rotary_base super().__init__(**kwargs) def to_dict(self): @@ -114,10 +117,14 @@ def from_hugging_face( hidden_act='gelu', bias=hf_config.bias, parallel_attention=hf_config.parallel_attn, + num_ln_in_parallel_attn=getattr(hf_config, + 'num_ln_in_parallel_attn', + None), new_decoder_architecture=hf_config.new_decoder_architecture, max_position_embeddings=getattr(hf_config, 'max_position_embeddings', 2048), + rotary_base=getattr(hf_config, 'rope_theta', 10000.0), 
intermediate_size=getattr(hf_config, 'ffn_hidden_size', None), mapping=mapping, diff --git a/tensorrt_llm/models/falcon/convert.py b/tensorrt_llm/models/falcon/convert.py index ef8713142..cade65c84 100644 --- a/tensorrt_llm/models/falcon/convert.py +++ b/tensorrt_llm/models/falcon/convert.py @@ -277,13 +277,17 @@ def load_weights_from_hf_model(model, config: FalconConfig): if new_decoder_architecture: input_ln_weight, input_ln_bias = get_weight_and_bias( model_params, f'{prefix}.ln_attn', dtype) + if input_ln_weight is None: + input_ln_weight, input_ln_bias = get_weight_and_bias( + model_params, f'{prefix}.input_layernorm', dtype) weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight if input_ln_bias is not None: weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias mlp_ln_weight, mlp_ln_bias = get_weight_and_bias( model_params, f'{prefix}.ln_mlp', dtype) - weights[f'{tllm_prex}.mlp_layernorm.weight'] = mlp_ln_weight + if mlp_ln_weight is not None: + weights[f'{tllm_prex}.mlp_layernorm.weight'] = mlp_ln_weight if mlp_ln_bias is not None: weights[f'{tllm_prex}.mlp_layernorm.bias'] = mlp_ln_bias else: @@ -316,7 +320,11 @@ def load_weights_from_hf_model(model, config: FalconConfig): if mapping.is_last_pp_rank(): if not share_embedding_table: - weights['lm_head.weight'] = split_matrix(embed_w.clone(), + lm_head = get_weight(model_params, 'lm_head', dtype) + if lm_head is None: + # No lm_head in the checkpoint, cloning word_embedding. + lm_head = embed_w.clone() + weights['lm_head.weight'] = split_matrix(lm_head, mapping.tp_size, mapping.tp_rank, dim=0) diff --git a/tensorrt_llm/models/falcon/model.py b/tensorrt_llm/models/falcon/model.py index 627335eeb..327ce2a64 100644 --- a/tensorrt_llm/models/falcon/model.py +++ b/tensorrt_llm/models/falcon/model.py @@ -46,6 +46,9 @@ def __init__(self, config: FalconConfig, layer_idx: int): self.new_decoder_architecture = config.new_decoder_architecture self.parallel_attn = config.parallel_attention + self.num_ln_in_parallel_attn = config.num_ln_in_parallel_attn + if self.num_ln_in_parallel_attn is None and self.new_decoder_architecture: + self.num_ln_in_parallel_attn = 2 if self.is_parallel_attention: # Not to apply allreduce inside the Attention/MLP layers. # allreduce applies after those layer. @@ -65,11 +68,13 @@ def __init__(self, config: FalconConfig, layer_idx: int): tp_rank=tp_rank, bias=config.bias, position_embedding_type=config.position_embedding_type, - quant_mode=config.quantization.quant_mode) + rotary_embedding_base=config.rotary_base, + quant_mode=config.quantization.quant_mode, + ) mlp_hidden_size = hidden_size * 4 if config.intermediate_size is None else config.intermediate_size - if self.new_decoder_architecture: + if self.new_decoder_architecture and self.num_ln_in_parallel_attn == 2: # Layernorm before MLP. 
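[Reviewer note] The Falcon converter changes above make missing tensors non-fatal: the new-architecture names are tried first, then the legacy ones, and a checkpoint shipped without lm_head falls back to the word embedding. A self-contained sketch of that fallback pattern (the dict and key names below are fake placeholders, not real checkpoint keys):

state_dict = {
    "transformer.h.0.input_layernorm.weight": "w_input_ln",
    "transformer.word_embeddings.weight": "w_embed",
}

def get(name):
    return state_dict.get(name)

input_ln = get("transformer.h.0.ln_attn.weight")
if input_ln is None:                       # older checkpoints use input_layernorm
    input_ln = get("transformer.h.0.input_layernorm.weight")

lm_head = get("lm_head.weight")
if lm_head is None:                        # no lm_head stored: clone the embedding
    lm_head = get("transformer.word_embeddings.weight")

assert (input_ln, lm_head) == ("w_input_ln", "w_embed")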
self.mlp_layernorm = LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, @@ -106,7 +111,7 @@ def forward(self, residual = hidden_states - if self.new_decoder_architecture: + if self.new_decoder_architecture and self.num_ln_in_parallel_attn == 2: mlp_ln_output = self.mlp_layernorm(hidden_states) hidden_states = self.input_layernorm(hidden_states) input_ln_output = hidden_states @@ -126,9 +131,13 @@ def forward(self, hidden_states = residual + attention_output residual = hidden_states hidden_states = self.post_layernorm(hidden_states) - else: + elif self.num_ln_in_parallel_attn == 2: hidden_states = mlp_ln_output + if (self.new_decoder_architecture and self.parallel_attn + and self.num_ln_in_parallel_attn == 1): + hidden_states = input_ln_output + hidden_states = self.mlp(hidden_states) if self.is_parallel_attention: diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index 92361e705..3ae431a48 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -519,7 +519,8 @@ def prepare_basic_inputs( max_draft_len=0, multiple_profiles: bool = False, streamingllm: bool = False, - opt_batch_size=None): + opt_batch_size=None, + pp_reduce_scatter: bool = False): enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( use_gpt_attention_plugin=use_gpt_attention_plugin, @@ -581,13 +582,14 @@ def prepare_basic_inputs( else: assert dtype is not None assert num_heads is not None + pp_hidden_size = hidden_size // mapping.tp_size if pp_reduce_scatter else hidden_size hidden_states = Tensor( name='hidden_states_input', dtype=dtype, - shape=[-1, hidden_size], + shape=[-1, pp_hidden_size], dim_range=OrderedDict([ ('num_tokens', num_tokens_range), - ('hidden_size', [hidden_size] * num_profiles), + ('hidden_size', [pp_hidden_size] * num_profiles), ]), ) diff --git a/tensorrt_llm/models/grok/convert.py b/tensorrt_llm/models/grok/convert.py index 527782e3c..219233cef 100644 --- a/tensorrt_llm/models/grok/convert.py +++ b/tensorrt_llm/models/grok/convert.py @@ -504,8 +504,10 @@ def load_weights_from_xai(*, config, mapping, model): assert quant_algo == QuantAlgo.W8A16 plugin_weight_only_quant_type = torch.int8 - moe_config = MoeConfig(config['moe_num_experts'], config['moe_top_k'], - config['moe_normalization_mode']).validate() + moe_config = MoeConfig( + num_experts=config['moe_num_experts'], + top_k=config['moe_top_k'], + normalization_mode=config['moe_normalization_mode']).validate() use_weight_only = quant_algo in [QuantAlgo.W8A16] diff --git a/tensorrt_llm/models/grok/model.py b/tensorrt_llm/models/grok/model.py index 7b77873d7..8fc34349f 100644 --- a/tensorrt_llm/models/grok/model.py +++ b/tensorrt_llm/models/grok/model.py @@ -68,8 +68,10 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): mlp_kwargs = {} assert config.moe_num_experts > 1, "Grok model is a MoE model." 
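[Reviewer note] With the pp_reduce_scatter flag added to generation_mixin above, the hidden_states_input tensor exchanged at a pipeline boundary is reduce-scattered across the tensor-parallel group, so each rank only carries a 1/tp_size slice of the hidden dimension (the receiving side all-gathers it back, as the LLaMA changes further below show). A one-line sketch of the resulting width, with invented sizes:

hidden_size, tp_size, pp_reduce_scatter = 8192, 4, True
pp_hidden_size = hidden_size // tp_size if pp_reduce_scatter else hidden_size
assert pp_hidden_size == 2048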
ClsMLP = MOE - moe_config = MoeConfig(config.moe_num_experts, config.moe_top_k, - config.moe_normalization_mode).validate() + moe_config = MoeConfig( + num_experts=config.moe_num_experts, + top_k=config.moe_top_k, + normalization_mode=config.moe_normalization_mode).validate() mlp_kwargs = { "moe_config": moe_config, "mapping": config.mapping, diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 25ea86e78..039354da7 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -57,6 +57,8 @@ def __init__(self, assert isinstance(moe, MoeConfig) self.moe = moe.validate() self.remove_duplicated_kv_heads = remove_duplicated_kv_heads + self.fc_after_embed = False + self.use_input_layernorm_in_first_layer = True super().__init__(**kwargs) @@ -70,6 +72,9 @@ def to_dict(self): output['residual_mlp'] = self.residual_mlp output[ 'disable_weight_only_quant_plugin'] = self.disable_weight_only_quant_plugin + output['fc_after_embed'] = self.fc_after_embed + output[ + 'use_input_layernorm_in_first_layer'] = self.use_input_layernorm_in_first_layer output['moe'] = self.moe.to_dict() return output diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index 3f102712a..4ed019b1d 100644 --- a/tensorrt_llm/models/llama/convert.py +++ b/tensorrt_llm/models/llama/convert.py @@ -1705,6 +1705,7 @@ def load_weights_from_gptq(quant_ckpt_path: str, config: LLaMAConfig): vocab_size = config.vocab_size dtype = config.dtype mapping = config.mapping + quant_algo = config.quantization.quant_algo gptq_llama = safetensors.safe_open(quant_ckpt_path, framework="pt", @@ -1757,6 +1758,7 @@ def unpack_int32_into_int8(w_packed): def process_and_assign_weight(v: List[torch.Tensor], tllm_prex: str, + quant_algo: QuantAlgo, tp_dim: int = -1): if tp_dim == -1: qweight_int32, qzeros_int32, scales_fp16 = [ @@ -1768,22 +1770,39 @@ def process_and_assign_weight(v: List[torch.Tensor], ] USE_UINT4_INPUT = 1 # Set to true if checkpoint store UINT4 weights + USE_UINT8_INPUT = 1 # Set to true if checkpoint store UINT8 weights USE_GPTQ_FOR_LLAMA = 1 # GPTQ-for-LLaMA added 1 to zeros - qweight_unpacked_int8 = unpack_int32_into_int8( - qweight_int32.T).T.contiguous() - 8 - qweight_interleaved = preprocessor(packer(qweight_unpacked_int8), - torch.quint4x2, - torch.float16).view(torch.float16) - # zeros = zeros * scales - qzeros_unpacked_int32 = unpack_int32_into_int8(qzeros_int32) - if not USE_UINT4_INPUT: - # Correcting UINT4 values back to INT4 order - mask_negative = qzeros_unpacked_int32[qzeros_unpacked_int32 < 0] - mask_positive = qzeros_unpacked_int32[qzeros_unpacked_int32 >= 0] - qzeros_unpacked_int32 = qzeros_unpacked_int32 + 16 * mask_negative - 16 * mask_positive - zeros_x_scales_fp16 = (-qzeros_unpacked_int32 + 8 * USE_UINT4_INPUT - - USE_GPTQ_FOR_LLAMA) * scales_fp16 + if quant_algo == QuantAlgo.W4A16_GPTQ: + # unpack inputs packed in int32 into int4 and store them in int8 format + qweight_unpacked_int8 = unpack_int32_into_int8( + qweight_int32.T).T.contiguous() - 8 + qweight_interleaved = preprocessor( + packer(qweight_unpacked_int8), torch.quint4x2, + torch.float16).view(torch.float16) + # zeros = zeros * scales + qzeros_unpacked_int32 = unpack_int32_into_int8(qzeros_int32) + if not USE_UINT4_INPUT: + # Correcting UINT4 values back to INT4 order + mask_negative = qzeros_unpacked_int32[qzeros_unpacked_int32 < 0] + mask_positive = qzeros_unpacked_int32[ + qzeros_unpacked_int32 >= 0] + qzeros_unpacked_int32 = qzeros_unpacked_int32 + 
16 * mask_negative - 16 * mask_positive + zeros_x_scales_fp16 = (-qzeros_unpacked_int32 + 8 * USE_UINT4_INPUT + - USE_GPTQ_FOR_LLAMA) * scales_fp16 + else: + # unpack inputs packed in int32 into int8 + qweight_unpacked_int8 = ( + qweight_int32.T.contiguous().view(torch.uint8).T.contiguous() - + 128).to(torch.int8) + qweight_interleaved = preprocessor(qweight_unpacked_int8, + torch.int8, torch.float16).view( + torch.float16) + qzeros_unpacked_int32 = qzeros_int32.view(torch.uint8) + zeros_x_scales_fp16 = (-qzeros_unpacked_int32 + + 128 * USE_UINT8_INPUT - + USE_GPTQ_FOR_LLAMA) * scales_fp16 + zeros_x_scales_fp16 = zeros_x_scales_fp16.half() results = { @@ -1842,29 +1861,39 @@ def process_and_assign_weight(v: List[torch.Tensor], # process_and_assign_weight(layer.attention.qkv, qkv_weight_list) weights.update( process_and_assign_weight(qkv_weight_list, - f'{tllm_prex}.attention.qkv')) + f'{tllm_prex}.attention.qkv', quant_algo)) # 4.2 attention.dense v = [load(prefix + gptq_key_list[5] + suf) for suf in gptq_suffix_list] # process_and_assign_weight(layer.attention.dense, v, 0) weights.update( process_and_assign_weight(v, f'{tllm_prex}.attention.dense', + quant_algo, tp_dim=0)) # 4.3 mlp.gate v = [load(prefix + gptq_key_list[6] + suf) for suf in gptq_suffix_list] # process_and_assign_weight(layer.mlp.gate, v, 1) weights.update( - process_and_assign_weight(v, f'{tllm_prex}.mlp.gate', tp_dim=1)) + process_and_assign_weight(v, + f'{tllm_prex}.mlp.gate', + quant_algo, + tp_dim=1)) # 4.4 mlp.proj v = [load(prefix + gptq_key_list[7] + suf) for suf in gptq_suffix_list] # process_and_assign_weight(layer.mlp.proj, v, 0) weights.update( - process_and_assign_weight(v, f'{tllm_prex}.mlp.proj', tp_dim=0)) + process_and_assign_weight(v, + f'{tllm_prex}.mlp.proj', + quant_algo, + tp_dim=0)) # 4.5 mlp.fc v = [load(prefix + gptq_key_list[8] + suf) for suf in gptq_suffix_list] # process_and_assign_weight(layer.mlp.fc, v, 1) weights.update( - process_and_assign_weight(v, f'{tllm_prex}.mlp.fc', tp_dim=1)) + process_and_assign_weight(v, + f'{tllm_prex}.mlp.fc', + quant_algo, + tp_dim=1)) # 4.6 input_layernorm v = load(prefix + gptq_key_list[9]) # layer.input_layernorm.weight.value = v.to(torch_dtype).cpu().numpy() diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index aabcc5265..72d1b3d7d 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -20,7 +20,7 @@ from ..._common import default_net from ..._utils import pad_vocab_size from ...functional import (AllReduceFusionOp, AllReduceFusionParams, Tensor, - non_gated_version, recv, send) + allgather, concat, non_gated_version, recv, send) from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear, Embedding, GatedMLP, PositionEmbeddingType, RmsNorm) from ...lora_manager import LoraConfig, use_lora @@ -43,13 +43,17 @@ def __init__(self, config: LLaMAConfig, layer_idx: int): super().__init__() self.layer_idx = layer_idx self.config = config + self.mapping = config.mapping - self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) + if (self.config.use_input_layernorm_in_first_layer + and self.layer_idx == 0) or self.layer_idx > 0: + self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) layers_range = config.mapping.pp_layers(config.num_hidden_layers) self.local_layer_idx = layer_idx - layers_range[0] + self.is_last_local_layer = layer_idx == layers_range[-1] 
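[Reviewer note] Stepping back to the W8A16 GPTQ branch added in llama/convert.py above: weights stored as unsigned bytes inside int32 words are reinterpreted bytewise and shifted by 128 into signed int8, and the zero points receive the matching +128 (minus GPTQ-for-LLaMA's extra 1). A NumPy sketch of the byte unpacking, assuming a little-endian host:

import numpy as np

# One int32 word packing the unsigned bytes 0x80..0x83 (little-endian assumed).
qweight_int32 = np.array([[0x83828180]], dtype=np.uint32).view(np.int32)
unpacked_uint8 = qweight_int32.view(np.uint8)                 # [[128, 129, 130, 131]]
unpacked_int8 = (unpacked_uint8.astype(np.int16) - 128).astype(np.int8)
assert unpacked_int8.tolist() == [[0, 1, 2, 3]]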
self.attention = Attention( local_layer_idx=self.local_layer_idx, hidden_size=config.hidden_size, @@ -134,7 +138,9 @@ def forward(self, hidden_states, residual = hidden_states else: residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) + if (self.config.use_input_layernorm_in_first_layer + and self.layer_idx == 0) or self.layer_idx > 0: + hidden_states = self.input_layernorm(hidden_states) attention_output = self.attention( hidden_states, @@ -190,9 +196,17 @@ def forward(self, norm_weight=next_layer_input_layernorm_args[0], eps=next_layer_input_layernorm_args[1])) else: - hidden_states = self.mlp(hidden_states, - lora_layer_params=lora_layer_params) - hidden_states = residual + hidden_states + if default_net( + ).plugin_config.pp_reduce_scatter and self.is_last_local_layer and not self.mapping.is_last_pp_rank( + ): + hidden_states = self.mlp( + hidden_states, + lora_layer_params=lora_layer_params, + last_local_layer_residual=residual) + else: + hidden_states = self.mlp( + hidden_states, lora_layer_params=lora_layer_params) + hidden_states = residual + hidden_states if use_cache: return (hidden_states, presents) return hidden_states @@ -204,6 +218,7 @@ def __init__(self, config: LLaMAConfig) -> None: super().__init__() self.mapping = config.mapping + self.hidden_size = config.hidden_size if self.mapping.is_first_pp_rank(): self.vocab_embedding = Embedding(config.vocab_size, config.hidden_size, @@ -211,6 +226,15 @@ def __init__(self, config: LLaMAConfig) -> None: self.layers = DecoderLayerList(LLaMADecoderLayer, config) + if config.fc_after_embed: + self.fc = ColumnLinear(2 * config.hidden_size, + config.hidden_size, + bias=True, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + gather_output=False) + if self.mapping.is_last_pp_rank(): self.ln_f = RmsNorm(normalized_shape=config.hidden_size, eps=config.norm_epsilon, @@ -225,6 +249,7 @@ def forward(self, kv_cache_params=None, attention_params=None, hidden_states=None, + hidden_states_for_embed=None, prompt_embedding_table: Optional[Tensor] = None, prompt_tasks: Optional[Tensor] = None, prompt_vocab_size: Optional[Tensor] = None, @@ -238,6 +263,18 @@ def forward(self, hidden_states = self.vocab_embedding(input_ids, *ptuning_args) else: hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) + if default_net().plugin_config.pp_reduce_scatter: + hidden_states = allgather(hidden_states, + self.mapping.tp_group, + gather_dim=0) + # reshape to (-1, hidden_size) + hidden_states = hidden_states.view( + concat([-1, self.hidden_size])) + + if hidden_states_for_embed is not None: + hidden_states = concat([hidden_states, hidden_states_for_embed], + dim=-1) + hidden_states = self.fc(hidden_states) hidden_states = self.layers.forward( hidden_states, diff --git a/tensorrt_llm/models/mamba/config.py b/tensorrt_llm/models/mamba/config.py new file mode 100644 index 000000000..70c41abda --- /dev/null +++ b/tensorrt_llm/models/mamba/config.py @@ -0,0 +1,342 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
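[Reviewer note] The fc_after_embed path added to LLaMAModel above exists for the EAGLE drafter: the token embedding is concatenated with the base model's hidden state and projected from 2*hidden back to hidden before the decoder layers run. A plain-PyTorch sketch (not the TRT graph) with invented sizes:

import torch

hidden_size, num_tokens = 16, 5
embed = torch.randn(num_tokens, hidden_size)        # vocab_embedding output
base_hidden = torch.randn(num_tokens, hidden_size)  # hidden_states_for_embed
fc = torch.nn.Linear(2 * hidden_size, hidden_size, bias=True)
x = fc(torch.cat([embed, base_hidden], dim=-1))     # what self.fc does in-graph
assert x.shape == (num_tokens, hidden_size)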
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +from enum import Enum +from typing import List, Optional, Union + +import torch +import transformers + +from ..._utils import torch_dtype_to_str +from ...logger import logger +from ...mapping import Mapping +from ..modeling_utils import PretrainedConfig, QuantConfig + + +class CheckpointType(str, Enum): + mistral_inference = "mistral_inference" + state_spaces = "state_spaces" + hf = "hf" + + +def get_ckpt_type(model_path): + try: + hf_config = transformers.AutoConfig.from_pretrained( + model_path, trust_remote_code=True) + if hasattr(hf_config, "ssm_cfg") and hf_config.ssm_cfg: + return CheckpointType.state_spaces + return CheckpointType.hf + except OSError: + if os.path.exists(os.path.join(model_path, "params.json")): + return CheckpointType.mistral_inference + + +class MambaConfig(PretrainedConfig): + + def __init__(self, + *, + residual_in_fp32: bool = True, + pad_vocab_size_multiple: int = -1, + layer_types: List[str] = ["recurrent"], + **kwargs): + self.residual_in_fp32 = residual_in_fp32 + self.pad_vocab_size_multiple = pad_vocab_size_multiple + self.layer_types = layer_types + super().__init__(**kwargs) + + def to_dict(self): + output = super().to_dict() + # Serialize the fields added in MambaConfig + + return output + + def update(self, data_dict): + self.__dict__.update(data_dict) + + @classmethod + def from_hugging_face( + cls, + hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'], + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + quant_config: Optional[QuantConfig] = None, + **kwargs): + import transformers + + ckpt_type = get_ckpt_type(hf_config_or_dir) + + mamba_version = 'Mamba1' + if ckpt_type == CheckpointType.hf: + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config_dir = str(hf_config_or_dir) + + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_dir, trust_remote_code=True) + + if dtype == 'auto': + dtype = getattr(hf_config, 'torch_dtype', None) + if dtype is None: + dtype = 'float16' + if isinstance(dtype, torch.dtype): + dtype = torch_dtype_to_str(dtype) + if dtype == 'float32': + dtype = 'float16' + if dtype == 'bfloat16' and torch.cuda.get_device_properties( + 0).major < 8: + logger.warning( + "Pre SM 80 GPUs do not support bfloat16, fallback to float16" + ) + dtype = 'float16' + + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, + "pad_vocab_size_multiple", 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - ( + vocab_size % pad_vocab_size_multiple) + return cls(architecture="MambaForCausalLM", + dtype=dtype, + num_hidden_layers=hf_config.num_hidden_layers, + num_attention_heads=mapping.world_size, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + vocab_size=vocab_size, + mamba_version=mamba_version, + hidden_act=hf_config.hidden_act, + rms_norm=hf_config.rms_norm, + residual_in_fp32=hf_config.residual_in_fp32, + pad_vocab_size_multiple=pad_vocab_size_multiple, + rnn_hidden_size=hf_config.intermediate_size, + 
rnn_conv_dim_size=hf_config.intermediate_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_bias=hf_config.use_bias, + mapping=mapping, + quantization=quant_config, + **kwargs) + elif ckpt_type == CheckpointType.state_spaces: + + mamba_version = 'Mamba2' + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config_dir = str(hf_config_or_dir) + + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_dir, trust_remote_code=True) + if dtype == 'auto': + dtype = getattr(hf_config, 'torch_dtype', None) + if dtype is None: + dtype = 'float16' + if isinstance(dtype, torch.dtype): + dtype = torch_dtype_to_str(dtype) + if dtype == 'float32': + dtype = 'float16' + if dtype == 'bfloat16' and torch.cuda.get_device_properties( + 0).major < 8: + logger.warning( + "Pre SM 80 GPUs do not support bfloat16, fallback to float16" + ) + dtype = 'float16' + + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, + "pad_vocab_size_multiple", 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - ( + vocab_size % pad_vocab_size_multiple) + assert hasattr(hf_config, + 'ssm_cfg') and hf_config.ssm_cfg['layer'] == 'Mamba2' + config = json.load( + open(os.path.join(hf_config_or_dir, 'config.json'))) + ssm_cfg = config.pop('ssm_cfg') + cfg_to_mamba_cfg = { + 'd_model': 'hidden_size', + 'n_layer': 'num_hidden_layers', + 'fused_add_norm': None, + 'tie_embeddings': None, + } + ssm_cfg_to_mamba_cfg = { + 'd_state': 'state_size', + 'd_conv': 'conv_kernel', + 'bias': 'use_bias', + 'headdim': 'head_dim', + 'ngroups': 'n_groups', + 'chunk_size': 'chunk_size', + 'rmsnorm': 'ssm_rmsnorm', + } + for k in cfg_to_mamba_cfg: + if k in config: + v = config.pop(k) + if cfg_to_mamba_cfg[k] is not None: + config[cfg_to_mamba_cfg[k]] = v + for k in ssm_cfg_to_mamba_cfg: + if k in ssm_cfg and ssm_cfg_to_mamba_cfg[k] is not None: + config[ssm_cfg_to_mamba_cfg[k]] = ssm_cfg[k] + + if 'expand' in config: + expand = config['expand'] + hf_config.intermediate_size = expand * config['hidden_size'] + else: + hf_config.intermediate_size = 2 * config['hidden_size'] + mamba2_default_cfg = { + 'n_groups': 1, + 'hidden_size': hf_config.d_model, + 'head_dim': 64, + 'chunk_size': 256, + 'state_size': 128, + } + hf_config.update(mamba2_default_cfg) + + conv_dim = hf_config.intermediate_size + 2 * hf_config.n_groups * hf_config.state_size + ssm_rmsnorm = getattr(hf_config, "ssm_rmsnorm", hf_config.rms_norm) + mamba2_cfg = { + 'rnn_head_size': hf_config.head_dim, + 'rnn_conv_dim_size': conv_dim, + 'ngroups': hf_config.n_groups, + 'chunk_size': hf_config.chunk_size, + 'ssm_rmsnorm': ssm_rmsnorm, + } + hf_config.update(mamba2_cfg) + + return cls(architecture="MambaForCausalLM", + dtype=dtype, + num_hidden_layers=hf_config.n_layer, + num_attention_heads=mapping.world_size + if mapping is not None else 1, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + vocab_size=vocab_size, + mamba_version=mamba_version, + hidden_act=hf_config.hidden_act, + rms_norm=hf_config.rms_norm, + residual_in_fp32=hf_config.residual_in_fp32, + pad_vocab_size_multiple=pad_vocab_size_multiple, + rnn_hidden_size=hf_config.intermediate_size, + rnn_conv_dim_size=hf_config.rnn_conv_dim_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_bias=hf_config.use_bias, + mapping=mapping, + quantization=quant_config, + rnn_head_size=hf_config.rnn_head_size, + 
ngroups=hf_config.ngroups, + chunk_size=hf_config.chunk_size, + ssm_rmsnorm=hf_config.ssm_rmsnorm, + **kwargs) + elif ckpt_type == CheckpointType.mistral_inference: + mamba_version = 'Mamba2' + + config = json.load( + open(os.path.join(hf_config_or_dir, 'params.json'))) + cfg_to_mamba_cfg = { + 'dim': 'hidden_size', + 'n_layers': 'num_hidden_layers', + 'n_groups': 'n_groups', + 'fused_add_norm': None, + 'tie_embeddings': None, + 'model_type': None, + } + for k in cfg_to_mamba_cfg: + if k in config: + v = config.pop(k) + if cfg_to_mamba_cfg[k] is not None: + config[cfg_to_mamba_cfg[k]] = v + + config['architecture'] = 'MambaForCuasualLM' + config['dtype'] = dtype + config['num_attention_heads'] = mapping.world_size + + hf_config = MambaConfig(**config) + mamba2_default_cfg = { + 'n_groups': 8, + 'hidden_size': 4096, + 'head_dim': 64, + 'chunk_size': 256, + 'state_size': 128, + 'conv_kernel': 4, + 'use_bias': False + } + + hf_config.update(mamba2_default_cfg) + conv_dim = hf_config.intermediate_size + 2 * hf_config.n_groups * hf_config.state_size + ssm_rmsnorm = getattr(hf_config, "ssm_rmsnorm", hf_config.rms_norm) + mamba2_cfg = { + 'rnn_head_size': hf_config.head_dim, + 'rnn_conv_dim_size': conv_dim, + 'ngroups': hf_config.n_groups, + 'chunk_size': hf_config.chunk_size, + 'ssm_rmsnorm': ssm_rmsnorm, + } + hf_config.update(mamba2_cfg) + + if 'expand' in config: + expand = config['expand'] + hf_config.intermediate_size = expand * hf_config.hidden_size + else: + hf_config.intermediate_size = 2 * hf_config.hidden_size + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, + "pad_vocab_size_multiple", 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - ( + vocab_size % pad_vocab_size_multiple) + + return cls( + architecture="MambaForCausalLM", + dtype=dtype, + num_hidden_layers=hf_config.num_hidden_layers, + num_attention_heads=mapping.world_size, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + # num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + mamba_version=mamba_version, + hidden_act=hf_config.hidden_act, + rms_norm=hf_config.rms_norm, + residual_in_fp32=hf_config.residual_in_fp32, + pad_vocab_size_multiple=pad_vocab_size_multiple, + rnn_hidden_size=hf_config.intermediate_size, + rnn_conv_dim_size=hf_config.rnn_conv_dim_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_bias=hf_config.use_bias, + mapping=mapping, + quantization=quant_config, + rnn_head_size=hf_config.rnn_head_size, + ngroups=hf_config.n_groups, + chunk_size=hf_config.chunk_size, + ssm_rmsnorm=hf_config.ssm_rmsnorm, + **kwargs) + else: + pass + + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config_dir = str(hf_config_or_dir) + + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_dir, trust_remote_code=True) + + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, "pad_vocab_size_multiple", + 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - (vocab_size % + pad_vocab_size_multiple) diff --git a/tensorrt_llm/models/mamba/convert.py b/tensorrt_llm/models/mamba/convert.py new file mode 100644 index 000000000..f55bda43c --- /dev/null +++ b/tensorrt_llm/models/mamba/convert.py @@ -0,0 +1,245 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import re +import time +from pathlib import Path +from typing import Union + +import torch + +import tensorrt_llm +from tensorrt_llm.models.convert_utils import (iterate_shard_files, + load_state_dict) + + +def get_weight(config, prefix, dtype): + return config[prefix + '.weight'].to(dtype).detach() + + +def get_bias(config, prefix, dtype): + if (prefix + '.bias') in config: + return config[prefix + '.bias'].to(dtype).detach() + return None + + +def get_weight_and_bias(config, prefix, dtype_w, dtype_b): + return get_weight(config, prefix, + dtype_w), get_bias(config, prefix, dtype_b) + + +def split(v, tp_size, idx, dim=0): + assert v.shape[dim] % tp_size == 0 + split_size = v.shape[dim] // tp_size + if tp_size == 1: + return v + return torch.split(v, split_size, dim=dim)[idx] + + +def rename_hf_to_tllm(name: str): + """ Rename a HF parameter name by the corresponding TRT-LLM style name. """ + # remove model + if 'model.' in name: + name = name.replace('model.', '') + + # change layer name + if 'embeddings.' in name: + name = name.replace('embeddings', 'vocab_embedding') + elif 'embedding.' in name: + name = name.replace('embedding', 'vocab_embedding') + norm_pattern = r'\d\.norm\.' + if 'mixer.' in name: + name = name.replace('mixer.', 'ssm.') + elif re.search(norm_pattern, name): + name = name.replace('norm.', 'input_layernorm.') + elif 'norm_f.' in name: + name = name.replace('norm_f.', 'ln_f.') + + # Parameter names in ssm layers + if 'A_log' in name: + name = name.replace('A_log', 'A') + elif 'dt_proj.bias' in name: + name = name.replace('dt_proj.bias', 'dt_bias') + return name + + +def convert_hf_mamba(hf_mamba, dtype='float32'): + weights = {} + tik = time.time() + + model_params = dict(hf_mamba.named_parameters()) + dtype = getattr(torch, dtype) + + # Parameter names in mamba block + for l in range(hf_mamba.config.num_hidden_layers): + # ssm layer + prefix = f'backbone.layers.{l}.mixer.' + tllm_prex = f'backbone.layers.{l}.ssm.' 
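+        # Each HF `mixer.<proj>` tensor maps 1:1 onto the TRT-LLM `ssm.<proj>`
+        # prefix set up above; only dt_proj is special: its bias is loaded in
+        # float32 (dtype_b below) and stored under the `dt_bias` name.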
+        for layer in ['conv1d', 'x_proj', 'dt_proj', 'out_proj']:
+            dtype_b = torch.float32 if layer == 'dt_proj' else dtype
+            weight, bias = get_weight_and_bias(model_params, prefix + layer,
+                                               dtype, dtype_b)
+            if layer == 'conv1d':
+                weight = weight.unsqueeze(3)
+            tllm_weight_name = tllm_prex + layer + '.weight'
+            tllm_bias_name = tllm_prex + ('dt_bias' if layer == 'dt_proj' else
+                                          layer + '.bias')
+            weights[tllm_weight_name] = weight
+            if bias is not None:
+                weights[tllm_bias_name] = bias
+        # in_proj
+        weight, bias = get_weight_and_bias(model_params, prefix + 'in_proj',
+                                           dtype, dtype)
+        in_proj_weights = torch.split(weight, weight.size(0) // 2, dim=0)
+        tllm_weight_name = tllm_prex + 'in_proj.weight'
+        weights[tllm_weight_name.replace('proj', 'proj_x')] = in_proj_weights[0]
+        weights[tllm_weight_name.replace('proj', 'proj_z')] = in_proj_weights[1]
+        if bias is not None:
+            in_proj_biases = torch.split(bias, bias.size(0) // 2, dim=0)
+            tllm_bias_name = tllm_prex + 'in_proj.bias'
+            weights[tllm_bias_name.replace('proj',
+                                           'proj_x')] = in_proj_biases[0]
+            weights[tllm_bias_name.replace('proj',
+                                           'proj_z')] = in_proj_biases[1]
+
+        # A and D
+        Aparam = model_params[prefix + 'A_log'].float().detach()
+        Aparam = Aparam.permute(1, 0).contiguous()
+        weights[tllm_prex + 'A'] = -torch.exp(Aparam)
+        weights[tllm_prex + 'D'] = model_params[prefix + 'D'].float().detach()
+        # norm
+        prefix = f'backbone.layers.{l}.norm'
+        tllm_prex = f'backbone.layers.{l}.input_layernorm.'
+        weight, bias = get_weight_and_bias(model_params, prefix, dtype, dtype)
+        weights[tllm_prex + 'weight'] = weight
+        if bias is not None:
+            weights[tllm_prex + 'bias'] = bias
+
+    # others
+    for layer in ['backbone.embeddings', 'backbone.norm_f']:
+        weight, bias = get_weight_and_bias(model_params, layer, dtype, dtype)
+        layer = layer.replace('embeddings', 'vocab_embedding')
+        layer = layer.replace('norm_f', 'ln_f')
+        weights[layer + '.weight'] = weight
+        if bias is not None:
+            weights[layer + '.bias'] = bias
+    weights['lm_head.weight'], _ = get_weight_and_bias(model_params,
+                                                       'backbone.embeddings',
+                                                       dtype, dtype)
+
+    tok = time.time()
+    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
+    print(f'Weights loaded. 
Total time: {t}') + return weights + + +def convert_from_hf_checkpoint(mamba_config: dict, model_dir: Union[str, Path]): + + print('Loading weights from HF Mamba...') + tik = time.time() + + tp_rank = mamba_config.mapping.tp_rank + tp_size = mamba_config.mapping.tp_size + d_inner = mamba_config.rnn_hidden_size + d_state = mamba_config.state_size + dtype = mamba_config.dtype + mamba_version = mamba_config.mamba_version + weights = {} + if isinstance(dtype, str): + dtype = tensorrt_llm.str_dtype_to_torch(dtype) + + for model_file in iterate_shard_files(model_dir, 0): + # logger.debug(f'Loading file {str(model_file)}...') + model_params = load_state_dict(model_file, dtype=dtype) + for name, param in model_params.items(): + # logger.debug(f'Converting weight {name}...') + tllm_name = rename_hf_to_tllm(name) + param = param.detach().cpu() + if 'A_log' in name: + param = -torch.exp(param.float()) + if mamba_version == 'Mamba1': + param = param.permute(1, 0).contiguous() + elif 'D' in name: + param = param.float() + elif 'dt_proj.bias' in name: + param = param.float() + elif 'dt_bias' in name: + param = param.float() + elif 'conv1d.weight' in name: + param = param.unsqueeze(3) + + # split in_proj in Mamba1 + if 'in_proj' in name and mamba_version == 'Mamba1': + in_proj_params = torch.split(param, param.size(0) // 2, dim=0) + weights[tllm_name.replace('proj', 'proj_x')] = in_proj_params[0] + weights[tllm_name.replace('proj', 'proj_z')] = in_proj_params[1] + elif 'in_proj' in name and mamba_version == 'Mamba2': + nheads = d_inner // mamba_config.rnn_head_size + ngroups = mamba_config.ngroups + + in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt = torch.split( + param, [ + d_inner, d_inner, ngroups * d_state, ngroups * d_state, + nheads + ], + dim=0) + in_proj_z = split(in_proj_z, tp_size, tp_rank, dim=0) + in_proj_x = split(in_proj_x, tp_size, tp_rank, dim=0) + in_proj_b = split(in_proj_b, tp_size, tp_rank, dim=0) + in_proj_c = split(in_proj_c, tp_size, tp_rank, dim=0) + in_proj_dt = split(in_proj_dt, tp_size, tp_rank, dim=0) + in_proj = torch.concat( + [in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt]) + weights[tllm_name] = in_proj.contiguous() + elif 'conv1d' in name and mamba_version == 'Mamba2': + ngroups = mamba_config.ngroups + conv_x, conv_b, conv_c = torch.split( + param, [d_inner, ngroups * d_state, ngroups * d_state], + dim=0) + conv_x = split(conv_x, tp_size, tp_rank, dim=0) + conv_b = split(conv_b, tp_size, tp_rank, dim=0) + conv_c = split(conv_c, tp_size, tp_rank, dim=0) + conv = torch.concat([conv_x, conv_b, conv_c]) + weights[tllm_name] = conv.contiguous() + elif any(keyword in name for keyword in ( + 'mixer.norm.weight', + 'A_log', + 'D', + 'dt_proj.bias', + 'dt_bias', + )) and mamba_version == 'Mamba2': + weights[tllm_name] = split(param, tp_size, tp_rank, dim=0) + elif 'out_proj' in name and mamba_version == 'Mamba2': + weights[tllm_name] = split(param, tp_size, tp_rank, + dim=1).contiguous() + else: + weights[tllm_name] = param + del model_params + + # lm_head + emb = weights['backbone.vocab_embedding.weight'] + if 'lm_head.weight' not in weights or weights['lm_head.weight'].data_ptr( + ) == emb.data_ptr(): + weights['lm_head.weight'] = copy.deepcopy(emb) + if mamba_version == 'Mamba2': + weights['lm_head.weight'] = split(weights['lm_head.weight'], + tp_size, + tp_rank, + dim=0) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Weights loaded. 
Total time: {t}') + return weights diff --git a/tensorrt_llm/models/mamba/model.py b/tensorrt_llm/models/mamba/model.py index 7d2aac4d6..79a20798d 100644 --- a/tensorrt_llm/models/mamba/model.py +++ b/tensorrt_llm/models/mamba/model.py @@ -12,20 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os from collections import OrderedDict -from typing import List, Optional +from typing import List, Optional, Union import tensorrt as trt +from transformers import AutoModelForCausalLM from ..._common import default_net from ..._utils import str_dtype_to_trt from ...functional import (Tensor, arange, cast, concat, expand, gather_last_token_logits, shape, unsqueeze) from ...layers import ColumnLinear, Embedding, LayerNorm, Mamba, Mamba2, RmsNorm +from ...mapping import Mapping from ...module import Module, ModuleList from ...plugin import current_all_reduce_helper from ..generation_mixin import GenerationMixin -from ..modeling_utils import PretrainedConfig, PretrainedModel +from ..modeling_utils import PretrainedConfig, PretrainedModel, QuantConfig +from .config import MambaConfig +from .convert import convert_from_hf_checkpoint, convert_hf_mamba class MambaLayer(Module): @@ -168,6 +173,7 @@ def forward(self, class MambaForCausalLM(PretrainedModel): + config_class = MambaConfig def __init__(self, config: PretrainedConfig): super().__init__(config) @@ -425,3 +431,42 @@ def prepare_inputs( return_dict['slot_mapping'] = slot_mapping return return_dict + + @classmethod + def from_hugging_face( + cls, + hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'], + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + quant_config: Optional[QuantConfig] = None, + **kwargs): + import transformers + + assert hf_model_or_dir is not None + use_preloading = isinstance(hf_model_or_dir, + transformers.PreTrainedModel) + if use_preloading: + hf_model = hf_model_or_dir + hf_config_or_dir = hf_model.config + else: + hf_model_dir = hf_model_or_dir + hf_config_or_dir = hf_model_or_dir + config = MambaConfig.from_hugging_face(hf_config_or_dir, + dtype=dtype, + mapping=mapping, + quant_config=quant_config, + **kwargs) + + if not os.path.exists(hf_model_dir): + hf_model = AutoModelForCausalLM.from_pretrained( + hf_model_dir, torch_dtype="auto", trust_remote_code=True) + + assert isinstance(hf_model, transformers.PreTrainedModel) + weights = convert_hf_mamba(hf_model, dtype) + else: + weights = convert_from_hf_checkpoint(config, hf_model_dir) + + model = cls(config) + model.load(weights) + + return model diff --git a/tensorrt_llm/models/medusa/config.py b/tensorrt_llm/models/medusa/config.py index 1e6df3c5a..7e7102e0a 100644 --- a/tensorrt_llm/models/medusa/config.py +++ b/tensorrt_llm/models/medusa/config.py @@ -14,9 +14,12 @@ # limitations under the License. from ..llama.config import LLaMAConfig +from ..qwen.config import QWenConfig -class MedusaConfig(LLaMAConfig): +# MedusaConfig is a thin wrapper that picks parent class for GenericMedusaConfig +# Medusa-specific config is stored and retrieved from GenericMedusaConfig. 
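+# Attribute reads on MedusaConfig are forwarded to the wrapped GenericMedusaConfig
+# via __getattribute__ below, so e.g. `cfg.num_medusa_heads` resolves against the
+# generic config, whose parent class (QWenConfig vs LLaMAConfig) is chosen from
+# kwargs['model_type'] at construction time.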
+class MedusaConfig(): def __init__(self, *, @@ -24,15 +27,33 @@ def __init__(self, num_medusa_layers: int = 1, max_draft_len: int = 63, **kwargs): - self.num_medusa_heads = num_medusa_heads - self.num_medusa_layers = num_medusa_layers - self.max_draft_len = max_draft_len - super().__init__(**kwargs) - - def to_dict(self): - output = super().to_dict() - # Serialize the fields added in MedusaConfig - output['num_medusa_heads'] = self.num_medusa_heads - output['num_medusa_layers'] = self.num_medusa_layers - output['max_draft_len'] = self.max_draft_len - return output + BaseConfig = QWenConfig if "qwen" in kwargs[ + 'model_type'] else LLaMAConfig + + class GenericMedusaConfig(BaseConfig): + + def __init__(self, num_medusa_heads, num_medusa_layers, + max_draft_len, **kwargs): + self.num_medusa_heads = num_medusa_heads + self.num_medusa_layers = num_medusa_layers + self.max_draft_len = max_draft_len + super().__init__(**kwargs) + + def to_dict(self): + output = super().to_dict() + # Serialize the fields added in MedusaConfig + output['num_medusa_heads'] = self.num_medusa_heads + output['num_medusa_layers'] = self.num_medusa_layers + output['max_draft_len'] = self.max_draft_len + return output + + self.config = GenericMedusaConfig(num_medusa_heads, num_medusa_layers, + max_draft_len, **kwargs) + + # Specialization to redirect accesses to self.config + def __getattribute__(self, name): + if name == 'config' or '__' in name: + return object.__getattribute__(self, name) + else: + config = object.__getattribute__(self, 'config') + return config.__getattribute__(name) diff --git a/tensorrt_llm/models/medusa/model.py b/tensorrt_llm/models/medusa/model.py index 689e8d822..676707641 100644 --- a/tensorrt_llm/models/medusa/model.py +++ b/tensorrt_llm/models/medusa/model.py @@ -14,6 +14,7 @@ # limitations under the License. from tensorrt_llm.models.llama.model import LLaMAForCausalLM +from tensorrt_llm.models.qwen.model import QWenForCausalLM from ..._common import default_net from ..._utils import pad_vocab_size @@ -21,6 +22,7 @@ from ...layers import ColumnLinear from ...mapping import Mapping from ...module import Module, ModuleList +from ..modeling_utils import PretrainedModel from .config import MedusaConfig @@ -80,65 +82,89 @@ def forward(self, x): return self.lm_head(hidden_states) -class MedusaForCausalLm(LLaMAForCausalLM): +# MedusaForCausalLm is a thin wrapper that picks parent class for GenericMedusaForCausalLM. +# All medusa functionality is defined in GenericMedusaForCausalLM. 
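+# The wrapper builds the real network as self.model and forwards attribute reads
+# to it via __getattribute__; attribute writes fall back to object.__setattr__,
+# bypassing the specialized __setattr__ defined in Module.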
+class MedusaForCausalLm(PretrainedModel): config_class = MedusaConfig def __init__(self, config: MedusaConfig): - super().__init__(config) - self.num_medusa_heads = config.num_medusa_heads - self.num_medusa_layers = config.num_medusa_layers - self.hidden_size = config.hidden_size - self.vocab_size = config.vocab_size - vocab_size_padded = pad_vocab_size(self.vocab_size, - config.mapping.tp_size) - self.medusa_heads = ModuleList([ - MedusaHead(num_layers=self.num_medusa_layers, - hidden_size=config.hidden_size, - vocab_size=vocab_size_padded, - hidden_act=config.hidden_act, - dtype=config.dtype, - mapping=config.mapping) - for _ in range(self.num_medusa_heads) - ]) - self.max_medusa_token_len = config.max_draft_len - - def forward(self, *args, **kwargs): - output_original = True - hidden_states = super().forward(*args, **kwargs) - - if kwargs['use_cache']: - if default_net().plugin_config.paged_kv_cache: - lm_logits, hidden_states = hidden_states - else: - lm_logits, presents, hidden_states = hidden_states - - if self.mapping.is_last_pp_rank(): - medusa_logits = [] - for i in range(self.num_medusa_heads): - medusa_logits.append(self.medusa_heads[i](hidden_states)) - # [num_medusa_heads, batch_size, num_medusa_tokens + 1, padded_vocab_size]. - # Remove padding [num_medusa_heads, batch_size * num_medusa_tokens + 1, padded_vocab_size]. - medusa_logits = stack(medusa_logits, dim=0) - medusa_logits.mark_output('medusa_logits', self.config.logits_dtype) + BaseLM = QWenForCausalLM if "qwen" in config.model_type else LLaMAForCausalLM + + class GenericMedusaForCausalLM(BaseLM): + + def __init__(self, config: MedusaConfig): + super().__init__(config) + self.num_medusa_heads = config.num_medusa_heads + self.num_medusa_layers = config.num_medusa_layers + self.hidden_size = config.hidden_size + self.vocab_size = config.vocab_size + vocab_size_padded = pad_vocab_size(self.vocab_size, + config.mapping.tp_size) + self.medusa_heads = ModuleList([ + MedusaHead(num_layers=self.num_medusa_layers, + hidden_size=config.hidden_size, + vocab_size=vocab_size_padded, + hidden_act=config.hidden_act, + dtype=config.dtype, + mapping=config.mapping) + for _ in range(self.num_medusa_heads) + ]) + self.max_medusa_token_len = config.max_draft_len + + def forward(self, *args, **kwargs): + output_original = True + hidden_states = super().forward(*args, **kwargs) + + if kwargs['use_cache']: + if default_net().plugin_config.paged_kv_cache: + lm_logits, hidden_states = hidden_states + else: + lm_logits, presents, hidden_states = hidden_states + + if self.mapping.is_last_pp_rank(): + medusa_logits = [] + for i in range(self.num_medusa_heads): + medusa_logits.append( + self.medusa_heads[i](hidden_states)) + # [num_medusa_heads, batch_size, num_medusa_tokens + 1, padded_vocab_size]. + # Remove padding [num_medusa_heads, batch_size * num_medusa_tokens + 1, padded_vocab_size]. 
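+                    # stack() prepends the head axis so all Medusa heads share
+                    # one tensor, exposed as the 'medusa_logits' network output.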
+ medusa_logits = stack(medusa_logits, dim=0) + medusa_logits.mark_output('medusa_logits', + self.config.logits_dtype) + else: + hidden_states.mark_output('hidden_states_output', + self.config.dtype) + + if kwargs['use_cache'] and default_net( + ).plugin_config.paged_kv_cache == False: + if self.mapping.is_last_pp_rank(): + if output_original: + return (medusa_logits, lm_logits, presents) + return (medusa_logits, presents) + return (hidden_states, presents) + else: + if self.mapping.is_last_pp_rank(): + if output_original: + return medusa_logits, lm_logits + return medusa_logits + return hidden_states + + def prepare_inputs(self, *args, **kwargs): + kwargs['speculative_decoding_draft_tokens_external'] = False + kwargs['max_draft_len'] = self.max_medusa_token_len + return super().prepare_inputs(*args, **kwargs) + + self.model = GenericMedusaForCausalLM(config) + + # Specialization to redirect accesses to self.model + def __getattribute__(self, name): + if name == 'model' or '__' in name: + return object.__getattribute__(self, name) else: - hidden_states.mark_output('hidden_states_output', self.config.dtype) - - if kwargs['use_cache'] and default_net( - ).plugin_config.paged_kv_cache == False: - if self.mapping.is_last_pp_rank(): - if output_original: - return (medusa_logits, lm_logits, presents) - return (medusa_logits, presents) - return (hidden_states, presents) - else: - if self.mapping.is_last_pp_rank(): - if output_original: - return medusa_logits, lm_logits - return medusa_logits - return hidden_states - - def prepare_inputs(self, *args, **kwargs): - kwargs['speculative_decoding_draft_tokens_external'] = False - kwargs['max_draft_len'] = self.max_medusa_token_len - return super().prepare_inputs(*args, **kwargs) + model = object.__getattribute__(self, 'model') + return model.__getattribute__(name) + + # Override specialized __setattr__ defined in Module + def __setattr__(self, name, value) -> None: + object.__setattr__(self, name, value) diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index 6b8fca73a..2a5a2448a 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -73,6 +73,7 @@ class SpeculativeDecodingMode(IntFlag): MEDUSA = auto() LOOKAHEAD_DECODING = auto() EXPLICIT_DRAFT_TOKENS = auto() + EAGLE = auto() @staticmethod def from_arguments(args: argparse.Namespace): @@ -86,6 +87,9 @@ def from_arguments(args: argparse.Namespace): return SpeculativeDecodingMode.LOOKAHEAD_DECODING elif args.speculative_decoding_mode == "explicit_draft_tokens": return SpeculativeDecodingMode.EXPLICIT_DRAFT_TOKENS + elif args.speculative_decoding_mode == "eagle": + logger.warning(f"EAGLE is not supported yet. 
Do not use it.") + return SpeculativeDecodingMode.EAGLE else: assert False, "Unknown speculative_decoding_mode " + args.speculative_decoding_mode @@ -703,6 +707,7 @@ def prepare_inputs( use_lora_plugin = default_net().plugin_config.lora_plugin multiple_profiles = default_net().plugin_config.multiple_profiles streamingllm = default_net().plugin_config.streamingllm + pp_reduce_scatter = default_net().plugin_config.pp_reduce_scatter kv_cache_type = None if not use_cache: @@ -746,7 +751,8 @@ def prepare_inputs( lora_target_modules=lora_target_modules, multiple_profiles=multiple_profiles, streamingllm=streamingllm, - opt_batch_size=opt_batch_size) + opt_batch_size=opt_batch_size, + pp_reduce_scatter=pp_reduce_scatter) result = { 'input_ids': diff --git a/tensorrt_llm/models/qwen/model.py b/tensorrt_llm/models/qwen/model.py index c3dd5b305..85e27b2a5 100644 --- a/tensorrt_llm/models/qwen/model.py +++ b/tensorrt_llm/models/qwen/model.py @@ -127,6 +127,7 @@ def forward( hidden_states: Tensor, attention_mask=None, use_cache=False, + spec_decoding_params=None, kv_cache_params=None, attention_params=None, lora_layer_params=None, @@ -137,6 +138,7 @@ def forward( hidden_states, attention_mask=attention_mask, use_cache=use_cache, + spec_decoding_params=spec_decoding_params, kv_cache_params=kv_cache_params, attention_params=attention_params, lora_layer_params=lora_layer_params, @@ -198,6 +200,7 @@ def forward(self, input_ids: Tensor, position_ids=None, use_cache=False, + spec_decoding_params=None, attention_mask=None, kv_cache_params=None, attention_params=None, @@ -216,12 +219,14 @@ def forward(self, else: hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) - hidden_states = self.layers.forward(hidden_states, - use_cache=use_cache, - attention_mask=attention_mask, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - lora_params=lora_params) + hidden_states = self.layers.forward( + hidden_states, + use_cache=use_cache, + spec_decoding_params=spec_decoding_params, + attention_mask=attention_mask, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + lora_params=lora_params) if use_cache: hidden_states, presents = hidden_states diff --git a/tensorrt_llm/plugin/plugin.py b/tensorrt_llm/plugin/plugin.py index 84441597c..c92a84611 100644 --- a/tensorrt_llm/plugin/plugin.py +++ b/tensorrt_llm/plugin/plugin.py @@ -167,12 +167,14 @@ class PluginConfig(metaclass=PluginConfigMeta): default=None, init=False) _weight_only_quant_matmul_plugin: Optional[str] = field(default=None, init=False) + _smooth_quant_plugins: bool = field( + default=True, + init=False) # Always enable smooth quant plugins for external users _quantize_per_token_plugin: bool = field(default=False, init=False) _quantize_tensor_plugin: bool = field(default=False, init=False) _moe_plugin: Optional[str] = field(default="auto", init=False) _mamba_conv1d_plugin: Optional[str] = field(default="auto", init=False) _low_latency_gemm_plugin: Optional[str] = field(default=None, init=False) - # Features _context_fmha: bool = field(default=True, init=False) _bert_context_fmha_fp32_acc: bool = field( @@ -189,6 +191,7 @@ class PluginConfig(metaclass=PluginConfigMeta): _streamingllm: bool = field(default=False, init=False) _manage_weights: bool = field(default=False, init=False) _use_fused_mlp: bool = field(default=True, init=False) + _pp_reduce_scatter: bool = field(default=False, init=False) def update_from_dict(self, config: dict): for name in config.keys(): @@ -317,6 +320,7 @@ def set_nccl_plugin(self, 
dtype: str = "auto"): "streamingllm", "reduce_fusion", "use_fused_mlp", + "pp_reduce_scatter", ] diff --git a/tensorrt_llm/prompt_adapter_manager.py b/tensorrt_llm/prompt_adapter_manager.py new file mode 100644 index 000000000..67b278e86 --- /dev/null +++ b/tensorrt_llm/prompt_adapter_manager.py @@ -0,0 +1,51 @@ +from typing import TYPE_CHECKING, Dict, List, Optional + +import torch + +from ._utils import str_dtype_to_torch +from .models.convert_utils import get_model_path, load_state_dict + +if TYPE_CHECKING: + from .runtime import ModelConfig + + +class PromptAdapterManager: + + def __init__(self): + self._uid_counter = 0 + self._uid_to_weights: Dict[str, torch.Tensor] = {} + + def load_from_ckpt(self, + model_dirs: List[str], + model_config: 'ModelConfig', + uids: Optional[List[str]] = None): + if uids is None: + uids = [self._generate_uid() for _ in range(len(model_dirs))] + assert len(uids) == len(model_dirs) + + new_uids, new_model_dirs = [], [] + for uid, model_dir in zip(uids, model_dirs): + if uid in self._uid_to_weights: + continue + new_uids.append(uid) + new_model_dirs.append(model_dir) + + if len(new_uids) == 0: + return + + for uid, model_dir in zip(new_uids, new_model_dirs): + state_dict = load_state_dict( + get_model_path(model_dir, 'adapter_model')) + self._uid_to_weights[uid] = state_dict['prompt_embeddings'].to( + str_dtype_to_torch(model_config.dtype)) + + @property + def uid_to_weights(self): + return self._uid_to_weights + + def _generate_uid(self): + while str(self._uid_counter) in self._uid_to_weights: + self._uid_counter += 1 + uid = str(self._uid_counter) + self._uid_counter += 1 + return uid diff --git a/tensorrt_llm/quantization/functional.py b/tensorrt_llm/quantization/functional.py index 6967ccef3..61d3e80d5 100644 --- a/tensorrt_llm/quantization/functional.py +++ b/tensorrt_llm/quantization/functional.py @@ -20,9 +20,10 @@ import torch.nn.functional as F from .._common import default_net, default_trtnet -from .._utils import str_dtype_to_np, str_dtype_to_trt +from .._utils import str_dtype_to_np, str_dtype_to_trt, trt_dtype_to_np from ..functional import (Tensor, _add_plugin_info, _create_tensor, cast, clip, - constant, matmul, repeat_interleave, round) + constant, flatten, layer_norm, matmul, + repeat_interleave, rms_norm, round, view) from ..layers.linear import ColumnLinear from ..plugin import TRT_LLM_PLUGIN_NAMESPACE from .mode import QuantMode @@ -30,9 +31,32 @@ def smooth_quant_gemm(input: Tensor, weights: Tensor, scales_a: Tensor, scales_b: Tensor, per_token_scaling: bool, - per_channel_scaling: bool) -> Tensor: + per_channel_scaling: bool, dtype: str) -> Tensor: if not default_net().plugin_config.smooth_quant_gemm_plugin: - raise TypeError("Smooth Quant GEMM is only supported with plugin") + if per_token_scaling and input.size(0) == -1: + # WAR for DQ per-token scaling doesn't support dynamic shapes + + scale_one = constant(np.array(1.0, dtype=np.float32)) + input = dequantize(input, scale_one, 0, 'float32') + weights = dequantize(weights, scale_one, 0, 'float32') + result = matmul(input, weights, False, True, False) + scales = matmul(scales_a, scales_b, False, False, False) + result = result * scales + result = cast(result, dtype) + return result + else: + if not per_token_scaling: + scales_a = view(scales_a, []) + else: + scales_a = flatten(scales_a) + if not per_channel_scaling: + scales_b = view(scales_b, []) + else: + scales_b = flatten(scales_b) + input = dequantize(input, scales_a, 0, dtype) + weights = dequantize(weights, scales_b, 0, dtype) 
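+            # Plugin-free fallback: dequantize both quantized operands with their
+            # (scalar or flattened per-channel/per-token) scales, then run a
+            # plain GEMM with the weight operand transposed (transb=True).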
+ result = matmul(input, weights, False, True, False) + return result else: plg_creator = trt.get_plugin_registry().get_plugin_creator( 'SmoothQuantGemm', '1', TRT_LLM_PLUGIN_NAMESPACE) @@ -119,7 +143,6 @@ def weight_only_quant_matmul(input: Tensor, dtype: str = 'float16', transa: bool = False, transb: bool = False) -> Tensor: - if not default_net( ).plugin_config.weight_only_quant_matmul_plugin or transa or transb: scale_axis = 0 if transb else 1 @@ -166,7 +189,6 @@ def weight_only_groupwise_quant_matmul(input: Tensor, quant_algo: int, group_size: int, dtype: str = 'float16') -> Tensor: - if not default_net( ).plugin_config.weight_only_groupwise_quant_matmul_plugin: scales = repeat_interleave(scales, group_size, 0) @@ -211,12 +233,12 @@ def weight_only_groupwise_quant_matmul(input: Tensor, matmul_plug = plg_creator.create_plugin("woq_groupwise_matmul", pfc) - # quant_algo = fp8_alpha * 8 + pre_quant_scale * 4 + zero * 2 + bias + # quant_algo = use_int8_weight * 16 + fp8_alpha * 8 + pre_quant_scale * 4 + zero * 2 + bias plug_inputs = [input.trt_tensor] # Flags for indicating whether the corresponding inputs are applied in quant_algo - # quant_algo = fp8_alpha * FP8_ALPHA + pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + bias * BIAS - # Here pre_quant_scale, zero and bias are boolean type + # quant_algo = use_int8_weight * INT8_WEIGHT + fp8_alpha * FP8_ALPHA + pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + bias * BIAS + # Here use_int8_weight, pre_quant_scale, zero and bias are boolean type BIAS = 1 ZERO = 2 PRE_QUANT_SCALE = 4 @@ -249,7 +271,17 @@ def smooth_quant_layer_norm(input: Tensor, use_diff_of_squares: bool = True, dynamic_act_scaling: bool = False) -> Tensor: if not default_net().plugin_config.layernorm_quantization_plugin: - raise TypeError("Smooth Quant Layer Norm is only supported with plugin") + dtype = trt_dtype_to_np(input.dtype) + if weight is None: + weight = constant(np.ones(normalized_shape, dtype=dtype)) + if bias is None: + bias = constant(np.zeros(normalized_shape, dtype=dtype)) + result = layer_norm(input, normalized_shape, weight, bias, eps, + use_diff_of_squares) + if not dynamic_act_scaling: + return quantize_tensor(result, scale) + else: + return quantize_per_token(result) else: plg_creator = trt.get_plugin_registry().get_plugin_creator( 'LayernormQuantization', '1', TRT_LLM_PLUGIN_NAMESPACE) @@ -308,7 +340,13 @@ def smooth_quant_rms_norm(input: Tensor, eps: float = 1e-05, dynamic_act_scaling: bool = False) -> Tensor: if not default_net().plugin_config.rmsnorm_quantization_plugin: - raise TypeError("Smooth Quant Rms Norm is only supported with plugin") + result = rms_norm(input, normalized_shape, 1, weight, eps) + if bias is not None: + result += bias + if not dynamic_act_scaling: + return quantize_tensor(result, scale) + else: + return quantize_per_token(result) else: plg_creator = trt.get_plugin_registry().get_plugin_creator( 'RmsnormQuantization', '1', TRT_LLM_PLUGIN_NAMESPACE) @@ -568,11 +606,15 @@ def quantize_fp8_per_token(x: Tensor, def quantize_tensor(x, scale): if not default_net().plugin_config.quantize_tensor_plugin: + if scale.dtype == str_dtype_to_trt('float32'): + x = cast(x, 'float32') scaled = x * scale rounded = round(scaled) clipped = clip(rounded, -128, 127) quantized = cast(clipped, 'int8') else: + scale = cast(scale, 'float32') + plg_creator = trt.get_plugin_registry().get_plugin_creator( 'QuantizeTensor', '1', TRT_LLM_PLUGIN_NAMESPACE) assert plg_creator is not None @@ -710,6 +752,7 @@ def postprocess_weight_only_groupwise(tllm_key, 
weights, torch_dtype, layer, USE_GPTQ = layer.prequant_scaling_factor is None and use_autoawq is None USE_HF_AWQ = layer.prequant_scaling_factor is None and use_autoawq is not None USE_MODELOPT_AWQ = layer.prequant_scaling_factor is not None + USE_INT8_WEIGHT = layer.quant_algo & 16 tp_dim = 1 if isinstance(layer, ColumnLinear) else 0 is_qkv = layer.is_qkv if hasattr(layer, "is_qkv") else False @@ -756,30 +799,45 @@ def postprocess_weight_only_groupwise(tllm_key, weights, torch_dtype, layer, weights = change_qkv_leading_dim(weights, num_heads) results = {tllm_key: weights} elif tllm_key.endswith("weight"): - if USE_GPTQ: - qweight = unpack_int32_into_int8(weights[0].T).T - 8 - elif USE_HF_AWQ: - qweight = unpack_int32_into_int8(weights[0]) - 8 + if not USE_INT8_WEIGHT: + # 4 bit quantization + if USE_GPTQ: + qweight = unpack_int32_into_int8(weights[0].T).T - 8 + elif USE_HF_AWQ: + qweight = unpack_int32_into_int8(weights[0]) - 8 + else: + qweight = unpack_int32_into_int8(weights.T) + qweight[qweight < 0] += 16 + qweight = qweight.view(torch.uint8) + elif USE_INT8_WEIGHT and USE_GPTQ: + # 8 bit quantization (only consider INT8 GPTQ here) + qweight = ( + weights[0].T.contiguous().view(torch.uint8).T.contiguous() - + 128).to(torch.int8) else: - qweight = unpack_int32_into_int8(weights.T) - qweight[qweight < 0] += 16 - qweight = qweight.view(torch.uint8) + warnings.warn("Unsupported quantization mode for weight.") + if using_head_as_leading_dim: qweight = change_qkv_leading_dim(qweight, num_heads) if layer.is_padded: qweight = torch.split(qweight, layer.out_features, tp_dim)[layer.tp_rank] qweight = pad_like(qweight, (layer.in_features, layer.out_features)) - qweight = (qweight[:, 1::2] * 16 + qweight[:, ::2]).view(torch.int8) + # pack int8 tensor to packed int4 + if not USE_INT8_WEIGHT: + qweight = (qweight[:, 1::2] * 16 + qweight[:, ::2]).view(torch.int8) + weight_type = torch.int8 if USE_INT8_WEIGHT else torch.quint4x2 qweight = torch.ops.trtllm.preprocess_weights_for_mixed_gemm( - qweight.contiguous(), torch.quint4x2, - torch.float16).view(torch_dtype) + qweight.contiguous(), weight_type, torch.float16).view(torch_dtype) results = {tllm_key: qweight} # scales and zeros for GPTQ and HF-AWQ if USE_GPTQ or USE_HF_AWQ: scales = weights[1].to(torch_dtype) - qzeros = unpack_int32_into_int8(weights[2]) + if USE_INT8_WEIGHT: + qzeros = weights[2].view(torch.uint8) + else: + qzeros = unpack_int32_into_int8(weights[2]) if using_head_as_leading_dim: scales = change_qkv_leading_dim(scales, num_heads) qzeros = change_qkv_leading_dim(qzeros, num_heads) @@ -792,7 +850,10 @@ def postprocess_weight_only_groupwise(tllm_key, weights, torch_dtype, layer, layer.weights_scaling_factor.shape[tp_dim], tp_dim)[layer.tp_rank] qzeros = pad_like(qzeros, layer.zero.shape, 7) - zeros_x_scales = (-qzeros + 8 - 1 * USE_GPTQ) * scales + if USE_INT8_WEIGHT: + zeros_x_scales = (-qzeros + 128 - 1 * USE_GPTQ) * scales + else: + zeros_x_scales = (-qzeros + 8 - 1 * USE_GPTQ) * scales zeros_x_scales = zeros_x_scales.to(torch_dtype) results.update({ tllm_key.replace("weight", "weights_scaling_factor"): diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py index ac0b14916..b9b1a3324 100644 --- a/tensorrt_llm/quantization/layers.py +++ b/tensorrt_llm/quantization/layers.py @@ -147,7 +147,8 @@ def forward(self, x, lora_runtime_params=None): x = smooth_quant_gemm(x, self.weight.value, per_token_scale, self.per_channel_scale.value, self.quant_mode.has_per_token_dynamic_scaling(), - 
self.quant_mode.has_per_channel_scaling()) + self.quant_mode.has_per_channel_scaling(), + self.dtype) if self.bias is not None: x = x + self.bias.value @@ -211,7 +212,8 @@ def forward(self, x, lora_runtime_params=None, reduce_fusion_params=None): x = smooth_quant_gemm(x, self.weight.value, per_token_scale, self.per_channel_scale.value, self.quant_mode.has_per_token_dynamic_scaling(), - self.quant_mode.has_per_channel_scaling()) + self.quant_mode.has_per_channel_scaling(), + self.dtype) if self.tp_size > 1 and self.tp_group is not None: need_bias = self.bias is not None @@ -594,6 +596,7 @@ def __init__( self.tp_rank = tp_rank if self.is_padded: self.tp_dim = -1 + self.quant_mode = quant_mode def forward(self, x, lora_runtime_params=None): # ootb has not supported int4 yet. @@ -677,6 +680,7 @@ def __init__( self.tp_rank = tp_rank if self.is_padded: self.tp_dim = -1 + self.quant_mode = quant_mode def forward(self, x, lora_runtime_params=None, reduce_fusion_params=None): hidden_state = x @@ -794,6 +798,7 @@ def __init__( tp_rank=0, gather_output=True, use_w4a8_awq=False, + use_int8_weight=False, is_qkv=False, ): multiple = max((128 if use_w4a8_awq else 64), group_size) * tp_size @@ -819,11 +824,17 @@ def __init__( ZERO = 2 PRE_QUANT_SCALE = 4 W4A8_AWQ = 8 + INT8_WEIGHT = 16 - self.quant_algo = use_w4a8_awq * W4A8_AWQ + pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + bias * BIAS + self.quant_algo = (use_int8_weight * INT8_WEIGHT + + use_w4a8_awq * W4A8_AWQ + + pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + + bias * BIAS) self.group_size = group_size + # packed in FP16 format (INT4*4 -> FP16, INT8*2 -> FP16) + pack_ratio = 2 if use_int8_weight else 4 self.weight = Parameter(shape=(self.in_features, - self.out_features // 4), + self.out_features // pack_ratio), dtype=dtype) scale_shape = (self.in_features // group_size, self.out_features) @@ -831,6 +842,8 @@ def __init__( self.tp_rank = tp_rank if self.is_padded: self.tp_dim = -1 + self.pre_quant_scale = pre_quant_scale + self.use_w4a8_awq = use_w4a8_awq if pre_quant_scale: self.prequant_scaling_factor = Parameter(shape=(1, @@ -898,20 +911,19 @@ def postprocess(self, tllm_key, weights, **kwargs): class WeightOnlyGroupwiseQuantRowLinear(RowLinear): - def __init__( - self, - in_features, - out_features, - group_size=128, - pre_quant_scale=False, - zero=False, - bias=False, - dtype=None, - tp_group=None, - tp_size=1, - tp_rank=0, - use_w4a8_awq=False, - ): + def __init__(self, + in_features, + out_features, + group_size=128, + pre_quant_scale=False, + zero=False, + bias=False, + dtype=None, + tp_group=None, + tp_size=1, + tp_rank=0, + use_w4a8_awq=False, + use_int8_weight=False): multiple = max((128 if use_w4a8_awq else 64), group_size) * tp_size self.is_padded = False if in_features % multiple > 0: @@ -932,18 +944,25 @@ def __init__( ZERO = 2 PRE_QUANT_SCALE = 4 W4A8_AWQ = 8 + INT8_WEIGHT = 16 - self.quant_algo = use_w4a8_awq * W4A8_AWQ + pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + bias * BIAS + self.quant_algo = (use_int8_weight * INT8_WEIGHT + + use_w4a8_awq * W4A8_AWQ + + pre_quant_scale * PRE_QUANT_SCALE + zero * ZERO + + bias * BIAS) self.group_size = group_size + # packed in FP16 format (INT4*4 -> FP16, INT8*2 -> FP16) + pack_ratio = 2 if use_int8_weight else 4 self.weight = Parameter(shape=(self.in_features, - self.out_features // 4), + self.out_features // pack_ratio), dtype=dtype) - scale_shape = (self.in_features // group_size, self.out_features) self.weights_scaling_factor = Parameter(shape=scale_shape, dtype=dtype) 
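+        # group-wise scales: one entry per (input group, output channel)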
self.tp_rank = tp_rank if self.is_padded: self.tp_dim = -1 + self.pre_quant_scale = pre_quant_scale + self.use_w4a8_awq = use_w4a8_awq if pre_quant_scale: self.prequant_scaling_factor = Parameter(shape=(1, @@ -1762,10 +1781,7 @@ def forward( reduce_fusion_params: Optional[AllReduceFusionParams] = None, ): assert lora_layer_params is None, "lora is not supported on SmoothQuantAttention now" - if default_net().plugin_config.smooth_quant_gemm_plugin: - qkv = self.qkv(hidden_states) - else: - raise ValueError("smooth_quant_gemm_plugin is not set") + qkv = self.qkv(hidden_states) alibi_slopes = None if self.position_embedding_type == PositionEmbeddingType.alibi: diff --git a/tensorrt_llm/quantization/mode.py b/tensorrt_llm/quantization/mode.py index 0ececc424..78a990ed5 100644 --- a/tensorrt_llm/quantization/mode.py +++ b/tensorrt_llm/quantization/mode.py @@ -25,6 +25,7 @@ class QuantAlgo(StrEnum, metaclass=BaseEnumMeta): W4A16 = auto() W4A16_AWQ = auto() W4A8_AWQ = auto() + W8A16_GPTQ = auto() W4A16_GPTQ = auto() W8A8_SQ_PER_CHANNEL = auto() W8A8_SQ_PER_TENSOR_PLUGIN = auto() @@ -104,6 +105,9 @@ def is_int4_weight_only(self): def is_weight_only(self): return self.is_int4_weight_only() or self.is_int8_weight_only() + def is_int8_weight_only_per_group(self): + return self.is_int8_weight_only() and self._any(self.PER_GROUP) + def is_int4_weight_only_per_group(self): return self.is_int4_weight_only() and self._any(self.PER_GROUP) @@ -267,6 +271,9 @@ def from_quant_algo( elif quant_algo == QuantAlgo.W4A16_GPTQ: quant_mode = QuantMode.use_weight_only(use_int4_weights=True, per_group=True) + elif quant_algo == QuantAlgo.W8A16_GPTQ: + quant_mode = QuantMode.use_weight_only(use_int4_weights=False, + per_group=True) elif quant_algo == QuantAlgo.W8A8_SQ_PER_CHANNEL: quant_mode = QuantMode.use_smooth_quant(per_token=False, per_channel=True) diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py index 92cc7bac8..4791cd8b0 100644 --- a/tensorrt_llm/quantization/quantize.py +++ b/tensorrt_llm/quantization/quantize.py @@ -131,6 +131,8 @@ def preprocess_init_params(init_params, name, module): init_params["zero"] = quant_config.has_zero_point init_params[ "use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ + init_params[ + "use_int8_weight"] = quant_config.quant_algo == QuantAlgo.W8A16_GPTQ init_params["tp_rank"] = model_cfg.mapping.tp_rank model = quantize_layers( diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py old mode 100644 new mode 100755 index 8c6869eae..04fdfc52f --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -126,7 +126,9 @@ def quant_cfg_choices(): "Starcoder2ForCausalLM": "gptnext", "GPTBigCodeForCausalLM": "gptnext", "GLM": "glm", + "Exaone": "exaone", "DeciLMForCausalLM": "deci", + "DeepseekForCausalLM": "deepseek", } @@ -571,6 +573,8 @@ def quantize_and_export(*, with open(f"{export_path}/config.json", "r") as f: tensorrt_llm_config = json.load(f) + tensorrt_llm_config["model_type"] = model_type + # Workaround for wo quantization if qformat in ["int8_wo", "int4_wo", "full_prec"]: if qformat == "int8_wo": diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 789e5f612..531c2f796 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.14.0.dev2024100800" +__version__ = "0.15.0.dev2024101500" diff --git a/tests/functional/test_pp_reduce_scatter.py b/tests/functional/test_pp_reduce_scatter.py new file mode 100644 index 000000000..dd2a278c3 --- /dev/null +++ b/tests/functional/test_pp_reduce_scatter.py @@ -0,0 +1,194 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from itertools import product + +import pytest + +# isort: off +import torch +# isort: on +import os +import sys + +from cuda import cudart +from parameterized import parameterized + +import tensorrt_llm as tllm +from tensorrt_llm import Mapping, Tensor +from tensorrt_llm.functional import (allgather, allreduce, concat, recv, + reduce_scatter, send) +from tensorrt_llm.plugin.plugin import (current_all_reduce_helper, + init_all_reduce_helper) + +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from utils.util import create_session, run_session, unittest_name_func + +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from utils.util import unittest_name_func + + +def forward_allreduce(x: Tensor, y: Tensor, mapping: Mapping) -> Tensor: + current = x + if mapping.tp_size > 1 and mapping.tp_group is not None: + current = allreduce(current, mapping.tp_group) + current = current + y + return current + + +def forward_reduce_scatter(x: Tensor, y: Tensor, mapping: Mapping, + hidden_size: int) -> Tensor: + if mapping.tp_rank == 0: + current = x + y + else: + current = x + 0 + # reshape to (-1) + current = current.view(concat([-1])) + if mapping.tp_size > 1 and mapping.tp_group is not None: + current = reduce_scatter(current, mapping.tp_group) + # reshape to (-1, hidden_size // tp_size) + current = current.view(concat([-1, hidden_size // mapping.tp_size])) + return current + + +class TestPPReduceScatter(unittest.TestCase): + + def setUp(self): + torch.manual_seed(20240603) + torch.cuda.manual_seed(20240603) + tllm.logger.set_level('error') + self.world_size = tllm.mpi_world_size() + self.rank = tllm.mpi_rank() + torch.cuda.set_device(self.rank) + cudart.cudaSetDevice(self.rank) + self.reference_tensors = [ + torch.full([10000000], i + 1, dtype=torch.float32, device="cuda") + for i in range(self.world_size) + ] + + @parameterized.expand(list( + product(['bfloat16', 'float16', 'float32'], [1, 4, 16, 64], + [4096, 8192, 12288], [2, 4, 8])), + name_func=unittest_name_func) + def test_pp_reduce_scatter(self, dtype: str, token_num: int, + hidden_size: int, pp_size: int): + if self.world_size == 1 or pp_size > self.world_size: + pytest.skip("Skip single GPU and pp_size > world_size case") + tp_size = self.world_size // pp_size + mapping = Mapping(self.world_size, self.rank, self.world_size, 1, + tp_size, pp_size) + + size = token_num * hidden_size # tensor size + torch_dtype = 
tllm._utils.str_dtype_to_torch(dtype) + dtype_size = torch.finfo(torch_dtype).bits // 8 + input = self.reference_tensors[self.rank][:size].to( + torch_dtype).reshape(token_num, hidden_size) + residual = torch.rand(input.shape, dtype=torch_dtype, device="cuda") + input_recv = torch.zeros(torch.Size([token_num, + hidden_size // tp_size]), + dtype=torch_dtype, + device="cuda") + + builder = tllm.Builder() + net_ref = builder.create_network() + net = builder.create_network() + init_all_reduce_helper() + _, workspace = current_all_reduce_helper().allocate_workspace( + mapping, size * dtype_size) + + with tllm.net_guard(net_ref): + x = Tensor(name='x', + shape=input.shape, + dtype=tllm.str_dtype_to_trt(dtype)) + y = Tensor(name='y', + shape=residual.shape, + dtype=tllm.str_dtype_to_trt(dtype)) + current_all_reduce_helper().set_workspace_tensor(mapping) + + if not mapping.is_first_pp_rank(): + net_ref_input = x + net_ref_input = recv(net_ref_input, mapping.prev_pp_rank()) + else: + net_ref_input = x + + if not mapping.is_last_pp_rank(): + output_ref = forward_allreduce(net_ref_input, y, mapping) + output_ref = send(output_ref, mapping.next_pp_rank()) + else: + output_ref = forward_allreduce(net_ref_input, y, mapping) + + output_ref.mark_output('output', dtype) + + with tllm.net_guard(net): + x = Tensor(name='x', + shape=input.shape, + dtype=tllm.str_dtype_to_trt(dtype)) + y = Tensor(name='y', + shape=residual.shape, + dtype=tllm.str_dtype_to_trt(dtype)) + x_recv = Tensor(name='x_recv', + shape=torch.Size( + [token_num, hidden_size // mapping.tp_size]), + dtype=tllm.str_dtype_to_trt(dtype)) + current_all_reduce_helper().set_workspace_tensor(mapping) + + if not mapping.is_first_pp_rank(): + net_input = x_recv + net_input = recv(net_input, mapping.prev_pp_rank()) + net_input = allgather(net_input, mapping.tp_group, gather_dim=0) + # reshape to (-1, hidden_size) + net_input = net_input.view(concat([-1, hidden_size])) + else: + net_input = x + + if not mapping.is_last_pp_rank(): + output = forward_reduce_scatter(net_input, y, mapping, + hidden_size) + output = send(output, mapping.next_pp_rank()) + else: + output = forward_allreduce(net_input, y, mapping) + + output.mark_output('output', dtype) + + feed_dict_ref = { + 'x': input, + 'y': residual, + 'all_reduce_workspace': workspace + } + + feed_dict = { + 'x': input, + 'y': residual, + 'x_recv': input_recv, + 'all_reduce_workspace': workspace + } + + # trt run + session_ref = create_session(builder, net_ref, precision=dtype) + outputs_ref = run_session(session_ref, feed_dict_ref) + + session = create_session(builder, net, precision=dtype) + outputs = run_session(session, feed_dict) + + # compare diff + if mapping.is_last_pp_rank(): + torch.testing.assert_allclose(outputs['output'], + outputs_ref['output'], + atol=1e-5, + rtol=1e-2) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/hlapi/apps/README.md b/tests/llmapi/apps/README.md similarity index 100% rename from tests/hlapi/apps/README.md rename to tests/llmapi/apps/README.md diff --git a/tests/hlapi/apps/_test_llm_chat.py b/tests/llmapi/apps/_test_llm_chat.py similarity index 100% rename from tests/hlapi/apps/_test_llm_chat.py rename to tests/llmapi/apps/_test_llm_chat.py diff --git a/tests/hlapi/apps/_test_llm_server.py b/tests/llmapi/apps/_test_llm_server.py similarity index 100% rename from tests/hlapi/apps/_test_llm_server.py rename to tests/llmapi/apps/_test_llm_server.py diff --git a/tests/hlapi/apps/_test_openai_chat.py b/tests/llmapi/apps/_test_openai_chat.py similarity 
index 100% rename from tests/hlapi/apps/_test_openai_chat.py rename to tests/llmapi/apps/_test_openai_chat.py diff --git a/tests/hlapi/apps/_test_openai_completions.py b/tests/llmapi/apps/_test_openai_completions.py similarity index 100% rename from tests/hlapi/apps/_test_openai_completions.py rename to tests/llmapi/apps/_test_openai_completions.py diff --git a/tests/hlapi/apps/_test_openai_misc.py b/tests/llmapi/apps/_test_openai_misc.py similarity index 100% rename from tests/hlapi/apps/_test_openai_misc.py rename to tests/llmapi/apps/_test_openai_misc.py diff --git a/tests/hlapi/apps/openai_server.py b/tests/llmapi/apps/openai_server.py similarity index 100% rename from tests/hlapi/apps/openai_server.py rename to tests/llmapi/apps/openai_server.py diff --git a/tests/hlapi/fake.sh b/tests/llmapi/fake.sh similarity index 100% rename from tests/hlapi/fake.sh rename to tests/llmapi/fake.sh diff --git a/tests/hlapi/grid_searcher.py b/tests/llmapi/grid_searcher.py similarity index 95% rename from tests/hlapi/grid_searcher.py rename to tests/llmapi/grid_searcher.py index 7faf66c7c..319868f8a 100644 --- a/tests/hlapi/grid_searcher.py +++ b/tests/llmapi/grid_searcher.py @@ -8,10 +8,10 @@ from typing import Any, Dict, Iterable, List, Optional from tensorrt_llm import logger -from tensorrt_llm.hlapi import (BuildConfig, CapacitySchedulerPolicy, - KvCacheConfig, SchedulerConfig) -from tensorrt_llm.hlapi._perf_evaluator import LLMPerfEvaluator -from tensorrt_llm.hlapi.utils import print_colored +from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy, + KvCacheConfig, SchedulerConfig) +from tensorrt_llm.llmapi._perf_evaluator import LLMPerfEvaluator +from tensorrt_llm.llmapi.utils import print_colored class GridSearcher: diff --git a/tests/hlapi/hlapi_evaluator.py b/tests/llmapi/llmapi_evaluator.py similarity index 94% rename from tests/hlapi/hlapi_evaluator.py rename to tests/llmapi/llmapi_evaluator.py index 1f35f3bbe..bad96236e 100644 --- a/tests/hlapi/hlapi_evaluator.py +++ b/tests/llmapi/llmapi_evaluator.py @@ -8,11 +8,11 @@ import click -from tensorrt_llm.hlapi import BuildConfig -from tensorrt_llm.hlapi._perf_evaluator import LLMPerfEvaluator -from tensorrt_llm.hlapi.llm import ModelLoader -from tensorrt_llm.hlapi.llm_utils import _ModelFormatKind -from tensorrt_llm.hlapi.utils import print_colored +from tensorrt_llm.llmapi import BuildConfig +from tensorrt_llm.llmapi._perf_evaluator import LLMPerfEvaluator +from tensorrt_llm.llmapi.llm import ModelLoader +from tensorrt_llm.llmapi.llm_utils import _ModelFormatKind +from tensorrt_llm.llmapi.utils import print_colored try: from .grid_searcher import GridSearcher @@ -62,7 +62,7 @@ def benchmark_main(model_path: str, max_batch_size: int = 128, engine_output_dir: str = "", cpp_executable: str = None): - ''' Run the benchmark on HLAPI. + ''' Run the benchmark on LLMAPI. If `cpp_executable_path` is provided, it will run the cpp benchmark as well. 
''' model_path = Path(model_path) @@ -84,8 +84,8 @@ def benchmark_main(model_path: str, temp_dir = tempfile.TemporaryDirectory() engine_output_dir = Path(temp_dir.name) - def run_hlapi(): - print_colored(f"Running HLAPI benchmark ...\n", + def run_llmapi(): + print_colored(f"Running LLMAPI benchmark ...\n", "bold_green", writer=sys.stdout) @@ -151,7 +151,7 @@ def run_gpt_manager_benchmark(): "red", writer=sys.stdout) - run_hlapi() + run_llmapi() if cpp_executable: run_gpt_manager_benchmark() diff --git a/tests/hlapi/mpi_test_task.py b/tests/llmapi/mpi_test_task.py similarity index 92% rename from tests/hlapi/mpi_test_task.py rename to tests/llmapi/mpi_test_task.py index 31e1f07af..1c7de1f3f 100644 --- a/tests/hlapi/mpi_test_task.py +++ b/tests/llmapi/mpi_test_task.py @@ -2,7 +2,7 @@ from mpi4py.futures import MPICommExecutor from tensorrt_llm._utils import mpi_comm, mpi_rank, mpi_world_size -from tensorrt_llm.hlapi.mpi_session import MpiCommSession, MPINodeState +from tensorrt_llm.llmapi.mpi_session import MpiCommSession, MPINodeState class MpiTask: diff --git a/tests/hlapi/run_llm.py b/tests/llmapi/run_llm.py similarity index 92% rename from tests/hlapi/run_llm.py rename to tests/llmapi/run_llm.py index fd69ae7bf..44709248c 100644 --- a/tests/hlapi/run_llm.py +++ b/tests/llmapi/run_llm.py @@ -3,7 +3,7 @@ import click -from tensorrt_llm.hlapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams @click.command() diff --git a/tests/hlapi/run_llm_exit.py b/tests/llmapi/run_llm_exit.py similarity index 100% rename from tests/hlapi/run_llm_exit.py rename to tests/llmapi/run_llm_exit.py diff --git a/tests/hlapi/test_build_cache.py b/tests/llmapi/test_build_cache.py similarity index 98% rename from tests/hlapi/test_build_cache.py rename to tests/llmapi/test_build_cache.py index 7b1cef7ef..a5f17b88f 100644 --- a/tests/hlapi/test_build_cache.py +++ b/tests/llmapi/test_build_cache.py @@ -1,7 +1,7 @@ import json from tempfile import TemporaryDirectory -from tensorrt_llm.hlapi.build_cache import * +from tensorrt_llm.llmapi.build_cache import * try: pass diff --git a/tests/hlapi/test_executor.py b/tests/llmapi/test_executor.py similarity index 98% rename from tests/hlapi/test_executor.py rename to tests/llmapi/test_executor.py index 972034891..47a0aba85 100644 --- a/tests/hlapi/test_executor.py +++ b/tests/llmapi/test_executor.py @@ -11,8 +11,8 @@ from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (GenerationExecutor, GenerationRequest, SamplingParams) -from tensorrt_llm.hlapi import LLM, BuildConfig -from tensorrt_llm.hlapi.tokenizer import TransformersTokenizer +from tensorrt_llm.llmapi import LLM, BuildConfig +from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer sys.path.append(_os.path.join(_os.path.dirname(__file__), '..')) import tempfile diff --git a/tests/hlapi/test_llm.py b/tests/llmapi/test_llm.py similarity index 93% rename from tests/hlapi/test_llm.py rename to tests/llmapi/test_llm.py index aca7b5437..24cc4a067 100644 --- a/tests/hlapi/test_llm.py +++ b/tests/llmapi/test_llm.py @@ -1,6 +1,7 @@ import asyncio import json import os +import shutil import sys import tempfile import time @@ -12,12 +13,13 @@ from tensorrt_llm._utils import release_gc from tensorrt_llm.executor import (ExecutorBindingsWorker, GenerationRequest, - GenerationResult, LoRARequest) -from tensorrt_llm.hlapi import (LLM, BuildCacheConfig, KvCacheConfig, - SamplingParams) -from tensorrt_llm.hlapi.llm_utils import 
BuildConfig, _ParallelConfig -from tensorrt_llm.hlapi.tokenizer import TokenizerBase, TransformersTokenizer -from tensorrt_llm.hlapi.utils import get_total_gpu_memory + GenerationResult, LoRARequest, + PromptAdapterRequest) +from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, KvCacheConfig, + SamplingParams) +from tensorrt_llm.llmapi.llm_utils import BuildConfig, _ParallelConfig +from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer +from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.lora_manager import LoraConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -376,6 +378,20 @@ def llm_for_sampling_params() -> LLM: return llm +def test_user_specify_workspace(): + user_specified_ws_path = '/tmp/specified_workspace' + shutil.rmtree(user_specified_ws_path, ignore_errors=True) + os.mkdir(user_specified_ws_path) + llm = LLM(model=llama_model_path, + kv_cache_config=global_kvcache_config, + workspace=user_specified_ws_path) + pre_built_engine_cfg = llm.args.model / 'config.json' + assert pre_built_engine_cfg.exists() + del llm + release_gc() + assert not pre_built_engine_cfg.exists() + + @force_ampere def test_generate_with_sampling_params_per_prompt(llm_for_sampling_params: LLM): llm = llm_for_sampling_params @@ -714,6 +730,45 @@ def test_llama_7b_multi_lora(): llama_7b_multi_lora_test_harness(max_loras=1, max_cpu_loras=8) +def llama_v2_7b_prompt_adapter_test_harness(**llm_kwargs): + hf_model_dir = get_model_path("llama-models-v2/llama-v2-7b-hf") + hf_prompt_adapter_dir = get_model_path("llama-models-v2/llama_tweet_ptune") + llm = LLM(hf_model_dir, + enable_prompt_adapter=True, + max_prompt_adapter_token=8, + **llm_kwargs) + + prompts = [ + "Born in north-east France, Soyer trained as a", + "Born in north-east France, Soyer trained as a", + "Tweet text: I have complaints! Label: ", + "Tweet text: I have complaints! Label: ", + "Tweet text: I have no problems Label: ", + "Tweet text: I have no problems Label: ", + ] + references = [ + "painter at the École des Beaux-Arts in Paris. He was a member of the", + "chef and has worked in the restaurant industry for 15 years.Ћ\nBorn in north", + "1999.\nTweet text: I have complaints! 
Label: 19", + "no complaint", + "100%\nI have no problems Label: 100%\nI have no", + "no complaint", + ] + pa_req = PromptAdapterRequest('tweet', 1, hf_prompt_adapter_dir) + sampling_params = SamplingParams(max_tokens=20) + outputs = llm.generate( + prompts, + sampling_params, + prompt_adapter_request=[None, pa_req, None, pa_req, None, pa_req]) + for output, ref in zip(outputs, references): + assert similar(output.outputs[0].text, ref) + + +@skip_less_than_40gb_memory +def test_llama_v2_7b_prompt_adapter(): + llama_v2_7b_prompt_adapter_test_harness() + + @force_ampere def test_generate_block_reuse(): build_config = BuildConfig() diff --git a/tests/hlapi/test_llm_download.py b/tests/llmapi/test_llm_download.py similarity index 83% rename from tests/hlapi/test_llm_download.py rename to tests/llmapi/test_llm_download.py index 079e72cb9..1a7315bcf 100644 --- a/tests/hlapi/test_llm_download.py +++ b/tests/llmapi/test_llm_download.py @@ -1,6 +1,6 @@ -from tensorrt_llm.hlapi import LLM -from tensorrt_llm.hlapi.utils import (download_hf_model, - download_hf_pretrained_config) +from tensorrt_llm.llmapi import LLM +from tensorrt_llm.llmapi.utils import (download_hf_model, + download_hf_pretrained_config) try: from test_llm import llama_model_path diff --git a/tests/hlapi/test_llm_models.py b/tests/llmapi/test_llm_models.py similarity index 88% rename from tests/hlapi/test_llm_models.py rename to tests/llmapi/test_llm_models.py index a659f8002..316183dd9 100644 --- a/tests/hlapi/test_llm_models.py +++ b/tests/llmapi/test_llm_models.py @@ -3,7 +3,7 @@ import pytest from tensorrt_llm import BuildConfig, SamplingParams -from tensorrt_llm.hlapi import CalibConfig, QuantAlgo, QuantConfig +from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig try: from .test_llm import cnn_dailymail_path, get_model_path, llm_test_harness @@ -15,7 +15,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import (force_ampere, skip_less_than_40gb_memory, - skip_pre_ampere, skip_pre_hopper) + skip_less_than_memory, skip_pre_ampere, skip_pre_hopper) gptj_model_path = get_model_path('gpt-j-6b') gpt2_model_path = get_model_path('gpt2-medium') @@ -36,6 +36,10 @@ qwen_model_path = get_model_path('Qwen-1_8B-Chat') qwen1_5_model_path = get_model_path('Qwen1.5-0.5B-Chat') qwen2_model_path = get_model_path('Qwen2-7B-Instruct') +mamba2_370m_model_path = get_model_path('mamba2/mamba2-370m') +gpt_neox_20b_model_path = get_model_path('gpt-neox-20b') +commandr_v01_model_path = get_model_path('c4ai-command-r-v01') +commandr_plus_model_path = get_model_path('c4ai-command-r-plus') sampling_params = SamplingParams(max_tokens=10) @@ -339,6 +343,37 @@ def test_llm_qwen2_fp8(): trust_remote_code=True) +@skip_pre_ampere +def test_llm_mamba2_370m(): + build_config = BuildConfig() + build_config.plugin_config._paged_kv_cache = False + llm_test_harness(mamba2_370m_model_path, + inputs=['A B C'], + references=['D E F G H I J K L M'], + sampling_params=sampling_params, + tokenizer=gpt_neox_20b_model_path, + build_config=build_config, + trust_remote_code=True) + + +@skip_less_than_memory(70 * 1024 * 1024 * 1024) +def test_llm_commandr_v01(): + llm_test_harness(commandr_v01_model_path, + inputs=['A B C'], + references=[' D E F G H I J K L M'], + sampling_params=sampling_params) + + +@skip_less_than_40gb_memory +def test_llm_commandr_v01_int8_weight_only(): + quant_config = QuantConfig(quant_algo=QuantAlgo.W8A16) + llm_test_harness(commandr_v01_model_path, + inputs=['A B C'], + references=[' D E F G H I J K L 
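The prompt-adapter harness above checks generations with the test suite's fuzzy `similar()` helper rather than exact string equality, since prompt-tuned outputs can drift by a token or two. A plausible stand-in based on difflib (an assumption — the real helper in the repo's test utilities may be implemented differently):

```python
from difflib import SequenceMatcher


def similar(a: str, b: str, threshold: float = 0.8) -> bool:
    # ratio() is 1.0 for identical strings and shrinks with edit distance
    return SequenceMatcher(None, a, b).ratio() >= threshold


assert similar("no complaint", "no complaint!")
assert not similar("no complaint", "100% I have no problems")
```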
M'], + sampling_params=sampling_params, + quant_config=quant_config) + + if __name__ == '__main__': test_llm_gptj() test_llm_phi_1_5() @@ -346,3 +381,5 @@ def test_llm_qwen2_fp8(): test_llm_phi_3_mini_4k() test_llm_phi_3_small_8k() test_llm_glm() + test_llm_commandr_v01() + test_llm_commandr_v01_int8_weight_only() diff --git a/tests/hlapi/test_llm_models_multi_gpu.py b/tests/llmapi/test_llm_models_multi_gpu.py similarity index 100% rename from tests/hlapi/test_llm_models_multi_gpu.py rename to tests/llmapi/test_llm_models_multi_gpu.py diff --git a/tests/hlapi/test_llm_multi_gpu.py b/tests/llmapi/test_llm_multi_gpu.py similarity index 90% rename from tests/hlapi/test_llm_multi_gpu.py rename to tests/llmapi/test_llm_multi_gpu.py index 782a8037b..ba1b34280 100644 --- a/tests/hlapi/test_llm_multi_gpu.py +++ b/tests/llmapi/test_llm_multi_gpu.py @@ -11,33 +11,33 @@ from tensorrt_llm.executor import (ExecutorBindingsProxy, GenerationRequest, GenerationResult) -from tensorrt_llm.hlapi import LLM, KvCacheConfig, SamplingParams -from tensorrt_llm.hlapi.tokenizer import TransformersTokenizer -from tensorrt_llm.hlapi.utils import get_total_gpu_memory +from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams +from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer +from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.llama.model import LLaMAForCausalLM sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import skip_single_gpu, unittest_name_func +# isort: off try: - from .test_llm import DummyExecutorWorker2 # isort:skip - from .test_llm import (DummyError, _test_llm_generate_async, - check_llm_return_context_logits, - check_llm_return_generation_logits, - default_model_name, get_model_path, - llama_7b_multi_lora_test_harness, llama_model_path, - llama_v2_13b_lora_test_harness, llm_check_output, - llm_test_harness, mixtral_model_name, prompts) + from .test_llm import ( + DummyError, DummyExecutorWorker2, _test_llm_generate_async, + check_llm_return_context_logits, check_llm_return_generation_logits, + default_model_name, get_model_path, llama_7b_multi_lora_test_harness, + llama_model_path, llama_v2_7b_prompt_adapter_test_harness, + llama_v2_13b_lora_test_harness, llm_check_output, llm_test_harness, + mixtral_model_name, prompts) except ImportError: - from test_llm import DummyExecutorWorker2 # isort:skip - from test_llm import (DummyError, _test_llm_generate_async, - check_llm_return_context_logits, - check_llm_return_generation_logits, - default_model_name, get_model_path, - llama_7b_multi_lora_test_harness, llama_model_path, - llama_v2_13b_lora_test_harness, llm_check_output, - llm_test_harness, mixtral_model_name, prompts) + from test_llm import ( + DummyError, DummyExecutorWorker2, _test_llm_generate_async, + check_llm_return_context_logits, check_llm_return_generation_logits, + default_model_name, get_model_path, llama_7b_multi_lora_test_harness, + llama_model_path, llama_v2_7b_prompt_adapter_test_harness, + llama_v2_13b_lora_test_harness, llm_check_output, llm_test_harness, + mixtral_model_name, prompts) +# isort: on @pytest.fixture(scope="module") @@ -223,13 +223,19 @@ def test_llama_7b_multi_lora_tp2(): kv_cache_config=global_kv_cache_config) +@skip_single_gpu +def test_llama_v2_7b_prompt_adapter_tp2(): + llama_v2_7b_prompt_adapter_test_harness( + tensor_parallel_size=2, kv_cache_config=global_kv_cache_config) + + @skip_single_gpu def _test_llm_multi_node(engine_from_checkpoint: 
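The new Command-R test above is gated by a `skip_less_than_memory` marker imported from `utils.util`, whose implementation is not part of this patch. A rough equivalent of what such a marker could look like, shown only to make the 70 GB gate concrete (assumed implementation):

```python
import pytest
import torch


def skip_less_than_memory(required_bytes: int):
    # skip when GPU 0 has less total memory than the test needs
    total = (torch.cuda.get_device_properties(0).total_memory
             if torch.cuda.is_available() else 0)
    return pytest.mark.skipif(
        total < required_bytes,
        reason=f"needs {required_bytes / 2**30:.0f} GiB of GPU memory")
```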
tempfile.TemporaryDirectory): # TODO[chunweiy]: reactivate this later nworkers = 2 test_case_file = os.path.join(os.path.dirname(__file__), "run_llm.py") os.path.join(os.path.dirname(__file__), "launch.py") - command = f"mpirun --allow-run-as-root -n {nworkers} trtllm-hlapi-launch python3 {test_case_file} --model_dir {engine_from_checkpoint.name} --tp_size {nworkers}" + command = f"mpirun --allow-run-as-root -n {nworkers} trtllm-llmapi-launch python3 {test_case_file} --model_dir {engine_from_checkpoint.name} --tp_size {nworkers}" subprocess.run(command, shell=True, check=True, env=os.environ) # nosec B603 @@ -403,7 +409,8 @@ async def task(): asyncio.run(task()) -def test_executor_handle_background_error_in_worker(): +# TODO[chunweiy]: This test is not stable, need to investigate +def _test_executor_handle_background_error_in_worker(): llm = LLM(model=llama_model_path, executor_cls=DummyExecutor2, kv_cache_config=global_kv_cache_config) diff --git a/tests/hlapi/test_llm_perf_evaluator.py b/tests/llmapi/test_llm_perf_evaluator.py similarity index 89% rename from tests/hlapi/test_llm_perf_evaluator.py rename to tests/llmapi/test_llm_perf_evaluator.py index a2422add9..ee0956c86 100644 --- a/tests/hlapi/test_llm_perf_evaluator.py +++ b/tests/llmapi/test_llm_perf_evaluator.py @@ -5,9 +5,9 @@ import time from pathlib import Path -from tensorrt_llm.hlapi import BuildConfig, KvCacheConfig -from tensorrt_llm.hlapi._perf_evaluator import (LLMPerfEvaluator, - MemoryContinuousMonitorThread) +from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig +from tensorrt_llm.llmapi._perf_evaluator import (LLMPerfEvaluator, + MemoryContinuousMonitorThread) sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import force_ampere diff --git a/tests/hlapi/test_llm_quant.py b/tests/llmapi/test_llm_quant.py similarity index 92% rename from tests/hlapi/test_llm_quant.py rename to tests/llmapi/test_llm_quant.py index d7edb3eef..5208a9e19 100644 --- a/tests/hlapi/test_llm_quant.py +++ b/tests/llmapi/test_llm_quant.py @@ -1,8 +1,8 @@ import os import sys -from tensorrt_llm.hlapi.llm import LLM, SamplingParams -from tensorrt_llm.hlapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig +from tensorrt_llm.llmapi.llm import LLM, SamplingParams +from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import skip_pre_ampere, skip_pre_hopper diff --git a/tests/hlapi/test_llm_utils.py b/tests/llmapi/test_llm_utils.py similarity index 99% rename from tests/hlapi/test_llm_utils.py rename to tests/llmapi/test_llm_utils.py index 733b424f4..d01a4d645 100644 --- a/tests/hlapi/test_llm_utils.py +++ b/tests/llmapi/test_llm_utils.py @@ -4,7 +4,7 @@ import pytest from tensorrt_llm.builder import PluginConfig -from tensorrt_llm.hlapi.llm_utils import * +from tensorrt_llm.llmapi.llm_utils import * try: from test_llm import llama_model_path diff --git a/tests/hlapi/test_mpi_session.py b/tests/llmapi/test_mpi_session.py similarity index 90% rename from tests/hlapi/test_mpi_session.py rename to tests/llmapi/test_mpi_session.py index 58d5eae10..c763ed0d4 100644 --- a/tests/hlapi/test_mpi_session.py +++ b/tests/llmapi/test_mpi_session.py @@ -4,7 +4,7 @@ import pytest from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE -from tensorrt_llm.hlapi.mpi_session import MPINodeState +from tensorrt_llm.llmapi.mpi_session import MPINodeState def task0(): @@ -16,7 +16,7 @@ def task0(): @pytest.mark.skipif(not 
ENABLE_MULTI_DEVICE, reason="multi-device required") def test_mpi_session_basic(): - from tensorrt_llm.hlapi.mpi_session import MpiPoolSession + from tensorrt_llm.llmapi.mpi_session import MpiPoolSession n_workers = 4 executor = MpiPoolSession(n_workers) diff --git a/tests/model/eagle/test_sample_accept_draft_tokens_plugin.py b/tests/model/eagle/test_sample_accept_draft_tokens_plugin.py new file mode 100644 index 000000000..3377f40c4 --- /dev/null +++ b/tests/model/eagle/test_sample_accept_draft_tokens_plugin.py @@ -0,0 +1,364 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import unittest + +import tensorrt as trt +import torch +from parameterized import parameterized + +import tensorrt_llm +import tensorrt_llm.models.eagle +from tensorrt_llm import Tensor +from tensorrt_llm.models.eagle.model import TreeParams + +sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)) +from utils.util import create_session, run_session, unittest_name_func + + +class TestEagleSampleAcceptDraftTokensPlugin(unittest.TestCase): + + def setUp(self): + tensorrt_llm.logger.set_level('warning') + + +######################################################################################################################## + + def load_test_cases(): + test_cases = [] + ################# CASE 0 ########################## + # BS=1, greedy sampling, gen request + # 7 draft tokens + logits = torch.tensor( + [ + [0, -100, -100, -100, -100, -100, -100, -100 + ], # t0: Top1 id = 0 + [-100, 0, -100, -100, -100, -100, -100, -100 + ], # t1: Top1 id = 1 + [-100, -100, 0, -100, -100, -100, -100, -100 + ], # t2: Top1 id = 2 + [-100, -100, -100, 0, -100, -100, -100, -100 + ], # t3: Top1 id = 3 + [-100, -100, -100, 0, -100, -100, -100, -100 + ], # t4: Top1 id = 3 + [-100, -100, 0, -100, -100, -100, -100, -100 + ], # t5: Top1 id = 2 + [-100, 0, -100, -100, -100, -100, -100, -100 + ], # t6: Top1 id = 1 + [0, -100, -100, -100, -100, -100, -100, -100] # t7: Top1 id = 0 + ], + dtype=torch.float32, + device="cuda") + draft_tokens = torch.tensor([[0, 1, 2, 3, 4, 5, 6]], + dtype=torch.int32, + device="cuda") + draft_lens = torch.tensor([7], dtype=torch.int32, device="cuda") + eagle_temperature = torch.tensor([0.0], + dtype=torch.float, + device="cuda") + rand_data_validation = torch.tensor([[0.0]], + dtype=torch.float, + device="cuda") + paths = torch.tensor( + [[ + [0, 1, 2, 3], # Draft seq [0, 1, 2], Target seq [0, 1, 2, 3] + [0, 1, 2, 4], # Draft seq [0, 1, 3], Target seq [0, 1, 2, 3] + [0, 5, -1, -1], # Draft seq [4], Target seq [0, 2] + [0, 6, 7, -1], # Draft seq [5, 6], Target seq [0, 1, 0] + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ]], + dtype=torch.int32, + device="cuda") + greedy_sampling = True + ref_accepted_tokens = torch.tensor([[0, 1, 2, 3]], + dtype=torch.int32, + device="cuda") + 
ref_num_accepted_tokens = torch.tensor([4], + dtype=torch.int32, + device="cuda") + ref_accepted_paths = torch.tensor([0], dtype=torch.int32, device="cuda") + ref_last_accepted_tokens = torch.tensor([3], + dtype=torch.int32, + device="cuda") + ref_cum_last_accepted_idxs = torch.tensor([3], + dtype=torch.int32, + device="cuda") + + test_cases += [[ + logits, draft_tokens, draft_lens, eagle_temperature, + rand_data_validation, paths, greedy_sampling, ref_accepted_tokens, + ref_num_accepted_tokens, ref_accepted_paths, + ref_last_accepted_tokens, ref_cum_last_accepted_idxs + ]] + + ################# CASE 1 ########################## + # BS=1, greedy sampling, gen request + # 7 draft tokens + paths = torch.tensor( + [[ + [0, 4, 5, -1], # Draft seq [3, 4], Target seq [0, 3, 2] + [0, 1, -1, -1], # Draft seq [0], Target seq [0, 1, 2] + [0, 1, 2, -1], # Draft seq [0, 1], Target seq [0, 1, 2] + [0, 6, 7, -1], # Draft seq [5, 6], Target seq [0, 1, 0] + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ]], + dtype=torch.int32, + device="cuda") + greedy_sampling = True + ref_accepted_tokens = torch.tensor([[0, 1, 2, -1]], + dtype=torch.int32, + device="cuda") + ref_num_accepted_tokens = torch.tensor([3], + dtype=torch.int32, + device="cuda") + ref_accepted_paths = torch.tensor([2], dtype=torch.int32, device="cuda") + ref_last_accepted_tokens = torch.tensor([2], + dtype=torch.int32, + device="cuda") + ref_cum_last_accepted_idxs = torch.tensor([2], + dtype=torch.int32, + device="cuda") + + test_cases += [[ + logits, draft_tokens, draft_lens, eagle_temperature, + rand_data_validation, paths, greedy_sampling, ref_accepted_tokens, + ref_num_accepted_tokens, ref_accepted_paths, + ref_last_accepted_tokens, ref_cum_last_accepted_idxs + ]] + + ################# CASE 2 ########################## + # BS=2, greedy sampling, gen request + # 3 draft tokens + draft_tokens = torch.tensor([[0, 1, -1, -1], [2, 3, 4, 5]], + dtype=torch.int32, + device="cuda") + draft_lens = torch.tensor([2, 4], dtype=torch.int32, device="cuda") + eagle_temperature = torch.tensor([0.0, 0.0], + dtype=torch.float, + device="cuda") + rand_data_validation = torch.tensor([[0.0], [0.0]], + dtype=torch.float, + device="cuda") + paths = torch.tensor( + [ + [ + [0, 2, -1, -1], # Draft seq [1], Target seq [0, 2] + [0, 1, -1, -1], # Draft seq [0], Target seq [0, 1] + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ], + [ + [0, 1, -1, -1], # Draft seq [2], Target seq [3, 3] + [0, 2, -1, -1], # Draft seq [3], Target seq [3, 2] + [0, 3, 4, -1], # Draft seq [4, 5], Target seq [3, 1, 0] + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ] + ], + dtype=torch.int32, + device="cuda") + greedy_sampling = True + ref_accepted_tokens = torch.tensor([[0, 1, -1, -1], [3, 2, -1, -1]], + dtype=torch.int32, + device="cuda") + ref_num_accepted_tokens = torch.tensor([2, 2], + dtype=torch.int32, + device="cuda") + ref_accepted_paths = torch.tensor([1, 1], + dtype=torch.int32, + device="cuda") + ref_last_accepted_tokens = torch.tensor([1, 2], + dtype=torch.int32, + device="cuda") + ref_cum_last_accepted_idxs = torch.tensor([1, 5], + dtype=torch.int32, + device="cuda") + + test_cases += [[ + logits, draft_tokens, draft_lens, eagle_temperature, + rand_data_validation, paths, greedy_sampling, ref_accepted_tokens, + ref_num_accepted_tokens, ref_accepted_paths, + ref_last_accepted_tokens, ref_cum_last_accepted_idxs + ]] + + ################# CASE 3 ########################## + # BS=2, greedy sampling, 2 ctx request, 1 gen request + draft_tokens 
= torch.tensor( + [[-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1], [0, 1, 2, 3, 4]], + dtype=torch.int32, + device="cuda") + draft_lens = torch.tensor([0, 0, 5], dtype=torch.int32, device="cuda") + eagle_temperature = torch.tensor([0.0, 0.0, 0.0], + dtype=torch.float, + device="cuda") + rand_data_validation = torch.tensor([[0.0], [0.0], [0.0]], + dtype=torch.float, + device="cuda") + paths = torch.tensor( + [ + [ + [0, 1, 2, -1], # Draft seq [], Target seq [0] + [0, 1, 3, -1], # Draft seq [], Target seq [0] + [0, 1, 4, -1], # Draft seq [], Target seq [0] + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ], + [ + [0, 1, 2, -1], # Draft seq [], Target seq [1] + [0, 1, 3, -1], # Draft seq [], Target seq [1] + [0, 1, 4, -1], # Draft seq [], Target seq [1] + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ], + [ + [0, 1, -1, -1], # Draft seq [0], Target seq [2, 3] + [0, 2, -1, -1], # Draft seq [1], Target seq [2, 3] + [0, 3, 4, 5 + ], # Draft seq [2, 3, 4], Target seq [2, 2, 1, 0] + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1] + ] + ], + dtype=torch.int32, + device="cuda") + greedy_sampling = True + ref_accepted_tokens = torch.tensor( + [[0, -1, -1, -1], [1, -1, -1, -1], [2, 2, -1, -1]], + dtype=torch.int32, + device="cuda") + ref_num_accepted_tokens = torch.tensor([1, 1, 2], + dtype=torch.int32, + device="cuda") + ref_accepted_paths = torch.tensor([0, 0, 2], + dtype=torch.int32, + device="cuda") + ref_last_accepted_tokens = torch.tensor([0, 1, 2], + dtype=torch.int32, + device="cuda") + ref_cum_last_accepted_idxs = torch.tensor([0, 1, 5], + dtype=torch.int32, + device="cuda") + + test_cases += [[ + logits, draft_tokens, draft_lens, eagle_temperature, + rand_data_validation, paths, greedy_sampling, ref_accepted_tokens, + ref_num_accepted_tokens, ref_accepted_paths, + ref_last_accepted_tokens, ref_cum_last_accepted_idxs + ]] + return test_cases + + @parameterized.expand(load_test_cases, name_func=unittest_name_func) + def test_sample_accept_draft_tokens_plugin( + self, logits, draft_tokens, draft_lens, eagle_temperature, + rand_data_validation, paths, greedy_sampling, ref_accepted_tokens, + ref_num_accepted_tokens, ref_accepted_paths, + ref_last_accepted_tokens, ref_cum_last_accepted_idxs): + # test data + torch.get_default_device() + torch.set_default_device("cuda") + + # construct trt network + builder = tensorrt_llm.Builder() + network = builder.create_network() + with tensorrt_llm.net_guard(network): + logits_t = Tensor(name='logits', + dtype=tensorrt_llm.torch_dtype_to_trt( + logits.dtype), + shape=logits.shape) + draft_tokens_t = Tensor(name='draft_tokens', + dtype=trt.int32, + shape=draft_tokens.shape) + draft_lens_t = Tensor(name='draft_lens', + dtype=trt.int32, + shape=draft_lens.shape) + eagle_temperature_t = Tensor(name='eagle_temperature', + dtype=trt.float32, + shape=eagle_temperature.shape) + rand_data_validation_t = Tensor(name='rand_data_validation', + dtype=trt.float32, + shape=rand_data_validation.shape) + paths_t = Tensor(name='paths', dtype=trt.int32, shape=paths.shape) + + output = tensorrt_llm.models.eagle.model.eagle_sample_and_accept_draft_plugin( + logits_t, + draft_tokens_t, + draft_lens_t, + eagle_temperature_t, + rand_data_validation_t, + TreeParams(paths=paths_t), + greedy_sampling=greedy_sampling) + accepted_tokens, num_accepted_tokens, accepted_paths, \ + last_accepted_tokens, cum_last_accepted_idxs, next_draft_tokens, next_draft_lens = output + + accepted_tokens.mark_output('accepted_tokens') + 
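The expected outputs in the EAGLE test cases above all follow the same greedy acceptance rule: walk each candidate path through the draft tree, keep accepting as long as the draft token at a node matches the target model's greedy token at the node's parent, and pick the path with the longest accepted prefix. A small host-side sketch of that rule (illustrative only; it ignores batching and any non-greedy validation path):

```python
import torch


def greedy_accept(logits, draft_tokens, paths):
    """logits: [num_nodes, vocab] target logits (node 0 = last accepted token),
    draft_tokens: [num_nodes - 1] draft token proposed at node i + 1,
    paths: [num_paths, max_len] node indices padded with -1."""
    target = logits.argmax(dim=-1)  # greedy target token at every tree node
    best = (1, [target[0].item()], 0)  # (accepted length, tokens, path index)
    for pi, path in enumerate(paths.tolist()):
        if path[0] < 0:
            continue  # padded (unused) path
        accepted = [target[path[0]].item()]
        for parent, node in zip(path, path[1:]):
            if node < 0 or draft_tokens[node - 1] != target[parent]:
                break  # first mismatch (or end of path) stops acceptance
            accepted.append(target[node].item())
        if len(accepted) > best[0]:
            best = (len(accepted), accepted, pi)
    return best
```

Fed the CASE 1 logits, draft tokens and paths above (for the single request in the batch), this returns an accepted length of 3, tokens [0, 1, 2] and path index 2, in line with the reference tensors.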
num_accepted_tokens.mark_output('num_accepted_tokens') + accepted_paths.mark_output('accepted_paths') + last_accepted_tokens.mark_output('last_accepted_tokens') + cum_last_accepted_idxs.mark_output('cum_last_accepted_idxs') + next_draft_tokens.mark_output('next_draft_tokens') + next_draft_lens.mark_output('next_draft_lens') + + # trt run + session = create_session(builder, network, precision='float32') + inputs = { + "logits": logits, + "draft_tokens": draft_tokens, + "draft_lens": draft_lens, + "eagle_temperature": eagle_temperature, + "rand_data_validation": rand_data_validation, + "paths": paths, + } + outputs = run_session(session, inputs) + + batch_size = ref_accepted_tokens.shape[0] + torch.testing.assert_close(ref_num_accepted_tokens, + outputs["num_accepted_tokens"], + rtol=0, + atol=0) + for bi in range(batch_size): + torch.testing.assert_close( + ref_accepted_tokens[bi][:ref_num_accepted_tokens[bi]], + outputs["accepted_tokens"][bi][:ref_num_accepted_tokens[bi]], + rtol=0, + atol=0) + torch.testing.assert_close(ref_accepted_paths, + outputs["accepted_paths"], + rtol=0, + atol=0) + torch.testing.assert_close(ref_last_accepted_tokens, + outputs["last_accepted_tokens"], + rtol=0, + atol=0) + torch.testing.assert_close(ref_cum_last_accepted_idxs, + outputs["cum_last_accepted_idxs"], + rtol=0, + atol=0) + + self.assertEqual(outputs["next_draft_tokens"].shape, draft_tokens.shape) + self.assertEqual(outputs["next_draft_lens"].shape, draft_lens.shape) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/model/test_falcon.py b/tests/model/test_falcon.py index 324427f3b..9566bac7f 100644 --- a/tests/model/test_falcon.py +++ b/tests/model/test_falcon.py @@ -54,6 +54,7 @@ def generate_hf_model(self, num_kv_heads: Optional[int] = None, use_alibi: bool = True, parallel_attention: bool = False, + num_ln_in_parallel_attn: int = 2, new_decoder_architecture: bool = False): if isinstance(dtype, str): dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype) @@ -80,6 +81,7 @@ def generate_hf_model(self, new_decoder_architecture=new_decoder_architecture, multi_query=multi_query, parallel_attn=parallel_attention, + num_ln_in_parallel_attn=num_ln_in_parallel_attn, num_kv_heads=num_kv_heads, pad_token_id=1, eos_token_id=0, @@ -185,20 +187,25 @@ def generate_trtllm_runtime(self, def load_test_cases(): test_cases = [ # TC for Falcon-1B arch: MHA + ALiBi - ('MHA', True, False, False, False, False, True, False, + ('MHA', True, False, 1, False, False, False, True, False, ContextFMHAType.disabled, 'float16'), - ('MHA', True, False, False, False, False, True, False, + ('MHA', True, False, 1, False, False, False, True, False, ContextFMHAType.disabled, 'float32'), # TC for Falcon-7B arch: MQA + RoPE + parallel_attention - ('MQA', False, True, False, False, True, True, False, + ('MQA', False, True, 1, False, False, True, True, False, ContextFMHAType.disabled, 'float16'), - ('MQA', False, True, False, False, True, True, False, + ('MQA', False, True, 1, False, False, True, True, False, ContextFMHAType.disabled, 'float32'), # TC for Falcon-40B arch: GQA + RoPE + parallel_attention + new_decoder_architecture - ('GQA', False, True, True, False, True, True, False, + ('GQA', False, True, 2, True, False, True, True, False, ContextFMHAType.disabled, 'float16'), - ('GQA', False, True, True, False, True, True, False, + ('GQA', False, True, 2, True, False, True, True, False, ContextFMHAType.disabled, 'float32'), + # TC for Falcon2-11B arch: GQA + RoPE + parallel_attention (1 or 2 layernorm) + 
new_decoder_architecture + ('GQA', False, True, 1, True, False, True, True, False, + ContextFMHAType.disabled, 'float32'), + ('GQA', False, True, 2, True, False, True, True, False, + ContextFMHAType.disabled, 'float32') ] return test_cases @@ -245,8 +252,8 @@ def skip_test_case(self, query_type, use_alibi, parallel_attention, @parameterized.expand(load_test_cases(), name_func=unittest_name_func) def test_falcon(self, query_type, use_alibi, parallel_attention, - new_decoder_architecture, use_refit, - use_gpt_attengion_plugin, use_gemm_plugin, + num_ln_in_parallel_attn, new_decoder_architecture, + use_refit, use_gpt_attengion_plugin, use_gemm_plugin, remove_input_padding, context_fmha_type, dtype): self.skip_test_case(query_type, use_alibi, parallel_attention, new_decoder_architecture, use_refit, @@ -266,6 +273,7 @@ def test_falcon(self, query_type, use_alibi, parallel_attention, dtype, use_alibi=use_alibi, parallel_attention=parallel_attention, + num_ln_in_parallel_attn=num_ln_in_parallel_attn, new_decoder_architecture=new_decoder_architecture, query_type=query_type) runtime, _ = self.generate_trtllm_runtime( @@ -480,8 +488,8 @@ def test_falcon(self, query_type, use_alibi, parallel_attention, @parameterized.expand(load_test_cases(), name_func=unittest_name_func) def test_greedy_search(self, query_type, use_alibi, parallel_attention, - new_decoder_architecture, use_refit, - use_gpt_attengion_plugin, use_gemm_plugin, + num_ln_in_parallel_attn, new_decoder_architecture, + use_refit, use_gpt_attengion_plugin, use_gemm_plugin, remove_input_padding, context_fmha_type, dtype): self.skip_test_case(query_type, use_alibi, parallel_attention, @@ -504,6 +512,7 @@ def test_greedy_search(self, query_type, use_alibi, parallel_attention, query_type=query_type, use_alibi=use_alibi, parallel_attention=parallel_attention, + num_ln_in_parallel_attn=num_ln_in_parallel_attn, new_decoder_architecture=new_decoder_architecture) _, engine_buffer = self.generate_trtllm_runtime( model_name=model_name, diff --git a/tests/model/test_mamba.py b/tests/model/test_mamba.py index 60beb572f..d7ae52456 100644 --- a/tests/model/test_mamba.py +++ b/tests/model/test_mamba.py @@ -32,7 +32,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) -from examples.mamba.convert_checkpoint import (convert_from_hf_checkpoint, +from tensorrt_llm.models.mamba.convert import (convert_from_hf_checkpoint, convert_hf_mamba) sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -74,13 +74,12 @@ def _gen_tensorrt_llm_mamba(self, hf_config, hf_path, hf_mamba, load_mode, 'pp_size': 1 }, } + config = tensorrt_llm.models.PretrainedConfig.from_dict(config) if load_mode == 'from_checkpoint': weights = convert_from_hf_checkpoint(mamba_config=config, - model_dir=hf_path, - dtype=dtype) + model_dir=hf_path) else: - weights = convert_hf_mamba(hf_mamba, rank=0, dtype=dtype) - config = tensorrt_llm.models.PretrainedConfig.from_dict(config) + weights = convert_hf_mamba(hf_mamba, dtype=dtype) tensorrt_llm_mamba = tensorrt_llm.models.MambaForCausalLM(config) tensorrt_llm_mamba.load(weights) return tensorrt_llm_mamba diff --git a/tests/quantization/test_smooth_quant_gemm.py b/tests/quantization/test_smooth_quant_gemm.py index c84e21b64..153dc137e 100644 --- a/tests/quantization/test_smooth_quant_gemm.py +++ b/tests/quantization/test_smooth_quant_gemm.py @@ -15,7 +15,7 @@ import os import sys import unittest -from itertools import product +from itertools import chain, product import _utils import numpy as np @@ -37,7 +37,8 @@ class 
TestSmoothQuantGemm(unittest.TestCase): def setUp(self): tensorrt_llm.logger.set_level('error') - def _sq_gemm(self, m, n, k, dtype, per_token_scaling, per_channel_scaling): + def _sq_gemm(self, m, n, k, dtype, per_token_scaling, per_channel_scaling, + use_plugin): # Init operands for multiplication in int32 shape1 = (m, k) mat1 = torch.randint(-128, 128, shape1, dtype=torch.int8) @@ -60,11 +61,11 @@ def _sq_gemm(self, m, n, k, dtype, per_token_scaling, per_channel_scaling): # Create builder builder = tensorrt_llm.Builder() - builder.strongly_typed = False # Test need to run in weekly typed mode # Create empty network network = builder.create_network() # Allow SQ plugin of dtype type - network.plugin_config.smooth_quant_gemm_plugin = dtype + if use_plugin: + network.plugin_config.smooth_quant_gemm_plugin = dtype with tensorrt_llm.net_guard(network): # Init TensorRT-LLM tensor for mat1 x = Tensor(name='x', @@ -75,18 +76,13 @@ def _sq_gemm(self, m, n, k, dtype, per_token_scaling, per_channel_scaling): shape=mat2.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt("int8")) # Init TensorRT-LLM tensor for per token scaling - scale_a = Tensor( - name='scale_a', - shape=scale_a_torch.shape, - dtype=tensorrt_llm._utils.str_dtype_to_trt("float32")) + scale_a = tensorrt_llm.functional.constant(scale_a_torch.numpy()) # Init TensorRT-LLM tensor for per channel scaling - scale_b = Tensor( - name='scale_b', - shape=scale_b_torch.shape, - dtype=tensorrt_llm._utils.str_dtype_to_trt("float32")) + scale_b = tensorrt_llm.functional.constant(scale_b_torch.numpy()) # Get output tensor for SQ gemm output = smooth_quant_gemm(x, y, scale_a, scale_b, - per_token_scaling, per_channel_scaling) + per_token_scaling, per_channel_scaling, + dtype) output.mark_output('output', dtype) # TODO: When dtype=int32, per_token_scaling=False, per_channel_scaling=False, @@ -95,19 +91,14 @@ def _sq_gemm(self, m, n, k, dtype, per_token_scaling, per_channel_scaling): engine = EngineFromNetwork( (builder.trt_builder, network.trt_network), config=CreateConfig( - int8=True, - fp16=(dtype == "float16"), memory_pool_limits={trt.MemoryPoolType.WORKSPACE: 33554432})) # Infer engine with TrtRunner(engine) as runner: - outputs = runner.infer( - feed_dict={ - 'x': mat1.numpy(), - 'y': mat2.numpy(), - 'scale_a': scale_a_torch.numpy(), - 'scale_b': scale_b_torch.numpy() - }) + outputs = runner.infer(feed_dict={ + 'x': mat1.numpy(), + 'y': mat2.numpy(), + }) ref = _utils.gt_matmul_smooth_quant(mat1, mat2, @@ -118,34 +109,25 @@ def _sq_gemm(self, m, n, k, dtype, per_token_scaling, per_channel_scaling): np.testing.assert_allclose(ref.cpu().numpy(), outputs['output']) - @parameterized.expand(product(["float16", "float32", "int32"], - [True, False], [True, False]), + @parameterized.expand(chain( + product(["float16", "float32", "int32"], [True, False], [True, False], + [True]), + product(["float16", "float32"], [True, False], [True, False], [False])), name_func=unittest_name_func) @skip_pre_ampere # SmoothQuant is not supported in pre-Ampere - def test_matmul(self, dtype, per_token_scaling, per_channel_scaling): + def test_matmul(self, dtype, per_token_scaling, per_channel_scaling, + use_plugin): bs = 2 inseq = 16 hidden_size = 768 # qkv_gemm self._sq_gemm(bs * inseq, 3 * hidden_size, hidden_size, dtype, - per_token_scaling, per_channel_scaling) + per_token_scaling, per_channel_scaling, use_plugin) # mlp_gemm_1 self._sq_gemm(bs * inseq, 4 * hidden_size, hidden_size, dtype, - per_channel_scaling, per_token_scaling) - - def 
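Whether or not the SmoothQuant GEMM plugin is enabled (the new `use_plugin` parameter), the test keeps comparing against `_utils.gt_matmul_smooth_quant`: an int32-accumulated int8 matmul rescaled by the per-token and per-channel scales. The math reduces to something like the following sketch (shapes and the weight layout are assumptions for illustration; the real helper may transpose or cast differently):

```python
import torch


def sq_gemm_ref(a_i8, b_i8, scale_a, scale_b):
    # a_i8: [m, k] int8 activations, b_i8: [k, n] int8 weights
    # scale_a: [m, 1] per-token (or [1, 1]); scale_b: [1, n] per-channel (or [1, 1])
    acc = a_i8.to(torch.int32) @ b_i8.to(torch.int32)  # exact int32 accumulation
    return acc.to(torch.float32) * scale_a * scale_b   # dequantize the result


m, n, k = 4, 8, 16
a = torch.randint(-128, 128, (m, k), dtype=torch.int8)
b = torch.randint(-128, 128, (k, n), dtype=torch.int8)
out = sq_gemm_ref(a, b, torch.rand(m, 1), torch.rand(1, n))
assert out.shape == (m, n)
```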
test_sq_matmul_no_plugin(self): - # Create builder - builder = tensorrt_llm.Builder() - # Create empty network - network = builder.create_network() - with tensorrt_llm.net_guard(network): - # SQ Gemm ootb should fail - with self.assertRaisesRegex( - TypeError, - "Smooth Quant GEMM is only supported with plugin"): - smooth_quant_gemm(None, None, None, None, False, False) + per_channel_scaling, per_token_scaling, use_plugin) if __name__ == '__main__': diff --git a/tests/quantization/test_smooth_quant_layer_norm.py b/tests/quantization/test_smooth_quant_layer_norm.py index 25ad2527a..0a189439b 100644 --- a/tests/quantization/test_smooth_quant_layer_norm.py +++ b/tests/quantization/test_smooth_quant_layer_norm.py @@ -40,13 +40,15 @@ def load_test_cases(): ('bfloat16', False, True), ('bfloat16', True, True), ('float32', False, True), ('float32', True, True), ('float16', True, False)] + test_cases = [i + (True, ) for i in test_cases + ] + [i + (False, ) for i in test_cases] return [i + (True, ) for i in test_cases] + [i + (False, ) for i in test_cases] @parameterized.expand(load_test_cases, name_func=unittest_name_func) - def test_smooth_quant_layer_norm_plugin(self, dtype, dynamic_act_scaling, - elementwise_affine, - remove_batch_dim): + def test_smooth_quant_layer_norm(self, dtype, dynamic_act_scaling, + elementwise_affine, remove_batch_dim, + use_plugin): # Skip tests that are not supported in pre-ampere architecture skip_bf16_pre_ampere(dtype) @@ -75,7 +77,8 @@ def test_smooth_quant_layer_norm_plugin(self, dtype, dynamic_act_scaling, # construct trt network builder = tensorrt_llm.Builder() network = builder.create_network() - network.plugin_config.layernorm_quantization_plugin = dtype + if use_plugin: + network.plugin_config.layernorm_quantization_plugin = dtype with tensorrt_llm.net_guard(network): x = Tensor(name='x', shape=x_data.shape, @@ -139,18 +142,6 @@ def cast_to_int8_with_sat(tensor): atol=1e-2, rtol=1e-2) - def test_sq_layer_norm_no_plugin(self): - # Create builder - builder = tensorrt_llm.Builder() - # Create empty network - network = builder.create_network() - with tensorrt_llm.net_guard(network): - # SQ LayerNorm ootb should fail - with self.assertRaisesRegex( - TypeError, - "Smooth Quant Layer Norm is only supported with plugin"): - smooth_quant_layer_norm(None, 0, None, None, None, 0) - if __name__ == '__main__': unittest.main() diff --git a/tests/quantization/test_smooth_quant_rms_norm.py b/tests/quantization/test_smooth_quant_rms_norm.py index 3fa45d98b..21f66d272 100644 --- a/tests/quantization/test_smooth_quant_rms_norm.py +++ b/tests/quantization/test_smooth_quant_rms_norm.py @@ -34,11 +34,16 @@ class TestSmoothQuantRmsNorm(unittest.TestCase): def setUp(self): tensorrt_llm.logger.set_level('error') - @parameterized.expand([('float16', False), ('float16', True), - ('bfloat16', False), ('bfloat16', True), - ('float32', False), ('float32', True)], + @parameterized.expand([('float16', False, True), ('float16', True, True), + ('bfloat16', False, True), ('bfloat16', True, True), + ('float32', False, True), ('float32', True, True), + ('float16', False, False), ('float16', True, False), + ('bfloat16', False, False), + ('bfloat16', True, False), ('float32', False, False), + ('float32', True, False)], name_func=unittest_name_func) - def test_smooth_quant_rms_norm_plugin(self, dtype, dynamic_act_scaling): + def test_smooth_quant_rms_norm(self, dtype, dynamic_act_scaling, + use_plugin): # Skip tests that are not supported in pre-ampere architecture skip_bf16_pre_ampere(dtype) @@ 
-77,7 +82,8 @@ def cast_to_int8_with_sat(tensor): builder = tensorrt_llm.Builder() builder.strongly_typed = False # Test need to run in weekly typed mode network = builder.create_network() - network.plugin_config.rmsnorm_quantization_plugin = dtype + if use_plugin: + network.plugin_config.rmsnorm_quantization_plugin = dtype with tensorrt_llm.net_guard(network): x = Tensor(name='x', shape=x_data.shape, @@ -118,19 +124,6 @@ def cast_to_int8_with_sat(tensor): atol=1e-2, rtol=1e-2) - def test_sq_rms_norm_no_plugin(self): - # Create builder - builder = tensorrt_llm.Builder() - builder.strongly_typed = False # Test need to run in weekly typed mode - # Create empty network - network = builder.create_network() - with tensorrt_llm.net_guard(network): - # SQ Rmsnorm ootb should fail. - with self.assertRaisesRegex( - TypeError, - "Smooth Quant Rms Norm is only supported with plugin"): - smooth_quant_rms_norm(None, 0, None, None, None, 0) - if __name__ == '__main__': unittest.main() diff --git a/tests/quantization/test_weight_only_groupwise_quant_matmul.py b/tests/quantization/test_weight_only_groupwise_quant_matmul.py index 8118e67cd..5cbde9d06 100644 --- a/tests/quantization/test_weight_only_groupwise_quant_matmul.py +++ b/tests/quantization/test_weight_only_groupwise_quant_matmul.py @@ -172,8 +172,10 @@ def _woq_groupwise_matmul(self, num_weights_in_32_bits = 0 if quantized_weight_dtype == torch.int8: num_weights_in_32_bits = 4 + use_int8_weight = 1 elif quantized_weight_dtype == torch.quint4x2: num_weights_in_32_bits = 8 + use_int8_weight = 0 else: assert False, "Unsupported weight dtype." @@ -184,18 +186,23 @@ def _woq_groupwise_matmul(self, dtype=torch.int32, device="cuda") - preprocessor = torch.ops.trtllm.preprocess_weights_for_mixed_gemm - # Weights must be a CPU Tensor - unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8 - unprocessed_weight = unprocessed_int_weight.view(torch.int8) - # Weights must be a CPU Tensor - ref_q_weight = unpacker(unprocessed_weight.cpu()) + if use_w4a8_awq: activation_type = torch.float8_e4m3fn else: activation_type = torch.float16 - # Weights must be a CPU Tensor + + if quantized_weight_dtype == torch.int8: + ref_q_weight = unprocessed_weight + elif quantized_weight_dtype == torch.quint4x2: + # Weights must be a CPU Tensor + unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8 + ref_q_weight = unpacker(unprocessed_weight.cpu()) + else: + assert False, "Unsupported weight dtype." 
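For the new INT8-weight path the reference weight is built the same way as for INT4 (the int8 codes simply skip the unpacking step): the per-group scales are expanded over their rows with `repeat_interleave` and multiplied into the integer codes. A minimal dequantization sketch of that pattern (tensor names and shapes are illustrative):

```python
import torch


def dequantize_groupwise(q_weight, scales, group_size):
    # q_weight: [k, n] integer codes, scales: [ceil(k / group_size), n]
    k = q_weight.shape[0]
    # give every row in a group that group's scale, then trim to k rows
    full_scales = scales.repeat_interleave(group_size, dim=0)[:k, :]
    return q_weight.to(torch.float16) * full_scales


w = torch.randint(-8, 8, (256, 64), dtype=torch.int8)
s = torch.rand(2, 64, dtype=torch.float16)  # 256 rows / group_size 128
ref = dequantize_groupwise(w, s, group_size=128)
assert ref.shape == (256, 64)
```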
+ + preprocessor = torch.ops.trtllm.preprocess_weights_for_mixed_gemm cuda_q_weight = preprocessor(unprocessed_weight.cpu(), quantized_weight_dtype, activation_type).view(activation_dtype) @@ -205,8 +212,11 @@ def _woq_groupwise_matmul(self, ZERO = 2 PRE_QUANT_SCALE = 4 W4A8_AWQ = 8 + INT8_WEIGHT = 16 - quant_algo = use_w4a8_awq * W4A8_AWQ + has_pre_quant * PRE_QUANT_SCALE + has_zero * ZERO + has_bias * BIAS + quant_algo = (use_int8_weight * INT8_WEIGHT + use_w4a8_awq * W4A8_AWQ + + has_pre_quant * PRE_QUANT_SCALE + has_zero * ZERO + + has_bias * BIAS) scale_ref = scale.repeat_interleave(group_size, dim=0)[:k, :] ref_th_weight = ref_q_weight.cuda().to(activation_dtype) * scale_ref @@ -230,6 +240,54 @@ def _woq_groupwise_matmul(self, ref = _utils.woq_groupwise_gt_matmul(activation, ref_th_weight, bias) _utils.woq_assert_near_eq(ref, output, 2) + # test for INT8 weight + @parameterized.expand( + [(1, 1024, 64, 'float16', False, True, True, 64), + (16, 1024, 256, 'float16', False, True, False, 64), + (32, 2048, 384, 'float16', False, False, True, 64), + (64, 2048, 1024, 'float16', False, False, False, 64), + (2, 1024, 128, 'float16', False, True, True, 128), + (8, 1024, 256, 'float16', False, True, False, 128), + (48, 2048, 384, 'float16', False, False, True, 128), + (96, 2048, 1024, 'float16', False, False, False, 128)], + name_func=unittest_name_func) + @skip_pre_ampere_unittest + def test_matmul_int8_input(self, + m, + n, + k, + dtype, + has_pre_quant, + has_zero, + has_bias, + group_size=128): + self._woq_groupwise_matmul(m, n, k, dtype, torch.int8, has_pre_quant, + has_zero, has_bias, group_size) + + @parameterized.expand( + [(1, 1024, 64, 'bfloat16', False, True, True, 64), + (16, 1024, 256, 'bfloat16', False, True, False, 64), + (32, 2048, 384, 'bfloat16', False, False, True, 64), + (64, 2048, 1024, 'bfloat16', False, False, False, 64), + (2, 1024, 128, 'bfloat16', False, True, True, 128), + (8, 1024, 256, 'bfloat16', False, True, False, 128), + (48, 2048, 384, 'bfloat16', False, False, True, 128), + (96, 2048, 1024, 'bfloat16', False, False, False, 128)], + name_func=unittest_name_func) + @skip_pre_ampere_unittest + def test_matmul_bf16_int8_input(self, + m, + n, + k, + dtype, + has_pre_quant, + has_zero, + has_bias, + group_size=128): + self._woq_groupwise_matmul(m, n, k, dtype, torch.int8, has_pre_quant, + has_zero, has_bias, group_size) + + # test for INT4 weight @parameterized.expand( [(1, 1024, 64, 'float16', False, True, True, 64), (16, 1024, 256, 'float16', False, True, False, 64), diff --git a/tests/utils/util.py b/tests/utils/util.py index 8184c9f0f..a54f49d71 100644 --- a/tests/utils/util.py +++ b/tests/utils/util.py @@ -11,7 +11,7 @@ import tensorrt_llm from tensorrt_llm._utils import torch_dtype_to_trt, trt_dtype_to_torch -from tensorrt_llm.hlapi.utils import get_total_gpu_memory +from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import Session, TensorInfo
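The new `INT8_WEIGHT` flag introduced above slots into the same bit-field as the existing options, so `quant_algo` stays a plain integer. Composing it looks like this (the constants mirror the ones defined in the test; the helper function itself is only illustrative):

```python
BIAS, ZERO, PRE_QUANT_SCALE, W4A8_AWQ, INT8_WEIGHT = 1, 2, 4, 8, 16


def make_quant_algo(has_bias, has_zero, has_pre_quant, use_w4a8_awq,
                    use_int8_weight):
    # each feature contributes one bit, mirroring the test's arithmetic
    return (use_int8_weight * INT8_WEIGHT + use_w4a8_awq * W4A8_AWQ +
            has_pre_quant * PRE_QUANT_SCALE + has_zero * ZERO +
            has_bias * BIAS)


# int8 weights + zero points + bias, no pre-quant scale, no W4A8 AWQ
assert make_quant_algo(True, True, False, False, True) == 19  # 16 + 2 + 1
```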