Update TensorRT-LLM (NVIDIA#2110)
kaiyux authored Aug 13, 2024
1 parent be9cd71 commit 74b324f
Showing 327 changed files with 255,842 additions and 9,089 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 332 files
20 changes: 12 additions & 8 deletions README.md
@@ -6,8 +6,8 @@ TensorRT-LLM

[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.4.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.2.0-green)](https://developer.nvidia.com/tensorrt)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.12.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

@@ -17,19 +17,21 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/07/30] Introducing🍊 @SliceXAI ELM Turbo 🤖 train ELM once ⚡ #TensorRT #LLM optimize ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/supercharging-llama-3-1-across-nvidia-platforms)
* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)
<div align="center">
<img src="docs/source/media/picture-07-30-2024.png" width="70%">
<img src="docs/source/media/picture-08-06-2024.png" width="50%">
<div align="left">

* [2024/07/30] Introducing🍊 @SliceXAI ELM Turbo 🤖 train ELM once ⚡ #TensorRT #LLM optimize ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/supercharging-llama-3-1-across-nvidia-platforms)

* [2024/07/23] 👀 @AIatMeta Llama 3.1 405B trained on 16K NVIDIA H100s - inference is #TensorRT #LLM optimized ⚡
🦙 400 tok/s - per node
🦙 37 tok/s - per user
🦙 1 node inference
[➡️ link](https://developer.nvidia.com/blog/supercharging-llama-3-1-across-nvidia-platforms)


* [2024/07/09] Checklist to maximize multi-language performance of @meta #Llama3 with #TensorRT #LLM inference:
✅ MultiLingual
✅ NIM
@@ -50,6 +52,10 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights
* [2024/06/04] ✨ #TensorRT and GeForce #RTX unlock ComfyUI SD superhero powers 🦸⚡ 🎥 Demo: [➡️ link](https://youtu.be/64QEVfbPHyg)
📗 DIY notebook: [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&name=ComfyUI_TensorRT&instance=L4%40g2-standard-4%3Anvidia-l4%3A1&diskStorage=500&cloudID=GCP&baseImage=docker.io%2Fpytorch%2Fpytorch%3A2.2.0-cuda12.1-cudnn8-runtime&ports=ComfUI%3A8188&file=https%3A%2F%2Fgithub.com%2Fbrevdev%2Fnotebooks%2Fblob%2Fmain%2Ftensorrt-comfyui.ipynb&launchableID=env-2hQX3n7ae5mq3NjNZ32DfAG0tJf)

<details close>
<summary>Previous News</summary>


* [2024/05/28] ✨#TensorRT weight stripping for ResNet-50 ✨ ✅+99% compression
✅1 set of weights → ** GPUs\ ✅0 performance loss ✅** models…LLM, CNN, etc
👀 📚 DIY [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&launchableID=env-2h6bym7h5GFNho3vpWQQeUYMwTM&instance=L4%40g6.xlarge&diskStorage=500&cloudID=devplane-brev-1&baseImage=nvcr.io%2Fnvidia%2Ftensorrt%3A24.05-py3&file=https%3A%2F%2Fgithub.com%2FNVIDIA%2FTensorRT%2Fblob%2Frelease%2F10.0%2Fsamples%2Fpython%2Fsample_weight_stripping%2Fnotebooks%2Fweight_stripping.ipynb&name=tensorrt_weight_stripping_resnet50)
@@ -62,8 +68,6 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co

* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)

<details close>
<summary>Previous News</summary>

* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
27 changes: 14 additions & 13 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -154,7 +154,7 @@ struct BenchmarkParams
std::optional<SizeType32> maxBatchSize{std::nullopt};
std::optional<SizeType32> maxNumTokens{std::nullopt};
int randomSeed = 430;
std::optional<int> maxAttentionWindow{std::nullopt};
std::optional<std::vector<int>> maxAttentionWindowVec{std::nullopt};
std::optional<int> sinkTokenLength{std::nullopt};
bool multiBlockMode{false};
bool enableContextFMHAFP32Acc{false};
@@ -803,8 +803,8 @@ class ExecutorServer

texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
benchmarkParams.maxAttentionWindow, benchmarkParams.sinkTokenLength, benchmarkParams.freeGpuMemoryFraction,
benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
std::nullopt, benchmarkParams.loraHostCacheSize);
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(
@@ -1351,7 +1351,7 @@ std::shared_ptr<InferenceRequest> makeRequest(std::uint64_t reqId, Sample const&
if (sample.taskId >= 0)
{
uint64_t taskId = static_cast<uint64_t>(sample.taskId);
request->setLoraTaskId(bufferManager.copyFrom(&taskId, ITensor::makeShape({1}), MemoryType::kPINNED));
request->setLoraTaskId(bufferManager.copyFrom(&taskId, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL));
}
if (loraWeights)
{
@@ -1406,9 +1406,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
{
optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
}
if (benchmarkParams.maxAttentionWindow)
if (benchmarkParams.maxAttentionWindowVec)
{
optionalParams.kvCacheConfig.maxAttentionWindow = benchmarkParams.maxAttentionWindow;
optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
}
if (benchmarkParams.sinkTokenLength)
{
@@ -1442,7 +1442,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
BufferManager bufferManager{std::make_shared<CudaStream>()}; // the stream is not used

ITensor::SharedPtr beamWidthTensor{
bufferManager.copyFrom(&beamWidth, ITensor::makeShape({1}), MemoryType::kPINNED)};
bufferManager.copyFrom(&beamWidth, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL)};

// Load dataset
auto const samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
@@ -1455,16 +1455,16 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
waitSleep, staticEmulatedBatchSize, batchTimeout, logIterationData, excludeInputInOutput);

ITensor::SharedPtr eosIdTensor{
eosId ? bufferManager.copyFrom(&eosId.value(), ITensor::makeShape({1}), MemoryType::kPINNED) : nullptr};
eosId ? bufferManager.copyFrom(&eosId.value(), ITensor::makeShape({1}), MemoryType::kPINNEDPOOL) : nullptr};
ITensor::SharedPtr padIdTensor{
padId ? bufferManager.copyFrom(&padId.value(), ITensor::makeShape({1}), MemoryType::kPINNED) : nullptr};
padId ? bufferManager.copyFrom(&padId.value(), ITensor::makeShape({1}), MemoryType::kPINNEDPOOL) : nullptr};

ITensor::SharedPtr returnContextLogitsFlagTensor{returnContextLogits
? bufferManager.copyFrom(&returnContextLogits, ITensor::makeShape({1}), MemoryType::kPINNED)
? bufferManager.copyFrom(&returnContextLogits, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL)
: nullptr};

ITensor::SharedPtr returnGenerationLogitsFlagTensor{returnGenerationLogits
? bufferManager.copyFrom(&returnGenerationLogits, ITensor::makeShape({1}), MemoryType::kPINNED)
? bufferManager.copyFrom(&returnGenerationLogits, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL)
: nullptr};

if (worldConfig.getRank() == 0)
@@ -1816,7 +1816,8 @@ int main(int argc, char* argv[])
"eos_id", "Specify the end-of-sequence token id.", cxxopts::value<TokenIdType>()->default_value("-1"));
options.add_options()("pad_id", "Specify the padding token id.", cxxopts::value<TokenIdType>());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
options.add_options()("max_attention_window", "Max KV cache length per sequence", cxxopts::value<int>());
options.add_options()(
"max_attention_window", "Max KV cache length per sequence", cxxopts::value<std::vector<int>>());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
options.add_options()(
"random_seed", "integer random seed for exponential time delays.", cxxopts::value<int>()->default_value("420"));
@@ -1961,7 +1962,7 @@ int main(int argc, char* argv[])
// Argument: Max KV cache length
if (result.count("max_attention_window"))
{
benchmarkParams.maxAttentionWindow = result["max_attention_window"].as<int>();
benchmarkParams.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}

// Argument: Sink token length
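The option change above means `--max_attention_window` now takes a list of window sizes rather than a single integer. Below is a minimal, hypothetical standalone sketch of that parsing pattern (the program name and output are illustrative, not part of the benchmark; it assumes stock cxxopts behavior, where `std::vector` values are split on commas, e.g. `--max_attention_window 2048,4096,4096`):

```cpp
// Sketch only: illustrates parsing a vector-valued cxxopts option,
// mirroring the declaration used in gptManagerBenchmark.cpp above.
#include <cxxopts.hpp>
#include <iostream>
#include <vector>

int main(int argc, char* argv[])
{
    cxxopts::Options options("window_demo", "Parse per-layer max attention windows");
    options.add_options()(
        "max_attention_window", "Max KV cache length per sequence", cxxopts::value<std::vector<int>>());

    auto result = options.parse(argc, argv);
    if (result.count("max_attention_window"))
    {
        // Each entry is one attention-window size; several can be passed in one flag.
        for (int window : result["max_attention_window"].as<std::vector<int>>())
        {
            std::cout << "window: " << window << '\n';
        }
    }
    return 0;
}
```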
5 changes: 3 additions & 2 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])

options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
options.add_options()(
"max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
// Argument: Max KV Cache Length
if (result.count("max_attention_window"))
{
sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}
// Argument: Sink token length
if (result.count("sink_token_len"))
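For readers tracking the config change in both benchmarks, here is a minimal sketch of the shape change itself: a single optional window becomes an optional per-layer vector. `KvCacheConfigSketch` is a hypothetical stand-in, not the real TensorRT-LLM `KvCacheConfig`, whose exact fields and headers may differ:

```cpp
// Sketch only: shows the data-shape change from one optional window size
// to an optional vector of per-layer window sizes.
#include <optional>
#include <vector>

struct KvCacheConfigSketch
{
    // Old form was: std::optional<int> maxAttentionWindow{std::nullopt};
    std::optional<std::vector<int>> maxAttentionWindowVec{std::nullopt};
    std::optional<int> sinkTokenLength{std::nullopt};
};

int main()
{
    KvCacheConfigSketch config;
    // e.g. a model that alternates sliding-window and full-attention layers
    config.maxAttentionWindowVec = std::vector<int>{1024, 4096, 1024, 4096};
    return config.maxAttentionWindowVec.has_value() ? 0 : 1;
}
```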
13 changes: 10 additions & 3 deletions benchmarks/python/base_benchmark.py
@@ -57,6 +57,12 @@ def serialize_engine(engine, path):
logger.info(f'Engine serialized. Total time: {t}')


def get_last_path_component(path):
normalized_path = os.path.normpath(path)
last_component = os.path.basename(normalized_path)
return last_component


class BaseBenchmark(object):

def __init__(self, engine_dir, model_name, dtype, rank, world_size):
@@ -144,7 +150,7 @@ def __init__(self, engine_dir, model_name, dtype, rank, world_size):

def get_report_dict(self, benchmark_profiler=None):
report_fields = [
"model_name",
"engine_dir",
"world_size",
"num_heads",
"num_kv_heads",
@@ -165,7 +171,7 @@ def get_report_dict(self, benchmark_profiler=None):
"compute_cap",
]
report_dict = OrderedDict.fromkeys(report_fields)
report_dict["model_name"] = self.model_name
report_dict["engine_dir"] = get_last_path_component(self.engine_dir)
report_dict["world_size"] = self.world_size
report_dict["precision"] = self.dtype
report_dict["quantization"] = str(self.quant_mode)
@@ -174,7 +180,8 @@ def get_csv_filename(self):

def get_csv_filename(self):
if len(self.csv_filename) == 0:
self.csv_filename = get_csv_filename(self.model_name,
self.csv_filename = get_csv_filename(get_last_path_component(
self.engine_dir),
self.dtype,
self.world_size,
fp8linear=int(self.enable_fp8))
8 changes: 3 additions & 5 deletions benchmarks/python/benchmark.py
@@ -192,7 +192,6 @@ def main(args):
raise Exception(
f"--gpu_weights_percent only accepts values between 0.0 and 1.0."
)
args.weight_streaming = any([p != 1 for p in gpu_weights_percents])

rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()
@@ -225,10 +224,9 @@ def main(args):
benchmark_profiler=benchmark_profiler)
for config in benchmarker.get_config():
try:
if args.weight_streaming:
# We pass in config instead of the gpu_weights_percent here to keep this benchmark script
# agnostic to the length and contents of the config.
benchmarker.set_weight_streaming(config)
# We pass in config instead of the gpu_weights_percent here to keep this benchmark script
# agnostic to the length and contents of the config.
benchmarker.set_weight_streaming(config)
inputs = benchmarker.prepare_inputs(config)
except torch.cuda.OutOfMemoryError as e:
logger.error(