From 74b324f6673d1d8a836e05e506dea2234b22ccc8 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Tue, 13 Aug 2024 22:34:33 +0800
Subject: [PATCH] Update TensorRT-LLM (#2110)
---
3rdparty/cutlass | 2 +-
README.md | 20 +-
benchmarks/cpp/gptManagerBenchmark.cpp | 27 +-
benchmarks/cpp/gptSessionBenchmark.cpp | 5 +-
benchmarks/python/base_benchmark.py | 13 +-
benchmarks/python/benchmark.py | 8 +-
benchmarks/python/enc_dec_benchmark.py | 77 +-
benchmarks/python/gpt_benchmark.py | 11 +-
.../tensorrt_llm/batch_manager/GptManager.h | 2 +-
.../batch_manager/kvCacheConfig.h | 10 +-
.../batch_manager/kvCacheManager.h | 23 +-
.../tensorrt_llm/batch_manager/llmRequest.h | 4 +-
cpp/include/tensorrt_llm/executor/executor.h | 20 +-
cpp/include/tensorrt_llm/executor/types.h | 1 +
.../tensorrt_llm/runtime/decodingInput.h | 8 +-
.../tensorrt_llm/runtime/decodingOutput.h | 8 +-
cpp/include/tensorrt_llm/runtime/gptDecoder.h | 24 +-
.../tensorrt_llm/runtime/gptDecoderBatched.h | 49 +-
cpp/include/tensorrt_llm/runtime/gptSession.h | 9 +-
cpp/include/tensorrt_llm/runtime/iBuffer.h | 9 +-
.../tensorrt_llm/runtime/iGptDecoderBatched.h | 10 +-
.../runtime/iStatefulGptDecoder.h | 5 +-
.../tensorrt_llm/runtime/memoryCounters.h | 24 +-
.../tensorrt_llm/runtime/modelConfig.h | 92 +-
cpp/include/tensorrt_llm/runtime/rawEngine.h | 15 +-
.../tensorrt_llm/runtime/worldConfig.h | 5 +
.../mixtureOfExpertsBackendBenchmarkFixture.h | 10 +-
.../libtensorrt_llm_batch_manager_static.a | 4 +-
...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +-
.../aarch64-linux-gnu/version.txt | 6 +-
.../libtensorrt_llm_batch_manager_static.a | 4 +-
...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +-
.../tensorrt_llm_batch_manager_static.lib | 4 +-
cpp/tensorrt_llm/common/safetensors.cpp | 168 +
cpp/tensorrt_llm/common/safetensors.h | 61 +
.../cutlass_extensions/compute_occupancy.h | 11 +
.../collective/epilogue_moe_finalize.hpp | 44 +-
..._mma_gated_tma_gmma_ss_warpspecialized.hpp | 3 +-
..._gated_tma_gmma_ss_warpspecialized_fp8.hpp | 3 +-
.../gemm/kernel/fpA_intB_gemm.h | 1 -
.../gemm/kernel/fused_moe_kernel.cuh | 9 +-
.../gemm/kernel/fused_moe_kernel_routine.cuh | 35 +-
.../gemm/kernel/fused_moe_kernel_traits.cuh | 2 +
.../gemm/kernel/moe_cutlass_kernel.h | 17 +-
..._gated_tma_warpspecialized_cooperative.hpp | 2 +-
...emm_gated_tma_warpspecialized_pingpong.hpp | 2 +-
.../libtensorrt_llm_executor_static.a | 4 +-
...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +-
.../executor/aarch64-linux-gnu/version.txt | 6 +-
.../libtensorrt_llm_executor_static.a | 4 +-
...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +-
.../tensorrt_llm_executor_static.lib | 4 +-
.../beamSearchKernelsTemplate.h | 6 +-
.../cubin/fmha_cubin.h | 114 +
...e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp | 7765 +++++
...e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp | 11040 +++++++
...p32_128_128_S_q_paged_kv_32_sm89.cubin.cpp | 13803 +++++++++
...p32_128_128_S_q_paged_kv_40_sm89.cubin.cpp | 17579 +++++++++++
...p32_128_128_S_q_paged_kv_64_sm89.cubin.cpp | 18560 ++++++++++++
..._e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp | 17899 ++++++++++++
..._e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp | 17515 +++++++++++
..._e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp | 23989 ++++++++++++++++
..._e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp | 4000 +++
...fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp | 6336 ++++
...fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp | 6485 +++++
...fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp | 8939 ++++++
...fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp | 9483 ++++++
...fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp | 11541 ++++++++
..._fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp | 6112 ++++
..._fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp | 6251 ++++
...n_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp | 6421 +++++
...n_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp | 6603 +++++
...n_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp | 8693 ++++++
...n_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp | 9397 ++++++
...n_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp | 11520 ++++++++
...on_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp | 6101 ++++
...on_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp | 6251 ++++
.../fmhaRunner.cpp | 8 +-
.../fp8_rowwise_gemm_kernel_template_sm90.h | 12 +-
.../launchers/fused_moe_gemm_launcher_sm80.h | 4 +-
.../fused_moe_gemm_launcher_sm80.inl | 6 +-
.../moe_gemm/moe_gemm_hopper_input.cu | 2 +-
.../moe_gemm/moe_gemm_kernels.h | 11 +-
.../moe_gemm/moe_gemm_kernels_template.h | 255 +-
.../python/generate_kernels.py | 2 +-
.../libtensorrt_llm_nvrtc_wrapper.so | 4 +-
.../aarch64-linux-gnu/version.txt | 4 +-
.../libtensorrt_llm_nvrtc_wrapper.so | 4 +-
.../tensorrt_llm_nvrtc_wrapper.dll | 2 +-
cpp/tensorrt_llm/kernels/decodingCommon.cu | 12 +-
cpp/tensorrt_llm/kernels/decodingKernels.cu | 139 +-
cpp/tensorrt_llm/kernels/decodingKernels.h | 21 +
cpp/tensorrt_llm/kernels/kvCacheUtils.h | 35 +-
cpp/tensorrt_llm/kernels/lora/lora.cpp | 335 +
cpp/tensorrt_llm/kernels/lora/lora.h | 68 +
.../kernels/mixtureOfExperts/moe_kernels.cu | 501 +-
.../kernels/mixtureOfExperts/moe_kernels.h | 126 +-
cpp/tensorrt_llm/kernels/penaltyKernels.cu | 4 +-
.../kernels/samplingAirTopPKernels.cu | 4 +-
.../kernels/samplingTopKKernels.cu | 4 +-
.../kernels/samplingTopPKernels.cu | 8 +-
cpp/tensorrt_llm/kernels/selectiveScan.cu | 3 +-
.../kernels/selectiveScan/chunkcumsum.h | 8 +
.../kernels/selectiveScan/chunkscan.h | 324 +-
.../kernels/selectiveScan/chunkstate.h | 13 +-
.../explicitDraftTokensKernels.cu | 59 +-
.../explicitDraftTokensKernels.h | 34 +-
.../externalDraftTokensKernels.cu | 40 +-
.../externalDraftTokensKernels.h | 12 +
.../medusaDecodingKernels.cu | 4 +-
.../kernels/stopCriteriaKernels.cu | 6 +-
.../unfusedAttentionKernels_2_template.h | 14 +-
cpp/tensorrt_llm/layers/banWordsLayer.cpp | 14 +-
cpp/tensorrt_llm/layers/beamSearchLayer.cu | 14 +-
cpp/tensorrt_llm/layers/decodingLayer.cpp | 4 +-
cpp/tensorrt_llm/layers/decodingParams.h | 27 +-
.../layers/dynamicDecodeLayer.cpp | 11 +-
.../layers/explicitDraftTokensLayer.cpp | 15 +-
cpp/tensorrt_llm/layers/layerUtils.h | 11 +-
.../layers/lookaheadDecodingLayer.cpp | 6 +-
.../layers/medusaDecodingLayer.cpp | 12 +-
cpp/tensorrt_llm/layers/medusaDecodingLayer.h | 2 +-
cpp/tensorrt_llm/layers/penaltyLayer.cpp | 21 +-
cpp/tensorrt_llm/layers/samplingLayer.cpp | 4 +-
cpp/tensorrt_llm/layers/stopCriteriaLayer.cpp | 2 +-
cpp/tensorrt_llm/layers/topKSamplingLayer.cu | 8 +-
cpp/tensorrt_llm/layers/topKSamplingLayer.h | 2 +-
cpp/tensorrt_llm/layers/topPSamplingLayer.cu | 22 +-
cpp/tensorrt_llm/layers/topPSamplingLayer.h | 3 +-
.../gptAttentionCommon/gptAttentionCommon.cpp | 33 +-
.../gptAttentionPlugin/gptAttentionPlugin.cpp | 10 +-
.../plugins/loraPlugin/loraPlugin.cpp | 401 +-
.../plugins/loraPlugin/loraPlugin.h | 22 +-
.../mixtureOfExpertsPlugin.cpp | 309 +-
.../mixtureOfExperts/mixtureOfExpertsPlugin.h | 108 +-
.../pybind/batch_manager/gptManager.cpp | 20 +-
cpp/tensorrt_llm/pybind/bindings.cpp | 22 +-
cpp/tensorrt_llm/pybind/executor/bindings.cpp | 32 +-
cpp/tensorrt_llm/runtime/bufferManager.cpp | 2 +
cpp/tensorrt_llm/runtime/generationConfig.cpp | 8 +-
cpp/tensorrt_llm/runtime/generationConfig.h | 9 +-
cpp/tensorrt_llm/runtime/gptDecoder.cpp | 34 +-
.../runtime/gptDecoderBatched.cpp | 85 +-
cpp/tensorrt_llm/runtime/gptJsonConfig.cpp | 21 +-
cpp/tensorrt_llm/runtime/gptSession.cpp | 95 +-
cpp/tensorrt_llm/runtime/iBuffer.cpp | 7 +-
cpp/tensorrt_llm/runtime/iTensor.cpp | 4 +
cpp/tensorrt_llm/runtime/memoryCounters.cpp | 2 +
cpp/tensorrt_llm/runtime/runtimeBuffers.cpp | 15 +-
cpp/tensorrt_llm/runtime/runtimeBuffers.h | 4 +-
.../runtime/statefulGptDecoder.cpp | 36 +-
cpp/tensorrt_llm/runtime/statefulGptDecoder.h | 9 +-
cpp/tensorrt_llm/runtime/tllmBuffers.h | 1 +
cpp/tensorrt_llm/runtime/tllmRuntime.cpp | 34 +-
cpp/tensorrt_llm/runtime/tllmRuntime.h | 5 +
cpp/tensorrt_llm/runtime/torch.h | 2 +-
cpp/tensorrt_llm/runtime/torchUtils.h | 1 +
.../runtime/transformerBuffers.cpp | 38 +-
cpp/tensorrt_llm/runtime/utils/numpyUtils.cpp | 4 +-
.../runtime/utils/sessionUtils.cpp | 3 +
cpp/tensorrt_llm/thop/CMakeLists.txt | 9 +-
cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp | 11 +-
cpp/tensorrt_llm/thop/dynamicDecodeOp.h | 1 +
cpp/tensorrt_llm/thop/redrafterCurandOp.cpp | 146 +
cpp/tests/kernels/decodingKernelTest.cpp | 195 +-
cpp/tests/kernels/mixtureOfExpertsTest.cu | 11 +-
cpp/tests/kernels/ropeTest.cu | 2 +-
.../kernels/sampling/samplingUtilsTest.cu | 29 +-
cpp/tests/layers/baseSamplingLayerTest.cpp | 4 +-
cpp/tests/layers/dynamicDecodeLayerTest.cpp | 6 +-
.../layers/explicitDraftTokensLayerTest.cpp | 98 +-
.../layers/explicitDraftTokensLayerTest.h | 11 +-
.../layers/lookaheadDecodingLayerTest.cpp | 3 +-
cpp/tests/layers/medusaDecodeLayerTest.cpp | 4 +-
.../data/test_model_lora_config.json | 1 +
.../resources/scripts/build_gpt_engines.py | 83 +-
.../scripts/generate_expected_gpt_output.py | 21 +-
cpp/tests/resources/scripts/test_cpp.py | 13 +
cpp/tests/runtime/bufferManagerTest.cpp | 2 +-
cpp/tests/runtime/gptDecoderBatchedTest.cpp | 2 +-
cpp/tests/runtime/gptDecoderTest.cpp | 10 +-
cpp/tests/runtime/gptSessionTest.cpp | 2 +-
cpp/tests/runtime/samplingTest.cpp | 6 +-
docker/Dockerfile.multi | 2 +-
docker/Makefile | 2 +-
docker/common/install_base.sh | 2 +-
docker/common/install_cuda_toolkit.sh | 4 +-
docker/common/install_pytorch.sh | 4 +-
docker/common/install_tensorrt.sh | 31 +-
docs/source/advanced/batch-manager.md | 3 +
docs/source/advanced/gpt-attention.md | 17 +-
docs/source/executor.md | 17 +-
.../installation/build-from-source-windows.md | 6 +-
docs/source/media/picture-07-30-2024.png | Bin 157902 -> 0 bytes
docs/source/media/picture-08-06-2024.png | Bin 0 -> 354043 bytes
docs/source/reference/support-matrix.md | 7 +-
docs/source/release-notes.md | 11 +
examples/baichuan/convert_checkpoint.py | 1211 +-
examples/baichuan/requirements.txt | 2 +-
examples/bloom/requirements.txt | 2 +-
examples/chatglm/README.md | 2 +-
examples/chatglm/convert_checkpoint.py | 1133 +-
examples/chatglm/requirements.txt | 2 +-
examples/dbrx/requirements.txt | 2 +-
examples/enc_dec/run.py | 538 +-
examples/exaone/README.md | 81 +
examples/falcon/requirements.txt | 2 +-
examples/gemma/convert_checkpoint.py | 1115 +-
examples/gemma/requirements.txt | 3 +-
examples/gpt/convert_checkpoint.py | 8 +-
examples/gpt/requirements.txt | 2 +-
examples/gptj/requirements.txt | 2 +-
examples/gptneox/requirements.txt | 2 +-
examples/grok/requirements.txt | 2 +-
examples/high-level-api/requirements.txt | 2 +-
examples/internlm/requirements.txt | 2 +-
examples/jais/requirements.txt | 2 +-
examples/llama/convert_checkpoint.py | 59 +-
examples/llama/requirements.txt | 2 +-
examples/llama/summarize_long.py | 5 +-
examples/mamba/requirements.txt | 2 +-
examples/medusa/requirements.txt | 2 +-
examples/mixtral/requirements.txt | 2 +-
examples/mpt/requirements.txt | 2 +-
examples/multimodal/build_visual_engine.py | 815 +-
examples/multimodal/run.py | 1201 +-
examples/nemotron/requirements.txt | 2 +-
examples/opt/requirements.txt | 2 +-
examples/phi/requirements.txt | 2 +-
examples/quantization/requirements.txt | 2 +-
examples/qwen/convert_checkpoint.py | 1 -
examples/qwen/requirements.txt | 2 +-
examples/qwenvl/requirements.txt | 2 +-
examples/recurrentgemma/requirements.txt | 2 +-
examples/run.py | 4 +-
examples/skywork/requirements.txt | 2 +-
examples/smaug/requirements.txt | 2 +-
examples/summarize.py | 34 +-
examples/utils.py | 10 +-
examples/whisper/requirements.txt | 2 +-
examples/whisper/run.py | 6 +-
requirements-windows.txt | 2 +-
requirements.txt | 9 +-
scripts/build_wheel.py | 19 +-
tensorrt_llm/_common.py | 11 +-
tensorrt_llm/builder.py | 184 +-
tensorrt_llm/commands/build.py | 33 +
tensorrt_llm/executor.py | 29 +-
tensorrt_llm/functional.py | 5 +-
tensorrt_llm/hlapi/llm_utils.py | 46 +-
tensorrt_llm/layers/__init__.py | 5 +-
tensorrt_llm/layers/attention.py | 34 +-
tensorrt_llm/layers/embedding.py | 22 +-
tensorrt_llm/layers/linear.py | 414 +-
tensorrt_llm/layers/moe.py | 176 +-
tensorrt_llm/models/__init__.py | 8 +-
tensorrt_llm/models/baichuan/config.py | 106 +
tensorrt_llm/models/baichuan/convert.py | 1029 +
tensorrt_llm/models/baichuan/model.py | 102 +-
tensorrt_llm/models/bloom/model.py | 9 +-
tensorrt_llm/models/chatglm/config.py | 192 +
tensorrt_llm/models/chatglm/convert.py | 925 +
tensorrt_llm/models/chatglm/model.py | 123 +-
tensorrt_llm/models/falcon/model.py | 37 -
tensorrt_llm/models/gemma/config.py | 172 +
tensorrt_llm/models/gemma/convert.py | 918 +
tensorrt_llm/models/gemma/model.py | 213 +-
tensorrt_llm/models/gemma/smoothquant.py | 59 +-
.../models}/gemma/utils/__init__.py | 0
.../models}/gemma/utils/layers.py | 0
.../models}/gemma/utils/modules.py | 0
.../models}/gemma/utils/params.py | 3 +-
.../gemma/utils/positional_embeddings.py | 0
.../models}/gemma/utils/sampler.py | 0
.../models}/gemma/utils/transformer.py | 0
tensorrt_llm/models/gemma/weight.py | 652 +-
tensorrt_llm/models/generation_mixin.py | 114 +-
tensorrt_llm/models/gpt/convert.py | 3 +-
tensorrt_llm/models/gpt/model.py | 44 +-
tensorrt_llm/models/llama/config.py | 21 +-
tensorrt_llm/models/llama/convert.py | 217 +-
tensorrt_llm/models/llama/model.py | 128 +-
tensorrt_llm/models/mamba/model.py | 13 +-
tensorrt_llm/models/model_weights_loader.py | 318 +
tensorrt_llm/models/modeling_utils.py | 270 +-
tensorrt_llm/models/opt/model.py | 8 +-
tensorrt_llm/models/phi/convert.py | 98 +-
tensorrt_llm/models/phi/model.py | 81 +-
tensorrt_llm/models/phi3/model.py | 79 +-
tensorrt_llm/models/qwen/convert.py | 10 +-
tensorrt_llm/models/qwen/model.py | 45 +-
tensorrt_llm/models/recurrentgemma/model.py | 23 +-
tensorrt_llm/models/redrafter/model.py | 8 +-
tensorrt_llm/network.py | 12 +
tensorrt_llm/parameter.py | 106 +-
tensorrt_llm/plugin/plugin.py | 15 +-
tensorrt_llm/quantization/__init__.py | 5 +-
tensorrt_llm/quantization/functional.py | 70 +
tensorrt_llm/quantization/layers.py | 398 +-
tensorrt_llm/quantization/mode.py | 4 +
tensorrt_llm/quantization/quantize.py | 2 +
.../quantization/quantize_by_modelopt.py | 40 +-
tensorrt_llm/runtime/__init__.py | 4 +
tensorrt_llm/runtime/enc_dec_model_runner.py | 521 +
tensorrt_llm/runtime/generation.py | 356 +-
tensorrt_llm/runtime/model_runner.py | 46 +-
tensorrt_llm/runtime/model_runner_cpp.py | 32 +-
.../runtime/multimodal_model_runner.py | 1095 +
tensorrt_llm/runtime/redrafter_utils.py | 57 +-
tensorrt_llm/runtime/session.py | 23 +-
tensorrt_llm/tools/multimodal_builder.py | 808 +
tensorrt_llm/version.py | 2 +-
.../attention/test_gpt_attention_no_cache.py | 6 +-
tests/bindings/test_bindings_ut.py | 14 +-
tests/bindings/test_executor_bindings.py | 9 +-
tests/hlapi/test_build_cache.py | 2 +-
tests/hlapi/test_executor.py | 20 +-
tests/hlapi/test_llm.py | 3 +-
tests/hlapi/test_llm_models.py | 169 +-
tests/hlapi/test_llm_models_multi_gpu.py | 57 +
tests/hlapi/test_llm_multi_gpu.py | 4 -
tests/hlapi/test_llm_utils.py | 9 +-
tests/model/test_falcon.py | 2 +
tests/test_leak.py | 1 +
tests/test_plugins.py | 2 -
windows/setup_build_env.ps1 | 14 +-
windows/setup_env.ps1 | 4 +-
327 files changed, 255842 insertions(+), 9089 deletions(-)
create mode 100644 cpp/tensorrt_llm/common/safetensors.cpp
create mode 100644 cpp/tensorrt_llm/common/safetensors.h
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp
create mode 100644 cpp/tensorrt_llm/kernels/lora/lora.cpp
create mode 100644 cpp/tensorrt_llm/kernels/lora/lora.h
create mode 100644 cpp/tensorrt_llm/thop/redrafterCurandOp.cpp
delete mode 100644 docs/source/media/picture-07-30-2024.png
create mode 100644 docs/source/media/picture-08-06-2024.png
create mode 100644 examples/exaone/README.md
create mode 100644 tensorrt_llm/models/baichuan/config.py
create mode 100644 tensorrt_llm/models/baichuan/convert.py
create mode 100644 tensorrt_llm/models/chatglm/config.py
create mode 100644 tensorrt_llm/models/chatglm/convert.py
create mode 100644 tensorrt_llm/models/gemma/config.py
create mode 100644 tensorrt_llm/models/gemma/convert.py
rename {examples => tensorrt_llm/models}/gemma/utils/__init__.py (100%)
rename {examples => tensorrt_llm/models}/gemma/utils/layers.py (100%)
rename {examples => tensorrt_llm/models}/gemma/utils/modules.py (100%)
rename {examples => tensorrt_llm/models}/gemma/utils/params.py (98%)
rename {examples => tensorrt_llm/models}/gemma/utils/positional_embeddings.py (100%)
rename {examples => tensorrt_llm/models}/gemma/utils/sampler.py (100%)
rename {examples => tensorrt_llm/models}/gemma/utils/transformer.py (100%)
create mode 100644 tensorrt_llm/models/model_weights_loader.py
create mode 100644 tensorrt_llm/runtime/enc_dec_model_runner.py
create mode 100644 tensorrt_llm/runtime/multimodal_model_runner.py
create mode 100644 tensorrt_llm/tools/multimodal_builder.py
create mode 100644 tests/hlapi/test_llm_models_multi_gpu.py
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index 637b15906..19b4c5e06 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit 637b15906358191cb4238af419d408a65819d7ec
+Subproject commit 19b4c5e065e7e5bbc8082dfc7dbd792bdac850fc
diff --git a/README.md b/README.md
index 26619fc6c..e981b1d7e 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.4.1-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.2.0-green)](https://developer.nvidia.com/tensorrt)
+[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.12.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
@@ -17,19 +17,21 @@ TensorRT-LLM
## Latest News
-* [2024/07/30] Introducing🍊 @SliceXAI ELM Turbo 🤖 train ELM once ⚡ #TensorRT #LLM optimize ☁️ deploy anywhere
-[➡️ link](https://developer.nvidia.com/blog/supercharging-llama-3-1-across-nvidia-platforms)
+* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
+🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)
-
+
+* [2024/07/30] Introducing🍊 @SliceXAI ELM Turbo 🤖 train ELM once ⚡ #TensorRT #LLM optimize ☁️ deploy anywhere
+[➡️ link](https://developer.nvidia.com/blog/supercharging-llama-3-1-across-nvidia-platforms)
+
* [2024/07/23] 👀 @AIatMeta Llama 3.1 405B trained on 16K NVIDIA H100s - inference is #TensorRT #LLM optimized ⚡
🦙 400 tok/s - per node
🦙 37 tok/s - per user
🦙 1 node inference
[➡️ link](https://developer.nvidia.com/blog/supercharging-llama-3-1-across-nvidia-platforms)
-
* [2024/07/09] Checklist to maximize multi-language performance of @meta #Llama3 with #TensorRT #LLM inference:
✅ MultiLingual
✅ NIM
@@ -50,6 +52,10 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights
* [2024/06/04] ✨ #TensorRT and GeForce #RTX unlock ComfyUI SD superhero powers 🦸⚡ 🎥 Demo: [➡️ link](https://youtu.be/64QEVfbPHyg)
📗 DIY notebook: [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&name=ComfyUI_TensorRT&instance=L4%40g2-standard-4%3Anvidia-l4%3A1&diskStorage=500&cloudID=GCP&baseImage=docker.io%2Fpytorch%2Fpytorch%3A2.2.0-cuda12.1-cudnn8-runtime&ports=ComfUI%3A8188&file=https%3A%2F%2Fgithub.com%2Fbrevdev%2Fnotebooks%2Fblob%2Fmain%2Ftensorrt-comfyui.ipynb&launchableID=env-2hQX3n7ae5mq3NjNZ32DfAG0tJf)
+
+Previous News
+
+
* [2024/05/28] ✨#TensorRT weight stripping for ResNet-50 ✨ ✅+99% compression
✅1 set of weights → ** GPUs\ ✅0 performance loss ✅** models…LLM, CNN, etc
👀 📚 DIY [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&launchableID=env-2h6bym7h5GFNho3vpWQQeUYMwTM&instance=L4%40g6.xlarge&diskStorage=500&cloudID=devplane-brev-1&baseImage=nvcr.io%2Fnvidia%2Ftensorrt%3A24.05-py3&file=https%3A%2F%2Fgithub.com%2FNVIDIA%2FTensorRT%2Fblob%2Frelease%2F10.0%2Fsamples%2Fpython%2Fsample_weight_stripping%2Fnotebooks%2Fweight_stripping.ipynb&name=tensorrt_weight_stripping_resnet50)
@@ -62,8 +68,6 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)
-
-Previous News
* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp
index 3e442943c..d3861a2f3 100644
--- a/benchmarks/cpp/gptManagerBenchmark.cpp
+++ b/benchmarks/cpp/gptManagerBenchmark.cpp
@@ -154,7 +154,7 @@ struct BenchmarkParams
std::optional maxBatchSize{std::nullopt};
std::optional maxNumTokens{std::nullopt};
int randomSeed = 430;
-    std::optional<int> maxAttentionWindow{std::nullopt};
+    std::optional<std::vector<int>> maxAttentionWindowVec{std::nullopt};
std::optional sinkTokenLength{std::nullopt};
bool multiBlockMode{false};
bool enableContextFMHAFP32Acc{false};
@@ -803,8 +803,8 @@ class ExecutorServer
texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
- benchmarkParams.maxAttentionWindow, benchmarkParams.sinkTokenLength, benchmarkParams.freeGpuMemoryFraction,
- benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
+ benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
+ benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
std::nullopt, benchmarkParams.loraHostCacheSize);
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(
@@ -1351,7 +1351,7 @@ std::shared_ptr<InferenceRequest> makeRequest(std::uint64_t reqId, Sample const&
if (sample.taskId >= 0)
{
        uint64_t taskId = static_cast<uint64_t>(sample.taskId);
- request->setLoraTaskId(bufferManager.copyFrom(&taskId, ITensor::makeShape({1}), MemoryType::kPINNED));
+ request->setLoraTaskId(bufferManager.copyFrom(&taskId, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL));
}
if (loraWeights)
{
@@ -1406,9 +1406,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
{
optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
}
- if (benchmarkParams.maxAttentionWindow)
+ if (benchmarkParams.maxAttentionWindowVec)
{
- optionalParams.kvCacheConfig.maxAttentionWindow = benchmarkParams.maxAttentionWindow;
+ optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
}
if (benchmarkParams.sinkTokenLength)
{
@@ -1442,7 +1442,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
    BufferManager bufferManager{std::make_shared<CudaStream>()}; // the stream is not used
ITensor::SharedPtr beamWidthTensor{
- bufferManager.copyFrom(&beamWidth, ITensor::makeShape({1}), MemoryType::kPINNED)};
+ bufferManager.copyFrom(&beamWidth, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL)};
// Load dataset
auto const samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
@@ -1455,16 +1455,16 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
waitSleep, staticEmulatedBatchSize, batchTimeout, logIterationData, excludeInputInOutput);
ITensor::SharedPtr eosIdTensor{
- eosId ? bufferManager.copyFrom(&eosId.value(), ITensor::makeShape({1}), MemoryType::kPINNED) : nullptr};
+ eosId ? bufferManager.copyFrom(&eosId.value(), ITensor::makeShape({1}), MemoryType::kPINNEDPOOL) : nullptr};
ITensor::SharedPtr padIdTensor{
- padId ? bufferManager.copyFrom(&padId.value(), ITensor::makeShape({1}), MemoryType::kPINNED) : nullptr};
+ padId ? bufferManager.copyFrom(&padId.value(), ITensor::makeShape({1}), MemoryType::kPINNEDPOOL) : nullptr};
ITensor::SharedPtr returnContextLogitsFlagTensor{returnContextLogits
- ? bufferManager.copyFrom(&returnContextLogits, ITensor::makeShape({1}), MemoryType::kPINNED)
+ ? bufferManager.copyFrom(&returnContextLogits, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL)
: nullptr};
ITensor::SharedPtr returnGenerationLogitsFlagTensor{returnGenerationLogits
- ? bufferManager.copyFrom(&returnGenerationLogits, ITensor::makeShape({1}), MemoryType::kPINNED)
+ ? bufferManager.copyFrom(&returnGenerationLogits, ITensor::makeShape({1}), MemoryType::kPINNEDPOOL)
: nullptr};
if (worldConfig.getRank() == 0)
@@ -1816,7 +1816,8 @@ int main(int argc, char* argv[])
"eos_id", "Specify the end-of-sequence token id.", cxxopts::value()->default_value("-1"));
options.add_options()("pad_id", "Specify the padding token id.", cxxopts::value());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value());
-    options.add_options()("max_attention_window", "Max KV cache length per sequence", cxxopts::value<int>());
+    options.add_options()(
+        "max_attention_window", "Max KV cache length per sequence", cxxopts::value<std::vector<int>>());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value());
options.add_options()(
"random_seed", "integer random seed for exponential time delays.", cxxopts::value()->default_value("420"));
@@ -1961,7 +1962,7 @@ int main(int argc, char* argv[])
// Argument: Max KV cache length
if (result.count("max_attention_window"))
{
-        benchmarkParams.maxAttentionWindow = result["max_attention_window"].as<int>();
+        benchmarkParams.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}
// Argument: Sink token length
diff --git a/benchmarks/cpp/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp
index f29e0a239..0550a7303 100644
--- a/benchmarks/cpp/gptSessionBenchmark.cpp
+++ b/benchmarks/cpp/gptSessionBenchmark.cpp
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])
options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value());
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value());
-    options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
+    options.add_options()(
+        "max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value());
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
// Argument: Max KV Cache Length
if (result.count("max_attention_window"))
{
-        sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
+        sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}
// Argument: Sink token length
if (result.count("sink_token_len"))
diff --git a/benchmarks/python/base_benchmark.py b/benchmarks/python/base_benchmark.py
index d9cb33687..af62155a0 100644
--- a/benchmarks/python/base_benchmark.py
+++ b/benchmarks/python/base_benchmark.py
@@ -57,6 +57,12 @@ def serialize_engine(engine, path):
logger.info(f'Engine serialized. Total time: {t}')
+def get_last_path_component(path):
+ normalized_path = os.path.normpath(path)
+ last_component = os.path.basename(normalized_path)
+ return last_component
+
+
class BaseBenchmark(object):
def __init__(self, engine_dir, model_name, dtype, rank, world_size):
@@ -144,7 +150,7 @@ def __init__(self, engine_dir, model_name, dtype, rank, world_size):
def get_report_dict(self, benchmark_profiler=None):
report_fields = [
- "model_name",
+ "engine_dir",
"world_size",
"num_heads",
"num_kv_heads",
@@ -165,7 +171,7 @@ def get_report_dict(self, benchmark_profiler=None):
"compute_cap",
]
report_dict = OrderedDict.fromkeys(report_fields)
- report_dict["model_name"] = self.model_name
+ report_dict["engine_dir"] = get_last_path_component(self.engine_dir)
report_dict["world_size"] = self.world_size
report_dict["precision"] = self.dtype
report_dict["quantization"] = str(self.quant_mode)
@@ -174,7 +180,8 @@ def get_report_dict(self, benchmark_profiler=None):
def get_csv_filename(self):
if len(self.csv_filename) == 0:
- self.csv_filename = get_csv_filename(self.model_name,
+ self.csv_filename = get_csv_filename(get_last_path_component(
+ self.engine_dir),
self.dtype,
self.world_size,
fp8linear=int(self.enable_fp8))
diff --git a/benchmarks/python/benchmark.py b/benchmarks/python/benchmark.py
index 4d8d2b2ae..8a4e266d5 100644
--- a/benchmarks/python/benchmark.py
+++ b/benchmarks/python/benchmark.py
@@ -192,7 +192,6 @@ def main(args):
raise Exception(
f"--gpu_weights_percent only accepts values between 0.0 and 1.0."
)
- args.weight_streaming = any([p != 1 for p in gpu_weights_percents])
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()
@@ -225,10 +224,9 @@ def main(args):
benchmark_profiler=benchmark_profiler)
for config in benchmarker.get_config():
try:
- if args.weight_streaming:
- # We pass in config instead of the gpu_weights_percent here to keep this benchmark script
- # agnostic to the length and contents of the config.
- benchmarker.set_weight_streaming(config)
+ # We pass in config instead of the gpu_weights_percent here to keep this benchmark script
+ # agnostic to the length and contents of the config.
+ benchmarker.set_weight_streaming(config)
inputs = benchmarker.prepare_inputs(config)
except torch.cuda.OutOfMemoryError as e:
logger.error(
diff --git a/benchmarks/python/enc_dec_benchmark.py b/benchmarks/python/enc_dec_benchmark.py
index 42348732f..aab73ace6 100644
--- a/benchmarks/python/enc_dec_benchmark.py
+++ b/benchmarks/python/enc_dec_benchmark.py
@@ -25,6 +25,7 @@
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime.session import TensorInfo
from tensorrt_llm.runtime import ModelConfig
+from tensorrt_llm.models.modeling_utils import get_kv_cache_type_from_legacy
class EncDecBenchmark(BaseBenchmark):
@@ -56,6 +57,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
if self.engine_dir is not None:
def read_config(component):
+ # almost same as enc_dec_model_runner.py::read_config()
config_path = os.path.join(self.engine_dir, component,
"config.json")
with open(config_path, "r") as f:
@@ -65,12 +67,13 @@ def read_config(component):
plugin_config = builder_config['plugin_config']
pretrained_config = config['pretrained_config']
lora_config = builder_config['lora_config']
- builder_config['auto_parallel_config']
+ auto_parallel_config = builder_config['auto_parallel_config']
use_gpt_attention_plugin = plugin_config["gpt_attention_plugin"]
remove_input_padding = plugin_config["remove_input_padding"]
use_lora_plugin = plugin_config["lora_plugin"]
tp_size = pretrained_config['mapping']['tp_size']
pp_size = pretrained_config['mapping']['pp_size']
+ auto_parallel_config['gpus_per_node']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), \
f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
@@ -98,6 +101,9 @@ def read_config(component):
dtype = pretrained_config["dtype"]
paged_kv_cache = plugin_config['paged_kv_cache']
+ kv_cache_type = get_kv_cache_type_from_legacy(
+ True, paged_kv_cache)
+
tokens_per_block = plugin_config['tokens_per_block']
gather_context_logits = builder_config.get(
@@ -107,11 +113,6 @@ def read_config(component):
max_prompt_embedding_table_size = builder_config.get(
'max_prompt_embedding_table_size', 0)
- self.max_batch_size = config["build_config"]["max_batch_size"]
- self.max_input_len = config["build_config"][
- "max_encoder_input_len"]
- self.max_seq_len = config["build_config"]["max_seq_len"]
-
model_config = ModelConfig(
num_heads=num_heads,
num_kv_heads=num_kv_heads,
@@ -123,7 +124,7 @@ def read_config(component):
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=remove_input_padding,
- paged_kv_cache=paged_kv_cache,
+ kv_cache_type=kv_cache_type,
tokens_per_block=tokens_per_block,
cross_attention=cross_attention,
has_position_embedding=has_position_embedding,
@@ -140,6 +141,15 @@ def read_config(component):
skip_cross_qkv=skip_cross_qkv,
)
+ # additional info for benchmark
+ self.max_batch_size = config["build_config"]["max_batch_size"]
+ self.max_input_len = config["build_config"][
+ "max_encoder_input_len"]
+ self.max_seq_len = config["build_config"]["max_seq_len"]
+ if component == "decoder":
+ self.decoder_start_token_id = pretrained_config[
+ 'decoder_start_token_id']
+
return model_config
self.encoder_model_config = read_config("encoder")
@@ -212,7 +222,7 @@ def get_config(self):
def set_weight_streaming(self, config):
gpu_weights_percent = config[3]
self.encoder_session._set_weight_streaming(gpu_weights_percent)
- self.decoder_session._set_weight_streaming(gpu_weights_percent)
+ self.decoder_session.runtime._set_weight_streaming(gpu_weights_percent)
def prepare_inputs(self, config):
batch_size, encoder_input_len = config[0], config[1]
@@ -234,7 +244,7 @@ def prepare_inputs(self, config):
decoder_input_ids = decoder_input_ids.repeat(
(encoder_input_ids.shape[0], 1))
output_list = [
- TensorInfo('x', str_dtype_to_trt(self.dtype),
+ TensorInfo('input_features', str_dtype_to_trt(self.dtype),
encoder_input_ids.shape),
TensorInfo('input_lengths', str_dtype_to_trt('int32'),
encoder_input_lengths.shape)
@@ -248,8 +258,8 @@ def prepare_inputs(self, config):
}
whisper_decoder_encoder_input_lengths = torch.tensor(
[
- outputs['output'].shape[1]
- for x in range(outputs['output'].shape[0])
+ outputs['encoder_output'].shape[1]
+ for x in range(outputs['encoder_output'].shape[0])
],
dtype=torch.int32,
device='cuda')
@@ -260,28 +270,28 @@ def prepare_inputs(self, config):
],
dtype=torch.int32,
device='cuda')
- cross_attention_mask = torch.ones(
- [outputs['output'].shape[0], 1,
- outputs['output'].shape[1]]).int().cuda()
+ cross_attention_mask = torch.ones([
+ outputs['encoder_output'].shape[0], 1,
+ outputs['encoder_output'].shape[1]
+ ]).int().cuda()
else:
encoder_input_ids = (torch.randint(
100, (batch_size, encoder_input_len)).int().cuda())
- # For now, just hardcode the decoder_start_token_id to 0 for t5 models.
- decoder_start_token_id = 0
- decoder_input_ids = torch.IntTensor([[decoder_start_token_id]
+ decoder_input_ids = torch.IntTensor([[self.decoder_start_token_id]
]).to(self.device)
- decoder_input_ids = decoder_input_ids.repeat(
- (encoder_input_ids.shape[0], 1))
- # in padding mode --> keep input, just calculate actual length and max length
- # Note: 1st token should always count, even if it is pad_token_id (0). e.g., decoder start id in enc-dec models could be a single pad_token_id, we should count
- encoder_input_lengths = ((
- 1 + (encoder_input_ids[:, 1:] != 0).sum(dim=1).type(
- torch.IntTensor).to(self.device)).clone().detach().to(
- dtype=torch.int32, device=self.device))
- decoder_input_lengths = ((
- 1 + (decoder_input_ids[:, 1:] != 0).sum(dim=1).type(
- torch.IntTensor).to(self.device)).clone().detach().to(
- dtype=torch.int32, device=self.device))
+ decoder_input_ids = decoder_input_ids.repeat((batch_size, 1))
+ encoder_input_lengths = torch.tensor([encoder_input_len] *
+ batch_size,
+ dtype=torch.int32,
+ device=self.device)
+ decoder_input_lengths = torch.tensor([1] * batch_size,
+ dtype=torch.int32,
+ device=self.device)
+
+ if self.encoder_model_config.remove_input_padding:
+ encoder_input_ids = torch.flatten(encoder_input_ids)
+ decoder_input_ids = torch.flatten(decoder_input_ids)
+
# attention mask, always set 1 as if all are valid tokens
attention_mask = torch.ones(
(batch_size, encoder_input_len)).int().cuda()
@@ -293,6 +303,9 @@ def prepare_inputs(self, config):
hidden_size = (self.encoder_model_config.hidden_size *
self.world_size) # tp_size
hidden_states_shape = (
+ encoder_input_ids.shape[0],
+ hidden_size,
+ ) if self.encoder_model_config.remove_input_padding else (
encoder_input_ids.shape[0],
encoder_input_ids.shape[1],
hidden_size,
@@ -339,7 +352,7 @@ def run(self, inputs, config, benchmark_profiler=None):
# input tensors
inputs = {}
if 'whisper' in self.model_name:
- inputs['x'] = encoder_input_ids.contiguous()
+ inputs['input_features'] = encoder_input_ids.contiguous()
inputs["input_lengths"] = encoder_input_lengths
else:
inputs["input_ids"] = encoder_input_ids.contiguous()
@@ -369,9 +382,7 @@ def run(self, inputs, config, benchmark_profiler=None):
# run decoder
sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=1, pad_id=0, num_beams=self.num_beams, min_length=output_len)
- encoder_output = outputs[
- 'output'] if 'whisper' in self.model_name else outputs[
- "encoder_output"]
+ encoder_output = outputs["encoder_output"]
encoder_max_input_length = encoder_output.shape[
1] if 'whisper' in self.model_name else torch.max(
encoder_input_lengths).item()
diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py
index 6319a1949..04ba2ab0f 100644
--- a/benchmarks/python/gpt_benchmark.py
+++ b/benchmarks/python/gpt_benchmark.py
@@ -20,6 +20,7 @@
import torch
import tensorrt_llm
+from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.builder import Engine
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
SamplingConfig)
@@ -77,6 +78,13 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
if hasattr(self, item):
rnn_configs_kwargs[item] = getattr(self, item)
+ kv_cache_type = KVCacheType.CONTINUOUS
+ if hasattr(self, 'kv_cache_type'):
+ kv_cache_type = self.kv_cache_type
+ else:
+ if hasattr(self, 'paged_kv_cache'):
+ kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS
+
model_config = tensorrt_llm.runtime.ModelConfig(
max_batch_size=self.max_batch_size,
max_beam_width=self.num_beams,
@@ -86,8 +94,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
num_kv_heads=ceil(self.num_kv_heads / self.world_size),
hidden_size=self.hidden_size // self.world_size,
gpt_attention_plugin=self.use_gpt_attention_plugin,
- paged_kv_cache=self.paged_kv_cache if hasattr(
- self, 'paged_kv_cache') else False,
+ kv_cache_type=kv_cache_type,
paged_state=self.paged_state
if hasattr(self, 'paged_state') else False,
dtype=self.dtype,
diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h
index 21b75d1bc..40034b995 100644
--- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -40,7 +40,7 @@ class TrtGptModel;
/* Responsible for shepherding requests through to completion
using TRT Backend. */
-class GptManager
+class [[deprecated("Use the executor API instead.")]] GptManager
{
public:
using SizeType32 = tensorrt_llm::runtime::SizeType32;
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
index a7f1ba430..0aa80adfe 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -38,12 +38,12 @@ class KvCacheConfig
using SizeType32 = tensorrt_llm::runtime::SizeType32;
    explicit KvCacheConfig(std::optional<SizeType32> maxTokens = std::nullopt,
-        std::optional<SizeType32> maxAttentionWindow = std::nullopt,
+        std::optional<std::vector<SizeType32>> maxAttentionWindowVec = std::nullopt,
        std::optional<SizeType32> sinkTokenLength = std::nullopt,
        std::optional<float> freeGpuMemoryFraction = std::nullopt, bool enableBlockReuse = false, bool useUvm = false,
        std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true)
: maxTokens{maxTokens}
- , maxAttentionWindow{maxAttentionWindow}
+ , maxAttentionWindowVec{maxAttentionWindowVec}
, sinkTokenLength{sinkTokenLength}
, freeGpuMemoryFraction{freeGpuMemoryFraction}
, enableBlockReuse(enableBlockReuse)
@@ -54,7 +54,7 @@ class KvCacheConfig
}
explicit KvCacheConfig(executor::KvCacheConfig const& kvCacheConfig)
- : KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindow(),
+ : KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindowVec(),
kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(),
kvCacheConfig.getEnableBlockReuse(), false, kvCacheConfig.getHostCacheSize(),
kvCacheConfig.getOnboardBlocks())
@@ -63,7 +63,7 @@ class KvCacheConfig
bool operator==(KvCacheConfig const& other) const
{
- return maxTokens == other.maxTokens && maxAttentionWindow == other.maxAttentionWindow
+ return maxTokens == other.maxTokens && maxAttentionWindowVec == other.maxAttentionWindowVec
&& sinkTokenLength == other.sinkTokenLength && freeGpuMemoryFraction == other.freeGpuMemoryFraction
&& enableBlockReuse == other.enableBlockReuse && useUvm == other.useUvm
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
@@ -72,7 +72,7 @@ class KvCacheConfig
friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
    std::optional<SizeType32> maxTokens;
-    std::optional<SizeType32> maxAttentionWindow;
+    std::optional<std::vector<SizeType32>> maxAttentionWindowVec;
    std::optional<SizeType32> sinkTokenLength;
    std::optional<float> freeGpuMemoryFraction;
bool enableBlockReuse;
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
index e1dd38ed0..2699270ac 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -67,6 +67,7 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager
{
class KVCacheBlock;
+class KVCacheManager;
using SizeType32 = tensorrt_llm::runtime::SizeType32;
using TokenIdType = tensorrt_llm::runtime::TokenIdType;
@@ -288,8 +289,8 @@ class BlockManager
void startScheduling();
//! \brief Assign blocks for new sequence. Try to reuse blocks.
-    void addSequence(
-        GenerationRequest& sequence, SizeType32 inputLength, std::shared_ptr<LlmRequest> const& llmRequest);
+    void addSequence(GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks,
+        std::shared_ptr<LlmRequest> const& llmRequest);
//! \brief Assign blocks for new sequence. Does not try to reuse blocks.
void addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx);
@@ -389,6 +390,10 @@ class BlockManager
    //! \details Does nothing if block is already in primary memory.
void onboardBlock(BlockPtr offloadBlock);
+    //! \brief Find first new block that must be allocated for context phase and return its concatenated token vectors.
+ //! \details Only full blocks are considered.
+ VecTokens findNewContextBlock(VecTokens const& inputTokens) const;
+
private:
//! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -405,7 +410,8 @@ class BlockManager
//! \param blockedTokens Tokens of each block.
//! \param sequence Sequence to which blocks are assigned.
//! \return Number of matched tokens from loaded blocks.
-    SizeType32 loadOrAllocateBlocks(std::list<VecTokens> const& blockedTokens, GenerationRequest& sequence);
+    SizeType32 loadOrAllocateBlocks(
+        std::list<VecTokens> const& blockedTokens, SizeType32 numContextBlocks, GenerationRequest& sequence);
//! \brief Find best primary block to free.
//! \details The best primary block to free is the primary block that appears first in the queue and have no primary
@@ -460,6 +466,9 @@ class BlockManager
std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks;
// KV cache type (self or cross)
CacheType mCacheType;
+
+private:
+ friend class KVCacheManager;
};
class KVCacheManager
@@ -611,6 +620,14 @@ class KVCacheManager
return mCacheType == CacheType::kCROSS;
}
+    //! \brief Find first new block that must be allocated for context phase and return its concatenated token vector.
+ //! \details Only full blocks are considered.
+ VecTokens findNewContextBlock(VecTokens const& inputTokens) const;
+
+ //! \brief Store full context blocks contributed by llmRequest.
+ //! \details These blocks become reusable from next step.
+    void storeContextBlocks(SizeType32 seqSlotIdx, std::shared_ptr<LlmRequest> const& llmRequest);
+
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
[[nodiscard]] static SizeType32 getMaxAttentionWindowUpperBound(SizeType32 blocksInPrimaryPool,
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index 6c86b06ad..42536fc33 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -162,7 +162,7 @@ class GenericLlmRequest
, mDecodingIter(0)
, mPriority(req.getPriority())
{
- if (mIsStreaming && mSamplingConfig.beamWidth > 1 && mReturnAllGeneratedTokens == false)
+ if (mIsStreaming && mSamplingConfig.beamWidth > 1 && !mReturnAllGeneratedTokens)
{
TLLM_LOG_WARNING(
"Setting mReturnAllGeneratedTokens to True since streaming AND beam search are done simultaneously. "
@@ -405,7 +405,7 @@ class GenericLlmRequest
}
}
- /// @brief Sets the generated tokens for all beams. Erases all previous generated tokens.
+ /// @brief Sets the generated tokens for all beams after gatherTree. Erases all previous generated tokens.
/// @param generatedBeamTokens The generated tokens for all beams (vector of vector of tokens)
void setGeneratedTokens(BeamTokens const& generatedBeamTokens)
{
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
index 571970471..4ed912100 100644
--- a/cpp/include/tensorrt_llm/executor/executor.h
+++ b/cpp/include/tensorrt_llm/executor/executor.h
@@ -409,14 +409,14 @@ class KvCacheConfig
{
public:
    explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType32> const& maxTokens = std::nullopt,
-        std::optional<SizeType32> const& maxAttentionWindow = std::nullopt,
+        std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
        std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
        std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
        std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true);
[[nodiscard]] bool getEnableBlockReuse() const;
    [[nodiscard]] std::optional<SizeType32> getMaxTokens() const;
-    [[nodiscard]] std::optional<SizeType32> getMaxAttentionWindow() const;
+    [[nodiscard]] std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const;
    [[nodiscard]] std::optional<SizeType32> getSinkTokenLength() const;
    [[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
    [[nodiscard]] std::optional<size_t> getHostCacheSize() const;
@@ -424,7 +424,7 @@ class KvCacheConfig
void setEnableBlockReuse(bool enableBlockReuse);
void setMaxTokens(SizeType32 maxTokens);
- void setMaxAttentionWindow(SizeType32 maxAttentionWindow);
+    void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec);
void setSinkTokenLength(SizeType32 sinkTokenLength);
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
void setHostCacheSize(size_t hostCacheSize);
@@ -442,8 +442,10 @@ class KvCacheConfig
    std::optional<SizeType32> mMaxTokens;
    /// @brief Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence
-    /// will be stored in the KV cache.
-    std::optional<SizeType32> mMaxAttentionWindow;
+    /// will be stored in the KV cache. Different layers may have different max attention window sizes.
+    /// If the number of elements in mMaxAttentionWindowVec is less than the number of layers, mMaxAttentionWindowVec
+    /// will be repeated multiple times to the number of layers.
+    std::optional<std::vector<SizeType32>> mMaxAttentionWindowVec;
    /// @brief Number of sink tokens (tokens to always keep in attention window)
    std::optional<SizeType32> mSinkTokenLength;
@@ -699,8 +701,8 @@ class ExecutorConfig
        std::optional<PeftCacheConfig> const& peftCacheConfig = std::nullopt,
        std::optional<LogitsPostProcessorMap> logitsPostProcessorMap = std::nullopt,
        std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched = std::nullopt,
-        std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1,
-        std::optional<SizeType32> maxQueueSize = std::nullopt,
+        bool replicateLogitsPostProcessor = true, std::optional<DecodingConfig> decodingConfig = std::nullopt,
+        float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt,
ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig());
[[nodiscard]] SizeType32 getMaxBeamWidth() const;
@@ -717,6 +719,7 @@ class ExecutorConfig
    [[nodiscard]] std::optional<PeftCacheConfig> getPeftCacheConfig() const;
    [[nodiscard]] std::optional<LogitsPostProcessorMap> getLogitsPostProcessorMap() const;
    [[nodiscard]] std::optional<LogitsPostProcessorBatched> getLogitsPostProcessorBatched() const;
+    [[nodiscard]] bool getReplicateLogitsPostProcessor() const;
    [[nodiscard]] std::optional<DecodingConfig> getDecodingConfig() const;
    [[nodiscard]] float getGpuWeightsPercent() const;
    [[nodiscard]] std::optional<SizeType32> getMaxQueueSize() const;
@@ -736,6 +739,7 @@ class ExecutorConfig
void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig);
void setLogitsPostProcessorMap(LogitsPostProcessorMap const& logitsPostProcessorMap);
void setLogitsPostProcessorBatched(LogitsPostProcessorBatched const& logitsPostProcessorBatched);
+ void setReplicateLogitsPostProcessor(bool const replicateLogitsPostProcessor);
void setDecodingConfig(DecodingConfig const& decodingConfig);
void setGpuWeightsPercent(float const& gpuWeightsPercent);
    void setMaxQueueSize(std::optional<SizeType32> const& maxQueueSize);
@@ -779,6 +783,8 @@ class ExecutorConfig
    std::optional<PeftCacheConfig> mPeftCacheConfig;
    std::optional<LogitsPostProcessorMap> mLogitsPostProcessorMap;
    std::optional<LogitsPostProcessorBatched> mLogitsPostProcessorBatched;
+ /// @brief If set to true, logits post processor will run on all TP ranks in last PP rank
+ bool mReplicateLogitsPostProcessor;
/// @brief Decoding configuration.
    std::optional<DecodingConfig> mDecodingConfig;
diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h
index 88d56b098..4861a39aa 100644
--- a/cpp/include/tensorrt_llm/executor/types.h
+++ b/cpp/include/tensorrt_llm/executor/types.h
@@ -151,6 +151,7 @@ enum class MemoryType
{
kCPU,
kCPU_PINNED,
+ kCPU_PINNEDPOOL,
kGPU,
kUVM,
kUNKNOWN
diff --git a/cpp/include/tensorrt_llm/runtime/decodingInput.h b/cpp/include/tensorrt_llm/runtime/decodingInput.h
index 52da69e27..6c4d7c805 100644
--- a/cpp/include/tensorrt_llm/runtime/decodingInput.h
+++ b/cpp/include/tensorrt_llm/runtime/decodingInput.h
@@ -35,7 +35,7 @@ class DecodingInput
using TensorPtr = ITensor::SharedPtr;
DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 batchSize,
- TensorPtr logits, TensorPtr endIds)
+ TensorPtr logits, TensorPtr endIds, TensorConstPtr batchSlots)
: step{maxLength}
, maxLength{maxLength}
, maxAttentionWindow{maxAttentionWindow}
@@ -45,6 +45,7 @@ class DecodingInput
, maxBadWordsLen{0}
, logits{std::move(logits)}
, endIds{std::move(endIds)}
+ , batchSlots{std::move(batchSlots)}
{
        TLLM_CHECK_WITH_INFO(static_cast<bool>(this->logits), "Invalid logits tensor");
        TLLM_CHECK_WITH_INFO(static_cast<bool>(this->endIds), "Invalid endIds tensor");
@@ -75,6 +76,9 @@ class DecodingInput
TensorConstPtr endIds; //!< [batchSize * beamWidth], on gpu
+ TensorConstPtr
+ batchSlots; //!< [batchSize], address map of the linear batch id to to the seq slots, int32_t, pinned
+
// optional parameters
TensorConstPtr finished; //!< [batchSize, beamWidth], finished states at current iteration.
//!< If true for some request, the decoding step of it is skipped, on gpu
@@ -89,8 +93,6 @@ class DecodingInput
TensorConstPtr stopWordsPtrs; //!< [batchSize][2, stopWordsLength], on gpu
TensorConstPtr stopWordsLens; //!< [batchSize], on gpu
TensorConstPtr noRepeatNgramSize; //!< [batchSize], on gpu
- TensorConstPtr
- batchSlots; //!< [batchSize], optional, address map of the linear batch id to to the seq slots, int32_t, pinned
// parameters for beam search
TensorPtr cacheIndirection; //!< [batchSize, beamWidth, maxSeqLen] - the k/v cache index for beam search, on gpu
diff --git a/cpp/include/tensorrt_llm/runtime/decodingOutput.h b/cpp/include/tensorrt_llm/runtime/decodingOutput.h
index 8298b07a2..c07ae057b 100644
--- a/cpp/include/tensorrt_llm/runtime/decodingOutput.h
+++ b/cpp/include/tensorrt_llm/runtime/decodingOutput.h
@@ -59,8 +59,9 @@ class DecodingOutput
static float constexpr kNegativeInfinity = -1e20f;
- explicit DecodingOutput(TensorPtr ids)
+ explicit DecodingOutput(TensorPtr ids, TensorPtr gatheredIds)
: ids{std::move(ids)}
+ , gatheredIds{std::move(gatheredIds)}
{
        TLLM_CHECK_WITH_INFO(static_cast<bool>(this->ids), "Invalid ids tensor");
}
@@ -68,6 +69,11 @@ class DecodingOutput
// mandatory parameters
TensorPtr ids; // [BS, BM, MSL], contains previously generated token ids for all
// steps before DecodingInput.step
+
+ TensorPtr gatheredIds; // [BS, BM, MSL], these are the tokens computed during the gatherTree step
+ // When doing beam search and streaming, this second set of tokens is needed
+ // due to the beam search kernels assuming ungathered tokens (stored in `ids`).
+
TensorPtr newTokensSteps; // [maxTokensPerStep, BS, BM] new tokens at each generated token of
// maxTokensPerStep
TensorPtr newTokens; // [BS, BM] usually a view of newTokensSteps for the current token
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h
index 922236523..1753f24c6 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoder.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoder.h
@@ -51,8 +51,7 @@ class IGptDecoder
virtual ~IGptDecoder() = default;
- virtual void setup(SamplingConfig const& samplingConfig, size_t batchSize,
- std::optional const& batchSlots = std::nullopt,
+ virtual void setup(SamplingConfig const& samplingConfig, size_t batchSize, TensorConstPtr const& batchSlots,
std::optional<DecodingOutput> const& output = std::nullopt)
= 0;
@@ -60,8 +59,8 @@ class IGptDecoder
virtual void forwardSync(DecodingOutput& output, DecodingInput const& input) = 0;
- virtual void gatherTree(ITensor& finalOutputIds, DecodingOutput const& decodingOutput,
- DecodingInput const& decodingInput, BufferManager const& manager,
+ virtual void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
+ BufferManager const& manager,
std::optional<std::reference_wrapper<SamplingConfig const>> samplingConfig = std::nullopt)
= 0;
@@ -95,15 +94,14 @@ class GptDecoder : public virtual IGptDecoder
size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const& stream,
std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr);
- void setup(SamplingConfig const& samplingConfig, size_t batchSize,
- std::optional const& batchSlots = std::nullopt,
+ void setup(SamplingConfig const& samplingConfig, size_t batchSize, TensorConstPtr const& batchSlots,
std::optional<DecodingOutput> const& output = std::nullopt) override;
void forwardAsync(DecodingOutput& output, DecodingInput const& input) override;
void forwardSync(DecodingOutput& output, DecodingInput const& input) override;
- void gatherTree(ITensor& finalOutputIds, DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
+ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
BufferManager const& manager,
std::optional<std::reference_wrapper<SamplingConfig const>> samplingConfig = std::nullopt) override;
@@ -143,5 +141,17 @@ inline std::unique_ptr IGptDecoder::create(executor::DecodingMode c
return nullptr;
}
}
+
+/// @brief Helper function to produce batch slots [0, 1, ..., batchSize - 1] for paths that do not explicitly provide
+/// batch slots to the decoder.
+inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots(
+ runtime::SizeType32 batchSize, runtime::BufferManager const& bufferManager)
+{
+ auto defaultBatchSlots = bufferManager.pinnedPool(
+ runtime::ITensor::makeShape({batchSize}), runtime::TRTDataType<runtime::SizeType32>::value);
+ auto range = runtime::BufferRange<runtime::SizeType32>(*defaultBatchSlots);
+ std::iota(range.begin(), range.end(), 0);
+ return defaultBatchSlots;
+}
} // namespace runtime
} // namespace tensorrt_llm
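Note: with batchSlots now mandatory in IGptDecoder::setup, callers that previously relied on the defaulted argument can use the new getDefaultBatchSlots helper. A minimal sketch, assuming a decoder and sampling config created elsewhere; setupWithDefaultSlots is illustrative only.

    // Sketch only: build default batch slots [0 .. batchSize-1] and pass them to setup().
    #include "tensorrt_llm/runtime/bufferManager.h"
    #include "tensorrt_llm/runtime/gptDecoder.h"

    void setupWithDefaultSlots(tensorrt_llm::runtime::IGptDecoder& decoder,
        tensorrt_llm::runtime::SamplingConfig const& samplingConfig, tensorrt_llm::runtime::SizeType32 batchSize,
        tensorrt_llm::runtime::BufferManager const& bufferManager)
    {
        // Pinned-pool tensor filled with 0..batchSize-1 by the helper above.
        auto const batchSlots = tensorrt_llm::runtime::getDefaultBatchSlots(batchSize, bufferManager);
        decoder.setup(samplingConfig, batchSize, batchSlots);
    }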
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
index f3b70349a..bbef3ae7a 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
@@ -79,24 +79,52 @@ class GptDecoderBatched : public IGptDecoderBatched
//! @param batchIdx index of the batch
//! @returns [maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without
- //! padding for request `batchIdx`, on gpu
- [[nodiscard]] TensorPtr getOutputIds(SizeType32 batchIdx) const override
+ //! padding for request `batchIdx`, on gpu. In case of beam search, contains the ungathered data.
+ [[nodiscard]] TensorPtr getIds(SizeType32 batchIdx) const override
{
+ TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto tensor = ITensor::slice(mJointDecodingOutput->ids, batchIdx, 1);
tensor->squeeze(0);
+ TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
return tensor;
}
//! @returns [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token
- //! ids without padding, on gpu
- [[nodiscard]] TensorPtr getOutputIds() const override
+ //! ids without padding, on gpu. In case of beam search, contains the ungathered data.
+ [[nodiscard]] TensorPtr getIds() const override
{
- return ITensor::slice(mJointDecodingOutput->ids, 0, mActualBatchSize);
+ TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
+ auto tensor = ITensor::slice(mJointDecodingOutput->ids, 0, mActualBatchSize);
+ TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
+ return tensor;
+ }
+
+ //! @param batchIdx index of the batch
+ //! @returns [maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains
+ //! gathered token ids without padding for request `batchIdx`, on gpu.
+ [[nodiscard]] TensorPtr getGatheredIds(SizeType32 batchIdx) const override
+ {
+ TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
+ auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, batchIdx, 1);
+ tensor->squeeze(0);
+ TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
+ return tensor;
+ }
+
+ //! @returns [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains
+ //! gathered token ids without padding, on gpu
+ [[nodiscard]] TensorPtr getGatheredIds() const override
+ {
+ TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
+ auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, 0, mActualBatchSize);
+ TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
+ return tensor;
}
//! @brief Gather final beam search results for request `batchSlot`.
//! Result will only be available after event returned.
- [[nodiscard]] CudaEvent finalize(SizeType32 batchSlot, SamplingConfig const& samplingConfig) const override;
+ [[nodiscard]] CudaEvent finalize(
+ SizeType32 batchSlot, SamplingConfig const& samplingConfig, bool streaming) const override;
//! @brief Gather final beam search results for all requests.
void finalize(SamplingConfig const& samplingConfig) const override;
@@ -202,7 +230,8 @@ class GptDecoderBatched : public IGptDecoderBatched
private:
//! @brief Gather final beam search results for request `batchIdx`.
- [[nodiscard]] CudaEvent postProcessRequest(SizeType32 batchIdx, SamplingConfig const& samplingConfig) const;
+ [[nodiscard]] CudaEvent postProcessRequest(
+ SizeType32 batchIdx, SamplingConfig const& samplingConfig, bool streaming) const;
//! @brief Initialize the decoder at `batchSlot` with a new `request`.
void newRequest(SizeType32 batchSlot, decoder_batch::Request const& request, SamplingConfig const& samplingConfig);
@@ -299,5 +328,11 @@ class GptDecoderBatched : public IGptDecoderBatched
SpeculativeDecodingMode mSpeculativeDecodingMode;
executor::DecodingMode mDecodingMode{executor::DecodingMode::Auto()};
+
+ // temporary buffers for the beam search + streaming case
+ std::shared_ptr<DecodingOutput::BeamHypotheses> mOutputBeamHypotheses{nullptr};
+ // will store a slice of DecodingOutput::cumLogProbs
+ DecodingOutput::TensorPtr mCumLogProbsTmp;
+ SizeType32 mNumSMs;
};
} // namespace tensorrt_llm::runtime
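Note: finalize() now takes a streaming flag and the gathered tokens are exposed through getGatheredIds(). A minimal sketch of the beam-search + streaming path, assuming CudaEvent::synchronize() is available as in the rest of the runtime; gatherStreamingBeams is illustrative only.

    // Sketch only: gather beam-search results for one request while streaming and
    // read them back once the returned event has completed.
    #include "tensorrt_llm/runtime/gptDecoderBatched.h"

    tensorrt_llm::runtime::ITensor::SharedPtr gatherStreamingBeams(
        tensorrt_llm::runtime::GptDecoderBatched const& decoder, tensorrt_llm::runtime::SizeType32 batchSlot,
        tensorrt_llm::runtime::SamplingConfig const& samplingConfig)
    {
        // finalize() launches gatherTree for this slot; the event marks its completion.
        auto event = decoder.finalize(batchSlot, samplingConfig, /*streaming=*/true);
        event.synchronize(); // assumed CudaEvent API: wait before touching gatheredIds
        return decoder.getGatheredIds(batchSlot);
    }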
diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h
index 11289d5a2..46cd19902 100644
--- a/cpp/include/tensorrt_llm/runtime/gptSession.h
+++ b/cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -115,6 +115,7 @@ class [[deprecated("Use the executor API instead.")]] GptSession
std::optional<SizeType32> genMicroBatchSize = std::nullopt;
std::optional<executor::DecodingMode> decodingMode = std::nullopt;
bool normalizeLogProbs = true;
+ std::optional enginePath;
};
//! @brief Optional profiler class to profile the generation phase of an inference request
@@ -178,10 +179,7 @@ class [[deprecated("Use the executor API instead.")]] GptSession
}
GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig,
- std::string const& engineFile, LoggerPtr logger = nullptr)
- : GptSession(sessionConfig, modelConfig, worldConfig, RawEngine(engineFile), std::move(logger))
- {
- }
+ std::string const& engineFile, LoggerPtr logger = nullptr);
[[nodiscard]] nvinfer1::ILogger& getLogger() const;
@@ -291,6 +289,8 @@ class [[deprecated("Use the executor API instead.")]] GptSession
TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput& outputs);
+ bool shouldUseKVCacheManager() const;
+
class CudaGraphExecutor
{
public:
@@ -369,6 +369,7 @@ class [[deprecated("Use the executor API instead.")]] GptSession
std::shared_ptr<AllReduceBuffers> mAllReduceBuffers;
SizeType32 mDecoderMaxSequenceLength{};
+ std::vector<SizeType32> mDecoderMaxAttentionWindowVec{};
SizeType32 mDecoderMaxAttentionWindow{};
SizeType32 mDecoderSinkTokenLength{};
diff --git a/cpp/include/tensorrt_llm/runtime/iBuffer.h b/cpp/include/tensorrt_llm/runtime/iBuffer.h
index 2b6d2ca98..46fb3972e 100644
--- a/cpp/include/tensorrt_llm/runtime/iBuffer.h
+++ b/cpp/include/tensorrt_llm/runtime/iBuffer.h
@@ -46,7 +46,8 @@ enum class MemoryType : std::int32_t
kGPU = 0,
kCPU = 1,
kPINNED = 2,
- kUVM = 3
+ kUVM = 3,
+ kPINNEDPOOL = 4
};
template <MemoryType T>
@@ -78,6 +79,12 @@ struct MemoryTypeString
static auto constexpr value = "UVM";
};
+template <>
+struct MemoryTypeString<MemoryType::kPINNEDPOOL>
+{
+ static auto constexpr value = "PINNEDPOOL";
+};
+
//! \brief For converting a TensorRT data type to a C++ data type.
template
struct DataTypeTraits
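Note: the new kPINNEDPOOL enumerator gets its own MemoryTypeString specialization so the memory type can be named at compile time. A minimal sketch of how such a trait is typically consumed; logMemoryType is illustrative only.

    // Sketch only: MemoryTypeString<T>::value yields a printable name for a MemoryType enumerator.
    #include "tensorrt_llm/runtime/iBuffer.h"
    #include <cstddef>
    #include <cstdio>

    template <tensorrt_llm::runtime::MemoryType T>
    void logMemoryType(std::size_t bytes)
    {
        using tensorrt_llm::runtime::MemoryTypeString;
        std::printf("allocated %zu bytes of %s memory\n", bytes, MemoryTypeString<T>::value);
    }

    // e.g. logMemoryType<tensorrt_llm::runtime::MemoryType::kPINNEDPOOL>(1024);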
diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
index ec772cd7b..4495c102a 100644
--- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
@@ -120,11 +120,17 @@ class IGptDecoderBatched : public virtual IStatefulGptDecoder
//! @param batchIdx index of the batch
//! @returns [maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token
//! ids without padding for request `batchIdx`, on gpu
- [[nodiscard]] virtual TensorPtr getOutputIds(SizeType32 batchIdx) const = 0;
+ [[nodiscard]] virtual TensorPtr getIds(SizeType32 batchIdx) const = 0;
+
+ //! @returns [maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search in
+ //! GptDecoderBatched. It contains gathered token ids without padding for request `batchIdx`, on gpu.
+ [[nodiscard]] virtual TensorPtr getGatheredIds(SizeType32 batchIdx) const = 0;
//! @brief Gather final beam search results for request `batchIdx`.
//! Result will only be available after event returned
- [[nodiscard]] virtual CudaEvent finalize(SizeType32 batchIdx, SamplingConfig const& samplingConfig) const = 0;
+ [[nodiscard]] virtual CudaEvent finalize(
+ SizeType32 batchIdx, SamplingConfig const& samplingConfig, bool streaming) const
+ = 0;
//! @returns [batchSize (actual)], marks finished requests (per batch)
[[nodiscard]] virtual std::vector<bool> getFinished() const = 0;
diff --git a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
index 0339feb5b..f5e0f142d 100644
--- a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
+++ b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
@@ -101,7 +101,10 @@ class IStatefulGptDecoder
virtual void finalize(SamplingConfig const& samplingConfig) const = 0;
//! @returns [batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
- [[nodiscard]] virtual TensorPtr getOutputIds() const = 0;
+ [[nodiscard]] virtual TensorPtr getIds() const = 0;
+
+ //! @returns [batchSize, beamWidth, maxSequenceLength] token ids after gatherTree
+ [[nodiscard]] virtual TensorPtr getGatheredIds() const = 0;
//! @returns [batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
[[nodiscard]] virtual TensorPtr getCumLogProbs() const = 0;
diff --git a/cpp/include/tensorrt_llm/runtime/memoryCounters.h b/cpp/include/tensorrt_llm/runtime/memoryCounters.h
index e4725f851..42ccdc13d 100644
--- a/cpp/include/tensorrt_llm/runtime/memoryCounters.h
+++ b/cpp/include/tensorrt_llm/runtime/memoryCounters.h
@@ -54,6 +54,11 @@ class MemoryCounters
return mUVM;
}
+ [[nodiscard]] SizeType32 getPinnedPool() const
+ {
+ return mPinnedPool;
+ }
+
[[nodiscard]] DiffType getGpuDiff() const
{
return mGpuDiff;
@@ -74,6 +79,11 @@ class MemoryCounters
return mUVMDiff;
}
+ [[nodiscard]] DiffType getPinnedPoolDiff() const
+ {
+ return mPinnedPoolDiff;
+ }
+
template <MemoryType T>
void allocate(SizeType32 size)
{
@@ -98,6 +108,11 @@ class MemoryCounters
mUVM += size;
mUVMDiff = sizeDiff;
}
+ else if constexpr (T == MemoryType::kPINNEDPOOL)
+ {
+ mPinnedPool += size;
+ mPinnedPoolDiff = sizeDiff;
+ }
else
{
TLLM_THROW("Unknown memory type: %s", MemoryTypeString::value);
@@ -130,6 +145,11 @@ class MemoryCounters
mUVM -= size;
mUVMDiff = sizeDiff;
}
+ else if constexpr (T == MemoryType::kPINNEDPOOL)
+ {
+ mPinnedPool -= size;
+ mPinnedPoolDiff = sizeDiff;
+ }
else
{
TLLM_THROW("Unknown memory type: %s", MemoryTypeString::value);
@@ -147,8 +167,8 @@ class MemoryCounters
[[nodiscard]] std::string toString() const;
private:
- std::atomic<SizeType32> mGpu{}, mCpu{}, mPinned{}, mUVM{};
- std::atomic<DiffType> mGpuDiff{}, mCpuDiff{}, mPinnedDiff{}, mUVMDiff{};
+ std::atomic<SizeType32> mGpu{}, mCpu{}, mPinned{}, mUVM{}, mPinnedPool{};
+ std::atomic<DiffType> mGpuDiff{}, mCpuDiff{}, mPinnedDiff{}, mUVMDiff{}, mPinnedPoolDiff{};
};
} // namespace tensorrt_llm::runtime
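Note: MemoryCounters dispatches on the memory type with if constexpr, so the new kPINNEDPOOL enumerator only needs the two branches added above. A minimal sketch of accounting a pinned-pool allocation, assuming the usual MemoryCounters::getInstance() accessor and a deallocate counterpart to allocate; trackPinnedPoolBuffer is illustrative only.

    // Sketch only: record a pinned-pool allocation and its matching release in the counters.
    #include "tensorrt_llm/runtime/memoryCounters.h"
    #include <cstddef>

    void trackPinnedPoolBuffer(std::size_t bytes)
    {
        using tensorrt_llm::runtime::MemoryCounters;
        using tensorrt_llm::runtime::MemoryType;

        auto& counters = MemoryCounters::getInstance(); // assumed singleton accessor
        counters.allocate<MemoryType::kPINNEDPOOL>(bytes);   // hits the kPINNEDPOOL branch added above
        counters.deallocate<MemoryType::kPINNEDPOOL>(bytes); // mirrored in the deallocation chain
    }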
diff --git a/cpp/include/tensorrt_llm/runtime/modelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h
index aa2585078..3736dc4b0 100644
--- a/cpp/include/tensorrt_llm/runtime/modelConfig.h
+++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -62,6 +62,41 @@ class ModelConfig
kRECURRENT,
};
+ enum class KVCacheType : std::int32_t
+ {
+ kCONTINUOUS,
+ kPAGED,
+ kDISABLED,
+ };
+
+ static KVCacheType KVCacheTypeFromString(std::string value)
+ {
+ std::transform(value.begin(), value.end(), value.begin(), ::toupper);
+
+ if (value == "CONTINUOUS")
+ {
+ return KVCacheType::kCONTINUOUS;
+ }
+ else if (value == "PAGED")
+ {
+ return KVCacheType::kPAGED;
+ }
+ else if (value == "DISABLED")
+ {
+ return KVCacheType::kDISABLED;
+ }
+ else
+ {
+ throw std::invalid_argument("Invalid KV cache type: " + value);
+ }
+ }
+
+ enum class ManageWeightsType : std::int32_t
+ {
+ kDisabled,
+ kEnabled,
+ };
+
explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads,
SizeType32 hiddenSize, nvinfer1::DataType dtype)
: mVocabSize(vocabSize)
@@ -75,8 +110,6 @@ class ModelConfig
, mUseGptAttentionPlugin(false)
, mUseMambaConv1dPlugin(false)
, mInputPacked{false}
- , mPagedKvCache{false}
- , mPagedState{false}
, mTokensPerBlock{64}
, mQuantMode{common::QuantMode::none()}
, mMaxBatchSize(0)
@@ -99,6 +132,7 @@ class ModelConfig
, mSpeculativeDecodingMode(SpeculativeDecodingMode::None())
, mLogitsDtype(nvinfer1::DataType::kFLOAT)
, mUseShapeInference(true)
+ , mManageWeightsType(ManageWeightsType::kDisabled)
{
}
@@ -202,16 +236,6 @@ class ModelConfig
mInputPacked = inputPacked;
}
- [[nodiscard]] bool constexpr usePagedKvCache() const noexcept
- {
- return mPagedKvCache;
- }
-
- void constexpr usePagedKvCache(bool pagedKvCache) noexcept
- {
- mPagedKvCache = pagedKvCache;
- }
-
[[nodiscard]] bool constexpr usePagedState() const noexcept
{
return mPagedState;
@@ -244,7 +268,8 @@ class ModelConfig
[[nodiscard]] bool constexpr supportsInflightBatching() const noexcept
{
- return (isTransformerBased() && mUseGptAttentionPlugin && mInputPacked && mPagedKvCache)
+ return (isTransformerBased() && mUseGptAttentionPlugin && mInputPacked
+ && (mKVCacheType == KVCacheType::kDISABLED || mKVCacheType == KVCacheType::kPAGED))
|| (isRnnBased() && mUseMambaConv1dPlugin && mInputPacked && mPagedState);
}
@@ -423,6 +448,32 @@ class ModelConfig
mMlpHiddenSize = mlpHiddenSize;
}
+ // Utility functions for fast KVCacheType checking.
+ [[nodiscard]] bool constexpr isKVCacheEnabled() const noexcept
+ {
+ return mKVCacheType != KVCacheType::kDISABLED;
+ }
+
+ [[nodiscard]] bool constexpr isPagedKVCache() const noexcept
+ {
+ return mKVCacheType == KVCacheType::kPAGED;
+ }
+
+ [[nodiscard]] bool constexpr isContinuousKVCache() const noexcept
+ {
+ return mKVCacheType == KVCacheType::kCONTINUOUS;
+ }
+
+ [[nodiscard]] KVCacheType constexpr getKVCacheType() const noexcept
+ {
+ return mKVCacheType;
+ }
+
+ void constexpr setKVCacheType(KVCacheType kvCacheType) noexcept
+ {
+ mKVCacheType = kvCacheType;
+ }
+
[[nodiscard]] bool constexpr useCrossAttention() const noexcept
{
return mUseCrossAttention;
@@ -574,6 +625,16 @@ class ModelConfig
return mUseShapeInference;
}
+ [[nodiscard]] ManageWeightsType getManageWeightsType() const noexcept
+ {
+ return mManageWeightsType;
+ }
+
+ void setManageWeightsType(const ManageWeightsType manageWeightType) noexcept
+ {
+ mManageWeightsType = manageWeightType;
+ }
+
private:
SizeType32 mVocabSize;
SizeType32 mNbAttentionLayers;
@@ -586,7 +647,6 @@ class ModelConfig
bool mUseGptAttentionPlugin;
bool mUseMambaConv1dPlugin;
bool mInputPacked;
- bool mPagedKvCache;
bool mPagedState;
SizeType32 mTokensPerBlock;
common::QuantMode mQuantMode;
@@ -613,6 +673,9 @@ class ModelConfig
std::optional<RnnConfig> mRnnConfig;
+ // Whether the KV cache is enabled. When the KV cache is disabled, the model is currently intended for the context phase only.
+ KVCacheType mKVCacheType = KVCacheType::kCONTINUOUS;
+
// Configs related to encoder / enc-dec models
SizeType32 mMaxEncoderLen{};
SizeType32 mEncoderHiddenSize{};
@@ -628,6 +691,7 @@ class ModelConfig
// Logits datatype
nvinfer1::DataType mLogitsDtype;
bool mUseShapeInference;
+ ManageWeightsType mManageWeightsType;
};
} // namespace tensorrt_llm::runtime
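Note: the boolean paged-KV flag is replaced by the KVCacheType enum. A minimal sketch of parsing the type from a CLI-style string and querying the new predicates; configureKvCache is illustrative only.

    // Sketch only: select the KV cache type on a ModelConfig and query the convenience predicates.
    #include "tensorrt_llm/runtime/modelConfig.h"
    #include <string>

    bool configureKvCache(tensorrt_llm::runtime::ModelConfig& modelConfig, std::string const& kvCacheTypeArg)
    {
        using tensorrt_llm::runtime::ModelConfig;

        // Accepts "continuous", "paged" or "disabled" (case-insensitive) and throws otherwise.
        modelConfig.setKVCacheType(ModelConfig::KVCacheTypeFromString(kvCacheTypeArg));

        // These predicates replace the removed usePagedKvCache() accessor.
        return modelConfig.isKVCacheEnabled() && modelConfig.isPagedKVCache();
    }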
diff --git a/cpp/include/tensorrt_llm/runtime/rawEngine.h b/cpp/include/tensorrt_llm/runtime/rawEngine.h
index 6a6e7eb5a..2c1c5ba22 100644
--- a/cpp/include/tensorrt_llm/runtime/rawEngine.h
+++ b/cpp/include/tensorrt_llm/runtime/rawEngine.h
@@ -20,6 +20,7 @@
#include
#include
+#include <optional>
namespace tensorrt_llm::runtime
{
@@ -60,10 +61,20 @@ class RawEngine
[[nodiscard]] std::filesystem::path getPath() const
{
- TLLM_CHECK(mType == FilePath);
+ TLLM_CHECK(mEnginePath.has_value());
+ return mEnginePath.value();
+ }
+
+ [[nodiscard]] std::optional getPathOpt() const
+ {
return mEnginePath;
}
+ void setPath(std::filesystem::path enginePath)
+ {
+ mEnginePath = std::move(enginePath);
+ }
+
[[nodiscard]] void const* getAddress() const
{
TLLM_CHECK(mType == AddressWithSize);
@@ -84,7 +95,7 @@ class RawEngine
private:
Type mType;
- std::filesystem::path mEnginePath;
+ std::optional mEnginePath;
struct
{
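Note: getPath() now asserts that a path was set, while getPathOpt() lets callers probe for one. A minimal sketch; engineDirectoryOrCwd is illustrative only.

    // Sketch only: probe for an engine path without tripping the TLLM_CHECK in getPath().
    #include "tensorrt_llm/runtime/rawEngine.h"
    #include <filesystem>

    std::filesystem::path engineDirectoryOrCwd(tensorrt_llm::runtime::RawEngine const& engine)
    {
        // getPathOpt() is empty for engines that were not loaded from a file.
        if (auto const path = engine.getPathOpt())
        {
            return path->parent_path();
        }
        return std::filesystem::current_path();
    }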
diff --git a/cpp/include/tensorrt_llm/runtime/worldConfig.h b/cpp/include/tensorrt_llm/runtime/worldConfig.h
index a39264d8a..58574ddd5 100644
--- a/cpp/include/tensorrt_llm/runtime/worldConfig.h
+++ b/cpp/include/tensorrt_llm/runtime/worldConfig.h
@@ -123,6 +123,11 @@ class WorldConfig
return getPipelineParallelRank() == getPipelineParallelism() - 1;
}
+ [[nodiscard]] bool constexpr isFirstTensorParallelRank() const noexcept
+ {
+ return getTensorParallelRank() == 0;
+ }
+
[[nodiscard]] SizeType32 constexpr getLastRank() const noexcept
{
return getSize() - 1;
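Note: isFirstTensorParallelRank() pairs with the mReplicateLogitsPostProcessor flag added earlier in this patch. A minimal sketch of the rank check, assuming the last-PP-rank predicate shown in the surrounding context is named isLastPipelineParallelRank(); shouldRunLogitsPostProcessor is illustrative only.

    // Sketch only: decide whether this rank should run the logits post-processor.
    #include "tensorrt_llm/runtime/worldConfig.h"

    bool shouldRunLogitsPostProcessor(tensorrt_llm::runtime::WorldConfig const& worldConfig, bool replicate)
    {
        // It always runs on the last pipeline-parallel rank; with replication it runs on every
        // tensor-parallel rank there, otherwise only on the first one.
        return worldConfig.isLastPipelineParallelRank()
            && (replicate || worldConfig.isFirstTensorParallelRank());
    }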
diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
index cef848bc1..02a5133b1 100644
--- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
+++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
@@ -421,6 +421,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
MOEExpertScaleNormalizationMode mNormMode = MOEExpertScaleNormalizationMode::NONE;
QuantParams mQuantParams{};
+ bool mUseLora = false;
+ LoraParams mLoraParams{};
std::optional mSelectedConfig = std::nullopt;
@@ -451,8 +453,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
mGatedMultiplier = mIsGated ? 2 : 1;
auto const gated_inter = mInterSize * mGatedMultiplier;
- size_t workspace_size
- = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {});
+ size_t workspace_size = mMoERunner.getWorkspaceSize(
+ mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {}, mUseLora);
mWorkspace = allocBuffer(workspace_size);
size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize;
@@ -531,7 +533,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
GemmProfilerBackend profiler;
profiler.init(mMoERunner, gemm_to_profile, typeToDtypeID(), typeToDtypeID(),
- typeToDtypeID(), mNumExperts, mK, mHiddenSize, mInterSize, mActType, mUseBias,
+ typeToDtypeID(), mNumExperts, mK, mHiddenSize, mInterSize, mActType, mUseBias, mUseLora,
parallelism_config);
auto workspace_size = profiler.getWorkspaceSize(mTotalTokens);
auto workspace = bufferManager->gpu(workspace_size);
@@ -639,7 +641,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
mMoERunner.runMoe(mInputTensor, mInputProbabilities, mExpertWeight1, mExpertBias1, mActType, mExpertWeight2,
mExpertBias2, mQuantParams, mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace,
mFinalOutput, nullptr, mTotalTokens, mScaleProbs, mSourceToExpandedMap, mSelectedExpert, parallelism_config,
- mNormMode, stream);
+ mNormMode, mUseLora, mLoraParams, stream);
}
void runBenchmark(benchmark::State& state);
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
index 0431d7681..1781be133 100644
--- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:47a55030296ff903151b42bba88ec1a90b415eed96ba64059ebaae6728b530af
-size 4302336
+oid sha256:84a6439038eb0a7d2913c3fe051684ab7779e42635c074bc6df30bfc46807929
+size 4358834
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
index d92d24475..f90fc6d03 100644
--- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:d49d3c8a723894c256b76986377252f5fb7d0831db5e4245182b9c1c1310ea3c
-size 4402848
+oid sha256:1b321340ea622b28ed7d38fe18ff7707091d2efa414af40f6db516959a4fa2f4
+size 4466694
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
index d3534f7ef..2eff4c1d7 100644
--- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
-0aa1241f7acc7c819e8b709e9d5bd274 libtensorrt_llm_batch_manager_static.a
-e9beacff06034f37ae89cbd3e0366f9b libtensorrt_llm_batch_manager_static.pre_cxx11.a
-fe9929c38087d3afb34eb756995c5ac40936fdef commit
\ No newline at end of file
+99062b35da1cb99df9c79368da1ff9de libtensorrt_llm_batch_manager_static.a
+72e6a44f7636bb6d48b016db1c62cdc7 libtensorrt_llm_batch_manager_static.pre_cxx11.a
+90dd0ad72954a5cc7cc2e298495e784906fe49b1 commit
\ No newline at end of file
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
index 3407a7111..a2560a5a6 100644
--- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:dd85087aec03056fd3f2a956997390374752d4a9e079d75b6c3c0db6b3cdf6da
-size 4161088
+oid sha256:1df6689f399313cac54ec1f4422975d6060957edbc698468a96f3a2c2a6542bc
+size 4221016
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
index 3ae5d8227..f9251acc4 100644
--- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:3a1676bf04d165b095f0c0287bd02ece848cda3b43f2cf234f883f82871bf744
-size 4140446
+oid sha256:b118f1bbccd6fe8e5e001916f4de19ec34886e7dcc288b91dcdadf5500f0eb50
+size 4205756
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
index 29f3868ea..f69634e79 100644
--- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:72d330d16ce6748bf661655a4a809383c60b2634df8386b51b611e4fc62a04fb
-size 23939786
+oid sha256:47162c3eaab9b6f60bca8927eef0423c5521d7750f112b87a2f9175156ccb6cd
+size 24807904
diff --git a/cpp/tensorrt_llm/common/safetensors.cpp b/cpp/tensorrt_llm/common/safetensors.cpp
new file mode 100644
index 000000000..b8e73f31e
--- /dev/null
+++ b/cpp/tensorrt_llm/common/safetensors.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "safetensors.h"
+#include "nlohmann/json.hpp"
+#include "tensorrt_llm/common/assert.h"
+#include
+#include
+#include
+#include
+#include