Update TensorRT-LLM (NVIDIA#2156)
Co-authored-by: Bruno Magalhaes <[email protected]>
Shixiaowei02 and bm-synth authored Aug 27, 2024
1 parent 32ed92e commit b8fc663
Showing 143 changed files with 2,110 additions and 1,438 deletions.
9 changes: 6 additions & 3 deletions README.md
@@ -17,12 +17,15 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)
* [2024/08/13] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)
<div align="center">
<img src="docs/source/media/picture-08-13-2024.png" width="50%">
<img src="docs/source/media/picture-08-20-2024.png" width="40%">
<div align="left">

* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)

* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)

237 changes: 124 additions & 113 deletions benchmarks/Suite.md

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion benchmarks/cpp/gptManagerBenchmark.cpp
@@ -403,6 +403,7 @@ struct BenchInfo
float firstTokenLatency{};
std::optional<float> avgGenT2TLatency{};
bool firstTokenSeen{false};
SizeType32 decodingIter{0};
};

class Recorder
@@ -527,6 +528,7 @@ class Recorder
outSeqLen -= inputSeqLen;
}
mRequestBenchInfos[requestId].outputLength = outSeqLen;
mRequestBenchInfos[requestId].decodingIter = response.getResult().decodingIter;
}
else
{
@@ -572,6 +574,7 @@ class Recorder
std::vector<float> genT2TLatencies;

int totalOutputTokens{0};
int totalDecodingIter{0};
mNumErrorSamples = 0;
mNumSamples = 0;
for (auto reqInfo : mRequestBenchInfos)
@@ -580,6 +583,7 @@
{
reqLatencies.push_back(reqInfo.second.latency);
totalOutputTokens += reqInfo.second.outputLength;
totalDecodingIter += reqInfo.second.decodingIter;

if (mStreaming)
{
@@ -601,6 +605,9 @@
mTotalLatency = std::chrono::duration<float, std::milli>(mEnd - mStart).count();
mSeqThroughput = mNumSamples / (mTotalLatency / 1000);
mTokenThroughput = totalOutputTokens / (mTotalLatency / 1000);
mAcceptanceRate = totalDecodingIter
? (static_cast<float>(totalOutputTokens) / static_cast<float>(totalDecodingIter))
: 0.0f;

mAvgSeqLatency = std::accumulate(reqLatencies.begin(), reqLatencies.end(), 0.F) / reqLatencies.size();

@@ -648,7 +655,8 @@ class Recorder
printf("\n[BENCHMARK] num_samples %d\n", mNumSamples);
printf("[BENCHMARK] total_latency(ms) %.2f\n", mTotalLatency);
printf("[BENCHMARK] seq_throughput(seq/sec) %.2f\n", mSeqThroughput);
printf("[BENCHMARK] token_throughput(token/sec) %.2f\n\n", mTokenThroughput);
printf("[BENCHMARK] token_throughput(token/sec) %.2f\n", mTokenThroughput);
printf("[BENCHMARK] avg_acceptance_rate(tokens/decoding steps) %.2f\n\n", mAcceptanceRate);

printf("[BENCHMARK] avg_sequence_latency(ms) %.2f\n", mAvgSeqLatency);
printf("[BENCHMARK] max_sequence_latency(ms) %.2f\n", mMaxSeqLatency);
@@ -763,6 +771,7 @@ class Recorder
float mAvgGenT2TLatency{};
float mAvgFtLatency{};
float mTokenThroughput{};
float mAcceptanceRate{};
float mP99SeqLatency{};
float mP90SeqLatency{};
float mP50SeqLatency{};
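
The `decodingIter` bookkeeping added above lets the benchmark report an average acceptance rate: how many output tokens each decoding iteration produced on average, which is mainly interesting when speculative or multi-token decoding is enabled. A minimal standalone sketch of the same computation, using hypothetical sample data in place of `mRequestBenchInfos`, might look like this:

```cpp
#include <cstdio>
#include <vector>

struct Sample
{
    int outputTokens;  // tokens generated for this request
    int decodingIters; // decoder iterations spent on this request
};

int main()
{
    // Hypothetical per-request measurements, standing in for the Recorder's bookkeeping.
    std::vector<Sample> samples{{128, 64}, {200, 100}, {96, 48}};

    int totalOutputTokens{0};
    int totalDecodingIter{0};
    for (auto const& s : samples)
    {
        totalOutputTokens += s.outputTokens;
        totalDecodingIter += s.decodingIters;
    }

    // Same guard as in the diff: avoid dividing by zero when no iterations were recorded.
    float const acceptanceRate = totalDecodingIter
        ? static_cast<float>(totalOutputTokens) / static_cast<float>(totalDecodingIter)
        : 0.0f;

    std::printf("avg_acceptance_rate(tokens/decoding steps) %.2f\n", acceptanceRate); // prints 2.00 here
    return 0;
}
```
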
12 changes: 5 additions & 7 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -9,6 +9,7 @@
from transformers import AutoTokenizer, LlamaTokenizerFast

nltk.download("punkt", quiet=False)
nltk.download('punkt_tab')
import argparse


@@ -25,10 +26,9 @@ class Model(Enum):
"tokens_per_sample": 294.45 * 0.9
},
Model.GPT_J: {
"rouge1": 42.9435135,
"rouge2": 20.1033765,
"rougeL": 29.9581119,
# "tokens_per_sample": ??
"rouge1": 42.9865 * 0.99,
"rouge2": 20.1235 * 0.99,
"rougeL": 29.9881 * 0.99,
}
}

@@ -138,7 +138,6 @@ def main():
target_texts = get_reference_df(args.dataset)
model = Model.Llama_v2_70B
tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
relaxing_factor = 1.0
elif args.dataset.lower().endswith(".json"):
target_texts = get_reference_json(args.dataset)
model = Model.GPT_J
@@ -147,7 +146,6 @@
padding_side="left",
use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
relaxing_factor = 0.93
else:
raise RuntimeError(
"Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")
@@ -169,7 +167,7 @@
print("Targets: ", targets)

for k, _ in targets.items():
assert targets[k] * relaxing_factor <= achieved_scores[k]
assert targets[k] <= achieved_scores[k]


if __name__ == "__main__":
81 changes: 68 additions & 13 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -83,7 +83,9 @@ class GenericLlmRequest
bool applyLogitsPostProcessorBatched = false,
std::optional<std::shared_ptr<VecTokens>> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
std::optional<RequestIdType> clientId = std::nullopt,
executor::PriorityType priority = executor::Request::kDefaultPriority)
executor::PriorityType priority = executor::Request::kDefaultPriority,
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt)
: mRequestId(requestId)
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
@@ -123,8 +125,10 @@ class GenericLlmRequest
, mDecodingIter(0)
, mPriority(priority)
, mFinishReasons(samplingConfig.beamWidth)
, mEncoderInputFeatures(std::move(encoderInputFeatures))
, mEncoderOutputLength(encoderOutputLength)
{
if (mEncoderTokens.has_value())
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
}
@@ -170,6 +174,7 @@
, mPriority(req.getPriority())
, mFinishReasons(mSamplingConfig.beamWidth)
, mContextPhaseParams(req.getContextPhaseParams())
, mEncoderOutputLength(req.getEncoderOutputLength())
{
if (mIsStreaming && mSamplingConfig.beamWidth > 1 && !mReturnAllGeneratedTokens)
{
@@ -189,10 +194,14 @@
"since logits are not. Disabling returnGenerationLogits.");
mReturnGenerationLogits = false;
}
if (req.getEncoderInputTokenIds())

if (req.getEncoderInputTokenIds().has_value() || req.getEncoderInputFeatures().has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
mEncoderTokens = std::make_shared<VecTokens>(req.getEncoderInputTokenIds().value());
if (req.getEncoderInputTokenIds().has_value())
{
mEncoderTokens = std::make_shared<VecTokens>(req.getEncoderInputTokenIds().value());
}
}
if (req.getEmbeddingBias())
{
@@ -254,14 +263,24 @@ class GenericLlmRequest
// NOTE: Draft acceptance threshold is stored in mSamplingConfig
}

auto const& encoderInputFeatures = req.getEncoderInputFeatures();
if (encoderInputFeatures.has_value())
{
mEncoderInputFeatures = executor::detail::toITensor(encoderInputFeatures.value());
}
else
{
mEncoderInputFeatures = std::nullopt;
}

initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
}

void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
std::optional<SizeType32> maxEncoderInputLen = std::nullopt)
{
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderLen() > maxEncoderInputLen.value()),
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderLen(),
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
maxEncoderInputLen.value());

if (mPromptLen > maxInputLen)
@@ -383,12 +402,36 @@ class GenericLlmRequest
return mEncoderTokens;
}

/// @brief Get the number of input tokens to encoder
/// @return The number of encoder input tokens.
[[nodiscard]] SizeType32 getEncoderLen() const
/// @brief Get length of encoder input (could be tokens or features length)
/// @return An integer.
[[nodiscard]] SizeType32 getEncoderInputLen() const
{
TLLM_CHECK_WITH_INFO(getEncoderTokens().has_value(), "Encoder tokens are not given");
return getEncoderTokens().value()->size();
if (mEncoderInputFeatures.has_value())
{
return getEncoderInputFeatures()->getShape().d[0];
}
else if (getEncoderTokens().has_value())
{
return getEncoderTokens().value()->size();
}
else
{
TLLM_THROW("GenericLlmRequest::getEncoderInputLen - Do not have encoder length!");
}
}

/// @brief Get length of encoder output. Fall back to encoder input length if not present
/// @return An integer.
[[nodiscard]] SizeType32 getEncoderOutputLen() const
{
if (mEncoderOutputLength.has_value())
{
return mEncoderOutputLength.value();
}
else
{
return getEncoderInputLen();
}
}

/// @brief Get the draft tokens
@@ -513,7 +556,8 @@
}

// for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase
mState = mEncoderTokens.has_value() ? REQUEST_STATE_ENCODER_INIT : REQUEST_STATE_CONTEXT_INIT;
mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? REQUEST_STATE_ENCODER_INIT
: REQUEST_STATE_CONTEXT_INIT;
mContextCurrentPosition = 0;
mContextChunkSize = std::nullopt;
mSeqSlot.reset();
@@ -716,6 +760,11 @@ class GenericLlmRequest
return mEncoderOutputHost;
}

[[nodiscard]] TensorPtr const getEncoderInputFeatures() const
{
return mEncoderInputFeatures.value_or(nullptr);
}

void setEncoderOutputHost(TensorPtr encoderOutputHost)
{
mEncoderOutputHost = std::move(encoderOutputHost);
@@ -724,7 +773,7 @@
void allocEncoderOutputHost(SizeType32 encoderHiddenSize, nvinfer1::DataType dataType)
{
mEncoderOutputHost = runtime::BufferManager::pinned(
runtime::ITensor::makeShape({getEncoderLen(), encoderHiddenSize}), dataType);
runtime::ITensor::makeShape({getEncoderOutputLen(), encoderHiddenSize}), dataType);
}

[[nodiscard]] TensorPtr const& getEncoderOutput() const noexcept
@@ -1091,6 +1140,7 @@ class GenericLlmRequest
}

result.finishReasons = mFinishReasons;
result.decodingIter = mDecodingIter;

// Update position of last sent response
setMaxSentTokenLen(maxNbTokens);
@@ -1196,6 +1246,11 @@ class GenericLlmRequest
std::vector<executor::FinishReason> mFinishReasons;
std::optional<executor::ContextPhaseParams> mContextPhaseParams;

std::optional<TensorPtr> mEncoderInputFeatures; // Input features of encoder for multimodal models
std::optional<SizeType32>
mEncoderOutputLength; // For some models like Whisper, encoder output shape cannot be inferred from encoder
// input shape due to downsampling. Thus this is needed for setting buffer sizes correctly

private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
{
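
The `getEncoderInputLen` / `getEncoderOutputLen` pair introduced above separates two lengths that coincide for token-based encoders but differ for feature-based models such as Whisper, where downsampling shrinks the encoder output relative to its input. A simplified, self-contained sketch of that fallback behaviour (illustrative names only, not the actual class) is:

```cpp
#include <cstdio>
#include <optional>
#include <stdexcept>
#include <vector>

// Illustrative stand-in for GenericLlmRequest's encoder-length logic.
struct EncoderLengths
{
    std::optional<std::vector<int>> encoderTokens; // token IDs, if the encoder input is text
    std::optional<int> encoderFeaturesLen;         // leading dim of the feature tensor, if multimodal
    std::optional<int> encoderOutputLen;           // explicit output length, e.g. after downsampling

    int inputLen() const
    {
        if (encoderFeaturesLen)
            return *encoderFeaturesLen;
        if (encoderTokens)
            return static_cast<int>(encoderTokens->size());
        throw std::runtime_error("no encoder input available");
    }

    int outputLen() const
    {
        // Fall back to the input length when the model does not change the sequence length.
        return encoderOutputLen ? *encoderOutputLen : inputLen();
    }
};

int main()
{
    // Whisper-like case: 3000 input frames, encoder output downsampled by 2x.
    EncoderLengths whisper{std::nullopt, 3000, 1500};
    std::printf("input %d, output %d\n", whisper.inputLen(), whisper.outputLen()); // input 3000, output 1500
    return 0;
}
```
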
@@ -76,6 +76,15 @@ class TrtGptModelOptionalParams
{
}

// Copy constructor
TrtGptModelOptionalParams(TrtGptModelOptionalParams const& other)
: TrtGptModelOptionalParams(other.kvCacheConfig, other.enableTrtOverlap, other.deviceIds,
other.normalizeLogProbs, other.enableChunkedContext, other.peftCacheManagerConfig, other.decodingConfig,
other.gpuWeightsPercent, other.maxBeamWidth, other.maxBatchSize, other.maxNumTokens, other.schedulerConfig,
other.extendedRuntimePerfKnobConfig)
{
}

bool operator==(TrtGptModelOptionalParams const& other) const
{
return kvCacheConfig == other.kvCacheConfig //
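
The new copy constructor above forwards every member of `other` back into the existing parameterized constructor, so copies are built through the same initialization path rather than a compiler-generated memberwise copy. A minimal sketch of this delegating pattern, with illustrative names unrelated to the real class:

```cpp
#include <string>
#include <vector>

// Illustrative only: a small options struct whose copy constructor
// delegates to its primary constructor, mirroring the pattern above.
class Options
{
public:
    Options(std::vector<int> deviceIds, bool enableOverlap, std::string schedulerPolicy)
        : mDeviceIds(std::move(deviceIds))
        , mEnableOverlap(enableOverlap)
        , mSchedulerPolicy(std::move(schedulerPolicy))
    {
    }

    // Copy constructor delegating to the primary constructor.
    Options(Options const& other)
        : Options(other.mDeviceIds, other.mEnableOverlap, other.mSchedulerPolicy)
    {
    }

private:
    std::vector<int> mDeviceIds;
    bool mEnableOverlap;
    std::string mSchedulerPolicy;
};

int main()
{
    Options a{{0, 1}, true, "max_utilization"};
    Options b{a}; // goes through the delegating copy constructor
    (void) b;
    return 0;
}
```
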
22 changes: 18 additions & 4 deletions cpp/include/tensorrt_llm/common/logger.h
@@ -80,7 +80,7 @@ class Logger

void log(std::exception const& ex, Level level = Level::ERROR);

Level getLevel()
Level getLevel() const
{
return level_;
}
@@ -91,6 +91,11 @@
log(INFO, "Set logger level to %s", getLevelName(level));
}

bool isEnabled(Level const level) const
{
return level_ <= level;
}

private:
static auto constexpr kPREFIX = "[TensorRT-LLM]";

@@ -131,7 +136,7 @@
template <typename... Args>
void Logger::log(Logger::Level level, char const* format, Args const&... args)
{
if (level_ <= level)
if (isEnabled(level))
{
auto const fmt = getPrefix(level) + format;
auto& out = level_ < WARNING ? std::cout : std::cerr;
@@ -150,7 +155,7 @@
template <typename... Args>
void Logger::log(Logger::Level const level, int const rank, char const* format, Args const&... args)
{
if (level_ <= level)
if (isEnabled(level))
{
auto const fmt = getPrefix(level, rank) + format;
auto& out = level_ < WARNING ? std::cout : std::cerr;
@@ -166,7 +171,16 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
}
}

#define TLLM_LOG(level, ...) tensorrt_llm::common::Logger::getLogger()->log(level, __VA_ARGS__)
#define TLLM_LOG(level, ...) \
do \
{ \
auto* const logger = tensorrt_llm::common::Logger::getLogger(); \
if (logger->isEnabled(level)) \
{ \
logger->log(level, __VA_ARGS__); \
} \
} while (0)

#define TLLM_LOG_TRACE(...) TLLM_LOG(tensorrt_llm::common::Logger::TRACE, __VA_ARGS__)
#define TLLM_LOG_DEBUG(...) TLLM_LOG(tensorrt_llm::common::Logger::DEBUG, __VA_ARGS__)
#define TLLM_LOG_INFO(...) TLLM_LOG(tensorrt_llm::common::Logger::INFO, __VA_ARGS__)
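
The reworked `TLLM_LOG` macro checks `isEnabled(level)` before calling `log(...)`, so the argument expressions are never evaluated when the level is filtered out, and the `do { ... } while (0)` wrapper keeps the macro behaving as a single statement inside unbraced `if`/`else`. A minimal sketch of the same pattern outside the TensorRT-LLM codebase (names are illustrative):

```cpp
#include <cstdio>
#include <string>

enum class Level { Debug = 0, Info = 1, Warning = 2 };

struct MiniLogger
{
    Level threshold{Level::Info};

    bool isEnabled(Level level) const { return threshold <= level; }

    template <typename... Args>
    void log(Level level, char const* fmt, Args const&... args)
    {
        if (isEnabled(level))
            std::printf(fmt, args...);
    }
};

MiniLogger gLogger;

// Guarding at the macro level means the argument expressions are skipped entirely
// when the level is disabled; do/while(0) makes the macro expand to one statement.
#define MINI_LOG(level, ...)                                                                                           \
    do                                                                                                                 \
    {                                                                                                                  \
        if (gLogger.isEnabled(level))                                                                                  \
        {                                                                                                              \
            gLogger.log(level, __VA_ARGS__);                                                                           \
        }                                                                                                              \
    } while (0)

std::string expensiveSummary()
{
    std::puts("building summary..."); // side effect to show whether arguments get evaluated
    return "summary";
}

int main()
{
    gLogger.threshold = Level::Info;

    // Debug is below the threshold: the macro's guard short-circuits,
    // so expensiveSummary() is never evaluated here.
    MINI_LOG(Level::Debug, "debug: %s\n", expensiveSummary().c_str());

    // Info passes the guard: the argument is evaluated and the line is printed.
    MINI_LOG(Level::Info, "info: %s\n", expensiveSummary().c_str());
    return 0;
}
```
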
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/common/stringUtils.h
@@ -97,7 +97,7 @@ inline std::string arr2str(T* arr, size_t size, char const* delim = kDefaultDeli
}

template <typename T>
inline std::string vec2str(std::vector<T> vec, char const* delim = kDefaultDelimiter)
inline std::string vec2str(std::vector<T> const& vec, char const* delim = kDefaultDelimiter)
{
return arr2str(vec.data(), vec.size(), delim);
}