Update TensorRT-LLM (#1835)
* Update TensorRT-LLM

---------

Co-authored-by: Morgan Funtowicz <[email protected]>
kaiyux and mfuntowicz authored Jun 25, 2024
1 parent 2a115da commit 9691e12
Showing 94 changed files with 1,129 additions and 988 deletions.
44 changes: 44 additions & 0 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -165,6 +165,9 @@ struct BenchmarkParams

// Weights offloading
float gpuWeightsPercent{1.0};

// Decoding params
std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
};

class InferenceRequestsSyncSend
@@ -791,6 +794,10 @@ class ExecutorServer
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}

executorConfig.setDecodingConfig(texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices));

mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);

if (logIterationData)
@@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
optionalParams.decodingConfig = texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices);

auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
SizeType32 deviceCount{0};
@@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
}
}

std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
{
std::vector<std::vector<SizeType32>> result;
std::regex outer_regex(R"(\[(.*?)\])");
std::regex inner_regex(R"(\d+)");
auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex);
auto outer_end = std::sregex_iterator();

for (std::sregex_iterator i = outer_begin; i != outer_end; ++i)
{
std::smatch match = *i;
std::string inner_str = match.str(1);
std::vector<int> inner_vec;
auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex);
auto inner_end = std::sregex_iterator();

for (std::sregex_iterator j = inner_begin; j != inner_end; ++j)
{
std::smatch inner_match = *j;
inner_vec.push_back(std::stoi(inner_match.str()));
}
result.push_back(inner_vec);
}
return result;
}

} // namespace

int main(int argc, char* argv[])
@@ -1692,6 +1728,8 @@ int main(int argc, char* argv[])
options.add_options()("gpu_weights_percent",
"Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).",
cxxopts::value<float>()->default_value("1.0"));
options.add_options()(
"medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>());

auto result = options.parse(argc, argv);

@@ -1823,6 +1861,12 @@ int main(int argc, char* argv[])
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>();

// Argument: Medusa choices for the Medusa speculative decoding.
if (result.count("medusa_choices"))
{
benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
}

std::optional<TokenIdType> padId;
// Argument: Padding token id
if (result.count("pad_id"))
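As context for the new --medusa_choices flag above, here is a minimal standalone sketch (not part of the diff) of the parsing behaviour: the outer regex picks out each bracketed group and the inner regex extracts the integers. SizeType32 is assumed here to be a plain 32-bit integer, and the helper name is illustrative.

#include <cstdint>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

using SizeType32 = std::int32_t; // assumed width of the real alias

std::vector<std::vector<SizeType32>> parseChoices(std::string const& input)
{
    std::vector<std::vector<SizeType32>> result;
    std::regex outerRegex(R"(\[(.*?)\])"); // each "[...]" group, matched non-greedily
    std::regex innerRegex(R"(\d+)");       // integers inside a group
    for (auto it = std::sregex_iterator(input.begin(), input.end(), outerRegex); it != std::sregex_iterator(); ++it)
    {
        std::string inner = it->str(1);
        std::vector<SizeType32> path;
        for (auto jt = std::sregex_iterator(inner.begin(), inner.end(), innerRegex); jt != std::sregex_iterator(); ++jt)
        {
            path.push_back(static_cast<SizeType32>(std::stoi(jt->str())));
        }
        result.push_back(std::move(path));
    }
    return result;
}

int main()
{
    // "[[0], [0, 1], [0, 0, 1]]" -> {0}, {0, 1}, {0, 0, 1}
    for (auto const& path : parseChoices("[[0], [0, 1], [0, 0, 1]]"))
    {
        for (auto v : path)
        {
            std::cout << v << ' ';
        }
        std::cout << '\n';
    }
    return 0;
}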
1 change: 1 addition & 0 deletions benchmarks/python/build.py
@@ -944,6 +944,7 @@ def build_gpt(args):
network = builder.create_network()
network.trt_network.name = engine_name
network.plugin_config.to_legacy_setting()
network.plugin_config.dtype = args.dtype

# Plugins
if args.mode in ['plugin', 'plugin-ifb']:
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a
dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
265b039443334094026fbd8f396d52fe29c2d9d1 commit
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a
f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a
265b039443334094026fbd8f396d52fe29c2d9d1 commit
@@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:
template <typename T, typename WeightType>
bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const
{
bool config_is_sm90 = best_config_ && best_config_->is_sm90;
TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");
bool config_is_sm90 = best_config_->is_sm90;
return supportsHopperSpecialisation() && config_is_sm90;
}

@@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const
return this->sm_;
}

// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction
// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
template <typename T, typename WeightType>
bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const
{
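The isHopperSpecialised() change above turns a silent "false" result into an explicit precondition failure when no GEMM config has been selected yet. A hedged, self-contained illustration of that pattern follows; the names are hypothetical, not the library's API.

#include <optional>
#include <stdexcept>

struct GemmConfig
{
    bool is_sm90;
};

// Throws instead of silently returning false when no config has been chosen yet.
bool isHopperSpecialised(std::optional<GemmConfig> const& bestConfig, bool supportsHopperSpecialisation)
{
    if (!bestConfig)
    {
        throw std::logic_error("Cannot determine if hopper is specialised without a selected config");
    }
    return supportsHopperSpecialisation && bestConfig->is_sm90;
}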
@@ -1,2 +1,2 @@
5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
265b039443334094026fbd8f396d52fe29c2d9d1 commit
66 changes: 53 additions & 13 deletions cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
@@ -1072,10 +1072,38 @@ std::vector<size_t> CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWo
size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0;
size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node);

std::vector<size_t> workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size,
total_rows_before_expert_size, softmax_out_size, glu_inter_size,
// We do some overlapping of the large workspace buffers. Although we could overlap some of the other buffers, they
// are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother
// in the case of fused activation we overlap permuted_data and fc2_result
// in the case of unfused activation we overlap permuted_data and fc1_result
// we need to calculate the max possible size, so use the max of all three
size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size);
// When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case
if (glu_inter_elems > 0)
{
overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size);
}

// if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself
size_t overlapped_gemm1_gemm2_outputs = fc1_result_size;
if (glu_inter_elems > 0)
{
overlapped_gemm1_gemm2_outputs
= std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs);
}

std::vector<size_t> workspace{ //
source_rows_size, //
permuted_rows_size, //
permuted_experts_size, //
total_rows_before_expert_size, //
softmax_out_size, //
sorter_size, //
// These pointers reuse the same memory
std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size};
overlapped_gemm1_gemm2_inputs, //
overlapped_gemm1_gemm2_outputs, //
hopper_size, //
gemm_workspace_size};
return workspace;
}

@@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWorkspaceSize(i
TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size");
auto workspace = getWorkspaceBufferSizes(
num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type);
return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f);
return ws_size;
}

template <class T, class WeightType, class OutputType, class Enable>
@@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::configureWsPtrs(char
source_rows_ = (int*) ws_sliced[0];
permuted_rows_ = (int*) ws_sliced[1];
permuted_experts_ = (int*) ws_sliced[2];
permuted_data_ = (T*) ws_sliced[3];

total_rows_before_expert_ = (int64_t*) ws_sliced[4];
total_rows_before_expert_ = (int64_t*) ws_sliced[3];

softmax_out_ = nullptr;
bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
if (!is_pow_2 || num_experts > 256)
{
softmax_out_ = (float*) ws_sliced[5];
softmax_out_ = (float*) ws_sliced[4];
}

glu_inter_result_ = (T*) ws_sliced[6];
sorter_ws_ = (char*) ws_sliced[5];

// These pointers are aliased. Since the sort ws can be overwritten after it is finished
sorter_ws_ = (char*) ws_sliced[7];
fc1_result_ = (T*) ws_sliced[7];
// Always 6, but overlapped with either fc1_result_ or fc2_result_
permuted_data_ = (T*) ws_sliced[6];

fc2_result_ = (T*) ws_sliced[8];
bool const is_gated_activation = isGatedActivation(activation_type);
bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size);
bool const using_hopper = moe_gemm_runner_.isHopperSpecialised();
bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation;
bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu;
// Always 7, ignored if not needed
glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr;

// fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped
fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7];
fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6];

hopper_grouped_gemm_input_ = {};
if (moe_gemm_runner_.isHopperSpecialised())
{
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]);
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]);
}
}

@@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::runMoe(void const* i
}
else
{

// Run the GEMM with activation function overridden with `Identity`, we do the activation separately
ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity;
T* gemm_result = (use_fused_moe) ? fc1_result_ : static_cast<T*>(glu_inter_result_);
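The getWorkspaceBufferSizes() comments above describe overlapping the large buffers that are never live at the same time: one slot is shared by permuted_data and an FC result, another by fc1_result, glu_inter, and fc2_result. A rough, self-contained sketch of that sizing idea with made-up byte counts (not the actual kernel code):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // Hypothetical per-buffer sizes in bytes.
    std::size_t const permutedDataSize = std::size_t{64} << 20;
    std::size_t const fc1ResultSize = std::size_t{96} << 20;
    std::size_t const fc2ResultSize = std::size_t{64} << 20;
    std::size_t const gluInterSize = std::size_t{96} << 20;
    bool const hasGluInter = true; // un-fused gated activation path

    // Slot shared by the GEMM1/GEMM2 inputs: permuted_data overlaps fc2_result
    // (fused activation) or fc1_result (un-fused activation), so take the max.
    std::size_t gemmInputsSlot = std::max(permutedDataSize, fc2ResultSize);
    if (hasGluInter)
    {
        gemmInputsSlot = std::max(gemmInputsSlot, fc1ResultSize);
    }

    // Slot shared by the GEMM outputs: fc1_result alone, or the max of
    // glu_inter and fc2_result when an intermediate GLU buffer is needed.
    std::size_t gemmOutputsSlot = fc1ResultSize;
    if (hasGluInter)
    {
        gemmOutputsSlot = std::max({gluInterSize, fc2ResultSize, gemmOutputsSlot});
    }

    // Sum the slots the way a total-workspace helper would.
    std::vector<std::size_t> const workspace{gemmInputsSlot, gemmOutputsSlot};
    std::size_t const total = std::accumulate(workspace.begin(), workspace.end(), std::size_t{0});
    std::cout << "workspace: " << total / 1024.0 / 1024.0 << " MiB\n";
    return 0;
}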
6 changes: 4 additions & 2 deletions cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)
&tle::DecodingConfig::setLookaheadDecoding)
.def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);

auto executorConfigGetState = [&](tle::ExecutorConfig const& self)
auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,
&parallelConfigGetstate](tle::ExecutorConfig const& self)
{
py::object peftCacheConfigState = py::none();

@@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)
peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),
self.getDecodingConfig(), self.getGpuWeightsPercent());
};
auto executorConfigSetState = [&](py::tuple state)
auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,
&parallelConfigSetstate](py::tuple state)
{
if (state.size() != 15)
{
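The bindings change above replaces the blanket [&] captures on the get/set-state lambdas with explicit capture lists. As a general C++ illustration (not the actual pybind code), explicit captures name exactly which helper lambdas the state functions depend on, so nothing else in the enclosing scope is captured by accident:

#include <functional>
#include <iostream>

int main()
{
    auto kvCacheGetstate = [] { return 1; };
    auto schedulerGetstate = [] { return 2; };
    int unrelatedLocal = 42; // never captured below
    (void) unrelatedLocal;   // silence unused-variable warnings

    // Only the two helpers are captured; referring to unrelatedLocal inside the
    // lambda would be a compile error rather than a silent hidden dependency.
    std::function<int()> getState
        = [&kvCacheGetstate, &schedulerGetstate] { return kvCacheGetstate() + schedulerGetstate(); };

    std::cout << getState() << '\n'; // prints 3
    return 0;
}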
3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/runtime/medusaModule.cpp
@@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
if (curDepth != depth)
{
TLLM_CHECK(depth + 1 == curDepth);
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
"Medusa choices require more Medusa heads than the engine was built with.");
// Save TopK
topKs[depth - 1] = maxTopK;

// Accumulate TopK for global indexing in tree
globalNodeInTreeIdx += maxTopK;

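The new check in initMedusaTensorsFromChoices() rejects choice trees whose depth exceeds the number of Medusa heads the engine was built with. A hedged sketch of that validation follows; the helper names are illustrative, not the runtime's API.

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Every position along a choice path is produced by one Medusa head, so the
// deepest path must not exceed the engine's maximum draft path length.
void checkMedusaChoices(std::vector<std::vector<int>> const& choices, std::size_t maxDraftPathLen)
{
    std::size_t deepest = 0;
    for (auto const& path : choices)
    {
        deepest = std::max(deepest, path.size());
    }
    if (deepest > maxDraftPathLen)
    {
        throw std::invalid_argument("Medusa choices require more Medusa heads than the engine was built with.");
    }
}

int main()
{
    checkMedusaChoices({{0}, {0, 1}, {0, 0, 1}}, 3); // fine with 3 heads; would throw with 2
    return 0;
}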