diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index 02abc189b..eeb7da68d 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -165,6 +165,9 @@ struct BenchmarkParams // Weights offloading float gpuWeightsPercent{1.0}; + + // Decoding params + std::optional<std::vector<std::vector<SizeType32>>> medusaChoices; }; class InferenceRequestsSyncSend @@ -791,6 +794,10 @@ class ExecutorServer executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value()); } + executorConfig.setDecodingConfig(texec::DecodingConfig( + benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(), + std::nullopt, benchmarkParams.medusaChoices)); + mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig); if (logIterationData) @@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType optionalParams.maxBeamWidth = beamWidth; optionalParams.maxBatchSize = benchmarkParams.maxBatchSize; optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy}; + optionalParams.decodingConfig = texec::DecodingConfig( + benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(), + std::nullopt, benchmarkParams.medusaChoices); auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json"); SizeType32 deviceCount{0}; @@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m } } +std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input) +{ + std::vector<std::vector<SizeType32>> result; + std::regex outer_regex(R"(\[(.*?)\])"); + std::regex inner_regex(R"(\d+)"); + auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex); + auto outer_end = std::sregex_iterator(); + + for (std::sregex_iterator i = outer_begin; i != outer_end; ++i) + { + std::smatch match = *i; + std::string inner_str = match.str(1); + std::vector<SizeType32> inner_vec; + auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex); + auto inner_end = std::sregex_iterator(); + + for (std::sregex_iterator j = inner_begin; j != inner_end; ++j) + { + std::smatch inner_match = *j; + inner_vec.push_back(std::stoi(inner_match.str())); + } + result.push_back(inner_vec); + } + return result; +} + } // namespace int main(int argc, char* argv[]) @@ -1692,6 +1728,8 @@ int main(int argc, char* argv[]) options.add_options()("gpu_weights_percent", "Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).", cxxopts::value<float>()->default_value("1.0")); + options.add_options()( + "medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>()); auto result = options.parse(argc, argv); @@ -1823,6 +1861,12 @@ int main(int argc, char* argv[]) // Argument: If offloaded blocks should be onboarded to primary memory before they are reused. benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>(); + // Argument: Medusa choices for the Medusa speculative decoding.
+ if (result.count("medusa_choices")) + { + benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as()); + } + std::optional padId; // Argument: Padding token id if (result.count("pad_id")) diff --git a/benchmarks/python/build.py b/benchmarks/python/build.py index 07447aaa8..c5fb39888 100644 --- a/benchmarks/python/build.py +++ b/benchmarks/python/build.py @@ -944,6 +944,7 @@ def build_gpt(args): network = builder.create_network() network.trt_network.name = engine_name network.plugin_config.to_legacy_setting() + network.plugin_config.dtype = args.dtype # Plugins if args.mode in ['plugin', 'plugin-ifb']: diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 4c834e01a..fe7949a5b 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1 -size 3931604 +oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d +size 3931504 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 4c834e01a..8d15a4cc4 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1 -size 3931604 +oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca +size 3989220 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index a23bfa496..d1e552ba5 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a -93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a -c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file +f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a +dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a +265b039443334094026fbd8f396d52fe29c2d9d1 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 2438054cd..09e9e4105 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4 -size 3812158 +oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542 +size 3812960 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a 
b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index a88fe4a7f..effd33642 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439 -size 3773896 +oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84 +size 3772832 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index 2710c6005..04b00e8b8 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03 -size 22381766 +oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78 +size 22387798 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index 69936aaee..932e0f372 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe -size 1396404 +oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20 +size 1397814 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 69936aaee..926f354f0 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe -size 1396404 +oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855 +size 1423172 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index 698fe4058..fce860082 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a -7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a -c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file +e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a +f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a +265b039443334094026fbd8f396d52fe29c2d9d1 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index eee7ed862..702aee286 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ 
b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218 -size 1448772 +oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed +size 1450288 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index b6e842a0f..24277fcfa 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50 -size 1375430 +oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02 +size 1377018 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index f20e66567..05a1eccbc 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532 -size 14031364 +oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86 +size 14039826 diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h index 32fefcd22..3a0b72e5a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h @@ -424,7 +424,8 @@ std::vector MoeGemmRunner: template bool MoeGemmRunner::isHopperSpecialised() const { - bool config_is_sm90 = best_config_ && best_config_->is_sm90; + TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config"); + bool config_is_sm90 = best_config_->is_sm90; return supportsHopperSpecialisation() && config_is_sm90; } @@ -440,7 +441,7 @@ int MoeGemmRunner::getSM() const return this->sm_; } -// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction +// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction template bool MoeGemmRunner::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const { diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index 236936439..a44f5448b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so -c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file +265b039443334094026fbd8f396d52fe29c2d9d1 commit \ No newline at end of file diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll index 964f3f9b0..8d3409955 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3 +oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7 size 1080832 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16 .cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu similarity index 100% rename from cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16 .cu rename to cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu index 7b37b25ea..f016d2876 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu @@ -1072,10 +1072,38 @@ std::vector CutlassMoeFCRunner::getWo size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0; size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node); - std::vector workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size, - total_rows_before_expert_size, softmax_out_size, glu_inter_size, + // We do some overlapping of the large workspace buffers. 
Although we could overlap some of the other buffers, they + // are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother + // in the case of fused activation we overlap permuted_data and fc2_result + // in the case of unfused activation we overlap permuted_data and fc1_result + // we need to calculate the max possible size, so use the max of all three + size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size); + // When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case + if (glu_inter_elems > 0) + { + overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size); + } + + // if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself + size_t overlapped_gemm1_gemm2_outputs = fc1_result_size; + if (glu_inter_elems > 0) + { + overlapped_gemm1_gemm2_outputs + = std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs); + } + + std::vector workspace{ // + source_rows_size, // + permuted_rows_size, // + permuted_experts_size, // + total_rows_before_expert_size, // + softmax_out_size, // + sorter_size, // // These pointers reuse the same memory - std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size}; + overlapped_gemm1_gemm2_inputs, // + overlapped_gemm1_gemm2_outputs, // + hopper_size, // + gemm_workspace_size}; return workspace; } @@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(i TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size"); auto workspace = getWorkspaceBufferSizes( num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type); - return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size()); + auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size()); + TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f); + return ws_size; } template @@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner::configureWsPtrs(char source_rows_ = (int*) ws_sliced[0]; permuted_rows_ = (int*) ws_sliced[1]; permuted_experts_ = (int*) ws_sliced[2]; - permuted_data_ = (T*) ws_sliced[3]; - total_rows_before_expert_ = (int64_t*) ws_sliced[4]; + total_rows_before_expert_ = (int64_t*) ws_sliced[3]; softmax_out_ = nullptr; bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - softmax_out_ = (float*) ws_sliced[5]; + softmax_out_ = (float*) ws_sliced[4]; } - glu_inter_result_ = (T*) ws_sliced[6]; + sorter_ws_ = (char*) ws_sliced[5]; - // These pointers are aliased. 
Since the sort ws can be overwritten after it is finished - sorter_ws_ = (char*) ws_sliced[7]; - fc1_result_ = (T*) ws_sliced[7]; + // Always 6, but overlapped with either fc1_result_ or fc2_result_ + permuted_data_ = (T*) ws_sliced[6]; - fc2_result_ = (T*) ws_sliced[8]; + bool const is_gated_activation = isGatedActivation(activation_type); + bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size); + bool const using_hopper = moe_gemm_runner_.isHopperSpecialised(); + bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation); + bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation; + bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu; + // Always 7, ignored if not needed + glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr; + + // fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped + fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7]; + fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6]; hopper_grouped_gemm_input_ = {}; if (moe_gemm_runner_.isHopperSpecialised()) { - hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]); + hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]); } } @@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner::runMoe(void const* i } else { + // Run the GEMM with activation function overridden with `Identity`, we do the activation separately ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity; T* gemm_result = (use_fused_moe) ? 
fc1_result_ : static_cast(glu_inter_result_); diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index 9f7799dee..379eb7642 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m) &tle::DecodingConfig::setLookaheadDecoding) .def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices); - auto executorConfigGetState = [&](tle::ExecutorConfig const& self) + auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate, + ¶llelConfigGetstate](tle::ExecutorConfig const& self) { py::object peftCacheConfigState = py::none(); @@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m) peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(), self.getDecodingConfig(), self.getGpuWeightsPercent()); }; - auto executorConfigSetState = [&](py::tuple state) + auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate, + ¶llelConfigSetstate](py::tuple state) { if (state.size() != 15) { diff --git a/cpp/tensorrt_llm/runtime/medusaModule.cpp b/cpp/tensorrt_llm/runtime/medusaModule.cpp index 2730977c3..d889dcdfa 100644 --- a/cpp/tensorrt_llm/runtime/medusaModule.cpp +++ b/cpp/tensorrt_llm/runtime/medusaModule.cpp @@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st if (curDepth != depth) { TLLM_CHECK(depth + 1 == curDepth); + TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(), + "Medusa choices require more Medusa heads than the engine was built with."); // Save TopK topKs[depth - 1] = maxTopK; + // Accumulate TopK for global indexing in tree globalNodeInTreeIdx += maxTopK; diff --git a/cpp/tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/kernels/mixtureOfExpertsTest.cu index a23ce3d3e..27e6a46ce 100644 --- a/cpp/tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/kernels/mixtureOfExpertsTest.cu @@ -257,7 +257,7 @@ protected: template T* allocBuffer(size_t size) { - managed_buffers.emplace_back(mBufferManager->managed(size * sizeof(T))); + managed_buffers.emplace_back(mBufferManager->gpu(size * sizeof(T))); EXPECT_EQ(cudaGetLastError(), cudaSuccess) << "Error allocating buffer of size: " << size; T* ptr = static_cast(managed_buffers.back()->data()); return ptr; @@ -268,15 +268,27 @@ protected: this->managed_buffers.clear(); // Make sure all the previous buffers are freed check_cuda_error(cudaDeviceSynchronize()); // Sync to make sure all previous operations are resolved - size_t weight_size = hidden_size * hidden_size * 4 * num_experts * sizeof(WeightType); - // Skip the test if the GPU does not have enough memory - size_t workspace_size = this->mMoERunner.getWorkspaceSize( + // Calculate the size contributions for all the large buffers to check if the GPU has enough space + bool const is_gated = tensorrt_llm::isGatedActivation(mActType); + size_t const num_gemms = 2 + is_gated; + // Expert weights + size_t const weight_size = hidden_size * (hidden_size * 4) * num_experts * sizeof(WeightStorage) * num_gemms; + // Workspace size + size_t const workspace_size = this->mMoERunner.getWorkspaceSize( num_tokens, hidden_size, hidden_size * 4, num_experts, k, this->mActType, {}); + // The input/output buffers + size_t const in_out_size = 2 * num_tokens * hidden_size * sizeof(DataType); - size_t total_size = workspace_size + 
weight_size * 2; + // This should be correct to within 100MiB (on tests with 30GiB total) + size_t const total_size = workspace_size + weight_size + in_out_size; + size_t const memory_pool_free_mem_size = mBufferManager->memoryPoolFree(); auto const [freeMem, totalMem] = tensorrt_llm::common::getDeviceMemoryInfo(false); - return freeMem >= total_size; + float const freeMemBuffer = 0.9f; // Add some buffer so we aren't completely pushing the limits + std::cout << "Free memory is: " << freeMem << ", memory pool size is: " << memory_pool_free_mem_size + << ", required memory is: " << total_size << ", device total memory capacity: " << totalMem + << std::endl; + return (freeMem + memory_pool_free_mem_size) * freeMemBuffer >= total_size; } void initBuffersPermute(std::vector> h_hidden_states, @@ -362,7 +374,10 @@ protected: initFP8Scales(mMaxInput); } - mTpExpertScratch = allocBuffer(mTpExpertScratchSize); + if (parallelism_config.tp_size > 1 || parallelism_config.ep_size > 1) + { + mTpExpertScratch = allocBuffer(mTpExpertScratchSize); + } mActiveRows = mTotalTokens; mFinished = nullptr; @@ -475,10 +490,18 @@ protected: ASSERT_NE(mExpertFP8Scale1, nullptr); ASSERT_NE(mExpertFP8Scale2, nullptr); ASSERT_NE(mExpertFP8Scale3, nullptr); + // Dequant values for each expert are 1/(w_i*a_i) calculated above - std::fill_n(mExpertFP8Scale1, mNumExperts, 1.f / (scaleW1 * scaleAct1)); - std::fill_n(mExpertFP8Scale3, mNumExperts, 1.f / (scaleW2 * scaleAct2)); - *mExpertFP8Scale2 = scaleAct2; + std::vector scales_1(mNumExperts, 1.f / (scaleW1 * scaleAct1)); + std::vector scales_2(1, scaleAct2); + std::vector scales_3(mNumExperts, 1.f / (scaleW2 * scaleAct2)); + + check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale1, scales_1.data(), scales_1.size() * sizeof(float), + cudaMemcpyHostToDevice, mStream->get())); + check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale2, scales_2.data(), scales_2.size() * sizeof(float), + cudaMemcpyHostToDevice, mStream->get())); + check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale3, scales_3.data(), scales_3.size() * sizeof(float), + cudaMemcpyHostToDevice, mStream->get())); check_cuda_error(cudaStreamSynchronize(mStream->get())); } @@ -561,6 +584,13 @@ protected: void* ep_scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2; void* ep_scale_3 = FP8 ? 
mExpertFP8Scale3 : nullptr; + // Handle the case with no parallelism to not require the extra alloc + if (parallelism_config.tp_size == 1 && parallelism_config.ep_size == 1) + { + return std::tuple{ + mExpertWeight1, mExpertWeight2, mExpertBias1, mExpertBias2, ep_scale_1, ep_scale_2, ep_scale_3}; + } + // Slice weights for EP size_t const gated_inter = mInterSize * mGatedMultiplier; size_t const experts_per_node = mNumExperts / parallelism_config.ep_size; diff --git a/cpp/tests/resources/scripts/build_medusa_engines.py b/cpp/tests/resources/scripts/build_medusa_engines.py index 18eccf932..a8c000d2f 100755 --- a/cpp/tests/resources/scripts/build_medusa_engines.py +++ b/cpp/tests/resources/scripts/build_medusa_engines.py @@ -28,7 +28,7 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path, covert_cmd = [_sys.executable, "examples/medusa/convert_checkpoint.py"] + ( ['--model_dir', str(weight_dir)] if weight_dir else []) + [ '--medusa_model_dir', str(medusa_dir), \ - '--output_dir', str(engine_dir), '--dtype=float16', '--fixed_num_medusa_heads=4' + '--output_dir', str(engine_dir), '--dtype=float16', '--num_medusa_heads=4' ] + list(args) run_command(covert_cmd) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 4458be56d..db3509bd2 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,6 +1,6 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=24.04-py3 +ARG BASE_TAG=24.05-py3 ARG DEVEL_IMAGE=devel FROM ${BASE_IMAGE}:${BASE_TAG} as base diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 9a683314d..d12a1b9f5 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,8 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 -TORCH_VERSION="2.3.0" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 +TORCH_VERSION="2.3.1" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index bc2ea72da..3649b612c 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,12 +2,12 @@ set -ex -TRT_VER="10.0.1.6" +TRT_VER="10.1.0.27" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 CUDA_VER="12.4" # 12.4.1 # Keep the installation for cuDNN if users want to install PyTorch with source codes. -# PyTorch 2.3.0 can compile with cuDNN v9. +# PyTorch 2.3.x can compile with cuDNN v9. 
CUDNN_VER="9.1.0.70-1" NCCL_VER="2.21.5-1+cuda12.4" CUBLAS_VER="12.4.5.8-1" @@ -86,8 +86,7 @@ install_tensorrt() { if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi - RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz - + RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index d3d96a128..3ad64739d 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -44,9 +44,9 @@ The following table shows the supported software for TensorRT-LLM. * - - Software Compatibility * - Container - - [24.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) + - [24.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) * - TensorRT - - [10.0](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) + - [10.1](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) * - Precision - - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4 diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index 79e4be3a4..99ba44037 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -23,10 +23,11 @@ All published functionality in the Release Notes has been fully tested and verif - TBD ### Infrastructure changes - - Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.04-py3`. - - Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.04-py3`. + - Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.05-py3`. + - Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.05-py3`. + - The dependent TensorRT version is updated to 10.1.0. - The dependent CUDA version is updated to 12.4.1. - - The dependent PyTorch version is updated to 2.3.0. + - The dependent PyTorch version is updated to 2.3.1. ## TensorRT-LLM Release 0.10.0 diff --git a/docs/source/speculative_decoding.md b/docs/source/speculative_decoding.md index 5359e2193..8864e0511 100644 --- a/docs/source/speculative_decoding.md +++ b/docs/source/speculative_decoding.md @@ -15,10 +15,10 @@ If the first assumption holds true, the latency of speculative decoding will no The combination of both these allows speculative decoding to result in reduced latency. TensorRT-LLM supports several approaches for generating draft tokens, including: + 1. Utilizing a smaller, auxiliary model, known as the draft model approach. For more information, refer to the [Fast Inference from Transformers via Speculative Decoding paper](https://arxiv.org/pdf/2211.17192.pdf). 2. Implementing additional language model heads that predict tokens for future positions, as detailed in the [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774). 
- ## Performance Improvements It's important to note that the effectiveness of speculative decoding techniques is highly dependent @@ -56,6 +56,166 @@ it is advisable to enable KV cache reuse for both models. This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command and setting `enableBlockReuse=true` in the `KVCacheConfig`. +## Using Draft model approach with Triton Inference Server + ++ The draft model approach has been supported since TensorRT-LLM-0.7.0 (using two separate Triton servers to serve the draft and target models respectively), and was significantly optimized in TensorRT-LLM-0.10.0 (using a single Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS). ++ The source file of the draft model approach with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py). ++ This example is based on TensorRT-LLM-0.10.0 and TRTLLM-backend-0.10.0, using the Docker image `nvcr.io/nvidia/tritonserver:24.05-trtllm-py3`. ++ Llama-7B-hf and Llama-30B-hf are used as the draft and target models respectively in this example, assuming the paths to the model repositories are `DRAFT_MODEL_PATH` and `TARGET_MODEL_PATH`. ++ The maximum number of draft tokens is set to 10 in this example. + +1. Prepare TensorRT engine for inference + + Here are the commands to build the draft / target engines in FP16 or FP8. All combinations of data types (Draft-FP16/FP8 + Target-FP16/FP8) are supported. + + `--remove_input_padding=enable --paged_kv_cache=enable` are necessary for inflight-batching. + + `--context_fmha=enable --use_paged_context_fmha=enable` are optional, but recommended for performance. + + `--gather_generation_logits` is necessary if using generation logits for selecting tokens in the target model. + + `--tp_size` can be set if using TP mode for the draft / target model. + + A `--max_batch_size` greater than 1 is acceptable in general usage, but we use 1 in this example.
+ + ```bash + export MAX_DRAFT_LENGTH=10 + export COMMON_COMMAND="--max_batch_size=1 --max_input_len=2048 --max_output_len=1024 --gpt_attention_plugin=float16 --gemm_plugin=float16 --remove_input_padding=enable --paged_kv_cache=enable --context_fmha=enable --use_paged_context_fmha=enable --gather_generation_logits" + export DRAFT_COMMAND_FP16="$COMMON_COMMAND" + export TARGET_COMMAND_FP16="$DRAFT_COMMAND_FP16 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external" + export DRAFT_COMMAND_FP8="$COMMON_COMMAND --strongly_typed --use_fp8_context_fmha=enable" + export TARGET_COMMAND_FP8="$DRAFT_COMMAND_FP8 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external" + + # Build checkpoints and engines in tensorrt_llm/examples/llama/ + # FP16 mode + export DRAFT_NAME=llama-7b-fp16-tp1 + export TARGET_NAME=llama-30b-fp16-tp1 + python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1 + python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1 + trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP16 + trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP16 + export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME + export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME + + # FP8 mode + export DRAFT_NAME=llama-7b-fp8-tp1 + export TARGET_NAME=llama-30b-fp8-tp1 + python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1 + python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1 + trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP8 + trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP8 + export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME + export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME + ``` + +2. Edit Triton configuration + + If both the draft and target models can be placed on one GPU (for example, llama-7B-FP8 + llama-30B-FP8, roughly 40GiB in total on one H100-80GiB GPU), `DRAFT_GPU_DEVICE_IDS` and `TARGET_GPU_DEVICE_IDS` can be the same, for example `0`. This appears to perform better than placing the models on two separate GPUs. + + Otherwise, the draft and target models can be placed on different GPUs, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1"`. + + Furthermore, if TP mode is used, the value of `GPU_DEVICE_IDS` can be a list, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1,2,3,4"`. + + For more details on configuring and launching models with the Triton server, please visit the [TensorRT-LLM Backend repo](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/README.md).
+ + ```bash + ACCUMULATE_TOKEN="false" + BACKEND="tensorrtllm" + BATCH_SCHEDULER_POLICY="guaranteed_no_evict" + BATCHING_STRATEGY="inflight_fused_batching" + BLS_INSTANCE_COUNT="1" + DECODING_MODE="top_k_top_p" + DECOUPLED_MODE="False" + DRAFT_GPU_DEVICE_IDS="0" + E2E_MODEL_NAME="ensemble" + ENABLE_KV_CACHE_REUSE="true" + ENGINE_PATH=$TARGET_ENGINE_PATH + EXCLUDE_INPUT_IN_OUTPUT="false" + KV_CACHE_FREE_GPU_MEM_FRACTION="0.8" + MAX_ATTENTION_WINDOW_SIZE="" + MAX_BEAM_WIDTH="1" + MAX_QUEUE_DELAY_MICROSECONDS="0" + MAX_TOKENS_IN_KV_CACHE="" + NORMALIZE_LOG_PROBS="true" + POSTPROCESSING_INSTANCE_COUNT="1" + PREPROCESSING_INSTANCE_COUNT="1" + TARGET_GPU_DEVICE_IDS="1" + TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft" + TENSORRT_LLM_MODEL_NAME="tensorrt_llm" + TOKENIZER_PATH=$DRAFT_MODEL_PATH + TOKENIZER_TYPE=llama + TRITON_GRPC_PORT="8001" + TRITON_HTTP_PORT="8000" + TRITON_MAX_BATCH_SIZE="4" + TRITON_METRICS_PORT="8002" + TRITON_REPO="triton_repo" + USE_DRAFT_LOGITS="false" + + # Make a copy of triton repo and replace the fields in the configuration files + cd /tensorrtllm_backend/ + apt-get update && apt-get install -y build-essential cmake git-lfs + pip3 install git-lfs tritonclient grpcio + rm -rf ${TRITON_REPO} + cp -R all_models/inflight_batcher_llm ${TRITON_REPO} + python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE} + python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT} + python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT} + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME} + + # Make a copy of tensorrt_llm as configurations of draft / target models. 
+ cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft + sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${TARGET_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE} + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${DRAFT_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE} + ``` + +3. Launch Triton server + + `--multi-model` is necessary if TP mode is used for target model. + + ```bash + python3 scripts/launch_triton_server.py \ + --model_repo=${TRITON_REPO} \ + --tensorrt_llm_model_name "tensorrt_llm,tensorrt_llm_draft" \ + --multi-model \ + --log & + ``` + + + Verbose log will be written in to file `triton_log.txt`. Triton server launches successfully if you see the output below in the file: + + ```txt + Started HTTPService at 0.0.0.0:8000 + Started GRPCInferenceService at 0.0.0.0:8001 + Started Metrics Service at 0.0.0.0:8002 + ``` + +4. Send Requests + + Prepare a JSON file `input_data.json` containing input data as below (more requests are acceptable). + + ```json + [ + { + "input": "James Best, best known for his ", + "instruction": "Continue writing the following story:", + "output": " " + } + ] + ``` + + + Use command below to launch requests for inference. + + `--num-draft-tokens` can be modified by runtime draft lengths, 4 is used in this example. + + ```bash + python3 tools/inflight_batcher_llm/speculative_decoding_test.py \ + --max-input-len 2048 \ + --dataset=input_data.json \ + --url-target=localhost:8001 \ + --url-draft=localhost:8001 \ + --draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \ + --target-tensorrt-llm-model-name="${TENSORRT_LLM_MODEL_NAME}" \ + --bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \ + --execute-bls-speculative-decoding \ + --disable-output-comparison \ + --num-draft-tokens=4 \ + --verbose + ``` + +5. 
Kill Tritonserver after finishing inference + + ```bash + pkill -9 -f trtllmExecutorWorker + pkill -9 -f tritonserver + ``` + # Medusa This approach leverages a single model to both generate and verify draft tokens. diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index f51033e19..89b722b3e 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index 047fb0e86..5626d3984 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index 58286c4c4..3236169a0 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/cogvlm/convert_checkpoint.py b/examples/cogvlm/convert_checkpoint.py index fec672fe3..a52df63d7 100644 --- a/examples/cogvlm/convert_checkpoint.py +++ b/examples/cogvlm/convert_checkpoint.py @@ -119,7 +119,7 @@ def parse_arguments(): 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' ) parser.add_argument( - '--modelopt_quant_ckpt_path', + '--quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint in .npz format') @@ -438,7 +438,7 @@ def covert_and_save(rank): if args.use_weight_only and args.weight_only_precision == 'int4_gptq': weights = load_weights_from_gptq( - args.modelopt_quant_ckpt_path, + args.quant_ckpt_path, PretrainedConfig.from_dict(copy.deepcopy(config)), ) diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index cd04908b5..5de3a3747 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 567d1d89d..77d72b907 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/convert_checkpoint.py b/examples/gemma/convert_checkpoint.py index 38895306f..2e3e3ba41 100644 --- a/examples/gemma/convert_checkpoint.py +++ b/examples/gemma/convert_checkpoint.py @@ -71,7 +71,7 @@ def parse_arguments(): "By default, we use dtype for KV cache. 
fp8_kv_cache chooses fp8 quantization for KV", ) parser.add_argument( - "--modelopt_quant_ckpt_path", + "--quant_ckpt_path", default=None, help= "Path of a directory to quantized model checkpoints in .safetensors format or \ @@ -944,7 +944,7 @@ def convert(worker_rank, args, convert_kwargs): weight_scales = quantize_fp8_weights( weights, trt_llm_config.num_hidden_layers, trt_llm_config.mapping) - scales = load_from_fp8_gemma(args.modelopt_quant_ckpt_path, + scales = load_from_fp8_gemma(args.quant_ckpt_path, trt_llm_config.num_hidden_layers, trt_llm_config.mapping, args.fp8_kv_cache, weight_scales) diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index 9a2afb89d..296bc6255 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,7 +3,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 954293a71..03735ad42 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index 58af19bd7..9cd6ac378 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/README.md b/examples/gptneox/README.md index d7c48483f..9d77d8d58 100644 --- a/examples/gptneox/README.md +++ b/examples/gptneox/README.md @@ -167,7 +167,7 @@ sh gptq_convert.sh ### 3. Convert weights from HF Transformers to TensorRT-LLM format To apply groupwise quantization GPTQ, addition command-line flags need to be passed to `convert_checkpoint.py`: -Here `--modelopt_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script. +Here `--quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script. 
```bash # Single GPU @@ -175,7 +175,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \ --dtype float16 \ --use_weight_only \ --weight_only_precision int4_gptq \ - --modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ + --quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ --output_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/ # With 2-way Tensor Parallel python3 convert_checkpoint.py --model_dir ./gptneox_model \ @@ -184,7 +184,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \ --weight_only_precision int4_gptq \ --tp_size 2 \ --workers 2 \ - --modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ + --quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ --output_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/ ``` diff --git a/examples/gptneox/convert_checkpoint.py b/examples/gptneox/convert_checkpoint.py index d4064f981..67bf72772 100644 --- a/examples/gptneox/convert_checkpoint.py +++ b/examples/gptneox/convert_checkpoint.py @@ -50,7 +50,7 @@ def parse_arguments(): 'Define the precision for the weights when using weight-only quantization.' 'You must also use --use_weight_only for that argument to have an impact.' ) - parser.add_argument('--modelopt_quant_ckpt_path', + parser.add_argument('--quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint') @@ -708,8 +708,7 @@ def convert_hf_gptneox(hf_model, 'has_zero_point': True, 'group_size': - get_gptq_gptneox_group_size(args.modelopt_quant_ckpt_path, - hf_config) + get_gptq_gptneox_group_size(args.quant_ckpt_path, hf_config) }) with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: @@ -723,7 +722,7 @@ def covert_and_save(rank): if args.use_weight_only and args.weight_only_precision == 'int4_gptq': weights = load_from_gptq_gptneox( - args.modelopt_quant_ckpt_path, + args.quant_ckpt_path, hf_config, use_parallel_embedding=args.use_parallel_embedding, sharding_dim=args.embedding_sharding_dim, diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index 8edbc46e6..b6d9a8b15 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index b6e249ae7..7480cc654 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,6 +1,6 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/high-level-api/requirements.txt b/examples/high-level-api/requirements.txt index 182e2ccc1..7833f8eee 100644 --- a/examples/high-level-api/requirements.txt +++ b/examples/high-level-api/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 6c49a765f..e86e7fd2d 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.5 rouge_score~=0.1.2 
sentencepiece~=0.1.99 diff --git a/examples/llama/README.md b/examples/llama/README.md index 96c9cf8e9..84ebc94c1 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -644,7 +644,7 @@ One can enable AWQ/GPTQ INT4 weight only quantization with these options when bu - `--use_weight_only` enables weight only GEMMs in the network. - `--per_group` enable groupwise weight only quantization, for GPT-J example, we support AWQ with the group size default as 128. - `--weight_only_precision` should specify the weight only quantization format. Supported formats are `int4_awq` or `int4_gptq`. -- `--modelopt_quant_ckpt_path` passes the quantized checkpoint to build the engine. +- `--quant_ckpt_path` passes the quantized checkpoint to build the engine. AWQ/GPTQ examples below involves 2 steps: 1. Weight quantization @@ -700,7 +700,7 @@ To run the GPTQ LLaMa example, the following steps are required: python convert_checkpoint.py --model_dir /tmp/llama-7b-hf \ --output_dir ./tllm_checkpoint_2gpu_gptq \ --dtype float16 \ - --modelopt_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \ + --quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \ --use_weight_only \ --weight_only_precision int4_gptq \ --per_group \ diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index f8e55c97d..814a8c824 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/mamba/README.md b/examples/mamba/README.md index 710bb10f0..d3cfd78d4 100644 --- a/examples/mamba/README.md +++ b/examples/mamba/README.md @@ -29,9 +29,6 @@ Please install required packages first and setup `git-lfs`: ```bash pip install -r requirements.txt -pip install "transformers>=4.39.0" - -# Setup git-lfs git lfs install ``` @@ -98,8 +95,7 @@ The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpo # mamba-2.8b trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin bfloat16 \ - --mamba_conv1d_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -108,8 +104,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \ # mamba-1.4b trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -118,8 +113,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \ # mamba-790m trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -128,8 +122,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \ # mamba-370m trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -138,8 +131,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \ # mamba-130m trtllm-build --checkpoint_dir 
./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 5d3b4823e..c64cfbf6a 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 +transformers>=4.39.0 datasets~=2.14.5 evaluate rouge_score diff --git a/examples/medusa/README.md b/examples/medusa/README.md index 587363966..1ef2d4377 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -13,6 +13,7 @@ For more info about Medusa visit [speculative decoding documentation](../../docs * GPU Compute Capability >= 8.0 (Ampere or newer) * FP16 * BF16 + * FP8 (base model) * PAGED_KV_CACHE * Tensor Parallel @@ -32,7 +33,7 @@ https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3 ``` We use `convert_checkpoint.py` script to convert the model for Medusa decoding into TensorRT-LLM checkpoint format. -Here we also add `--fixed_num_medusa_heads 4` as `medusa_num_heads` is 2 in `config.json` of `medusa-vicuna-7b-v1.3` but it actually has 4. +We could use `--num_medusa_heads` to set the number of medusa heads that we want to use. If not, `num_medusa_heads` will be set according to the `medusa_num_heads` from medusa weights' `config.json`. Here is the example: ```bash @@ -41,20 +42,21 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ --medusa_model_dir medusa-vicuna-7b-v1.3 \ --output_dir ./tllm_checkpoint_1gpu_medusa \ --dtype float16 \ - --fixed_num_medusa_heads 4 + --num_medusa_heads 4 +# Note: Increasing the batch size may have a negative impact on performance trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \ --output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \ --gemm_plugin float16 \ --speculative_decoding_mode medusa \ - --max_batch_size 8 + --max_batch_size 4 # Convert and Build Medusa decoding support for vicuna-13b-v1.3 with 4-way tensor parallelism. python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ --medusa_model_dir medusa-vicuna-7b-v1.3 \ --output_dir ./tllm_checkpoint_1gpu_medusa \ --dtype float16 \ - --fixed_num_medusa_heads 4 \ + --num_medusa_heads 4 \ --tp_size 4 \ --workers 4 @@ -62,7 +64,30 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \ --output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \ --gemm_plugin float16 \ --speculative_decoding_mode medusa \ - --max_batch_size 8 + --max_batch_size 4 +``` + +### FP8 Post-Training Quantization for Base Model +The example below quantizes the base model to FP8, while keeping the weight of the medusa head non-quantize. 
+```bash +# Quantize base model into FP8 and export trtllm checkpoint +python ../quantization/quantize.py --model_dir /path/to/base-model-hf/ \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \ + --calib_size 512 \ + --tp_size 1 \ + --medusa_model_dir /path/to/medusa_head/ \ + --num_medusa_heads 4 + +# Build trtllm engines from the trtllm checkpoint +trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \ + --output_dir ./trt_engine_1gpu_base_model_fp8_medusa_fp16 \ + --gemm_plugin float16 \ + --gpt_attention_plugin float16 \ + --speculative_decoding_mode medusa \ + --max_batch_size 4 ``` ### Run diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index b369facfd..86dc8b986 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -19,12 +19,13 @@ from transformers.pytorch_utils import Conv1D import tensorrt_llm -from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import PretrainedConfig from tensorrt_llm.models.convert_utils import load_calib_dataset from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard +from tensorrt_llm.models.medusa.weight import (get_tllm_linear_weight, + load_medusa_hf) from tensorrt_llm.quantization import QuantAlgo try: @@ -108,11 +109,6 @@ def parse_arguments(): help= 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' ) - parser.add_argument( - '--modelopt_quant_ckpt_path', - type=str, - default=None, - help='Path of a quantized model checkpoint in .npz format') parser.add_argument( '--per_group', @@ -182,13 +178,6 @@ def parse_arguments(): help='The number of workers for converting checkpoint in parallel') parser.add_argument('--num_medusa_heads', type=int, default=4) - parser.add_argument( - '--fixed_num_medusa_heads', - type=int, - default=None, - help="If exist, fix medusa_num_heads from config.json." 
- "num_medusa_heads < medusa_num_heads in config.json < fixed_num_medusa_heads" - ) parser.add_argument('--num_medusa_layers', type=int, default=1) parser.add_argument('--max_medusa_token_len', type=int, default=63) parser.add_argument('--medusa_hidden_act', type=str, default="silu") @@ -570,29 +559,6 @@ def get_weight_and_bias(config, prefix, dtype): return get_weight(config, prefix, dtype), get_bias(config, prefix, dtype) -def get_tllm_linear_weight(weight, - prefix, - bias=None, - use_weight_only=False, - plugin_weight_only_quant_type=torch.int8, - postfix='weight'): - results = {} - if use_weight_only: - v = weight.t().contiguous().cpu() - processed_torch_weights, torch_weight_scales = \ - torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix( - v, plugin_weight_only_quant_type) - results[prefix + postfix] = processed_torch_weights - results[prefix + 'per_channel_scale'] = torch_weight_scales - else: - results[prefix + postfix] = weight.contiguous() - - if bias is not None: - results[prefix + 'bias'] = bias - - return results - - def dup_kv_weight(v, num_head, tp_size): assert tp_size % num_head == 0 reps = tp_size // num_head @@ -1189,77 +1155,28 @@ def covert_and_save(rank, convert_args): qkv_para=convert_args['llama_qkv_para'], smoother=convert_args['llama_smoother']) - def load_medusa_hf(medusa_path: str, - mapping=Mapping(), - dtype='float32'): - logger.info("Loading Medusa heads' weights ...") - is_ckpt_safetensors = False - - ckpt_file = Path(medusa_path) / "medusa_lm_head.pt" - if not ckpt_file.exists(): - ckpt_file = Path( - medusa_path) / "medusa_lm_head.safetensors" - is_ckpt_safetensors = True - - if is_ckpt_safetensors: - logger.info("Safetensors Found ...") - from safetensors.torch import load_file - state_dict = load_file(ckpt_file) - else: - state_dict = torch.load(ckpt_file, map_location="cpu") - - torch_dtype = str_dtype_to_torch(dtype) - weights = {} - - for h in range(args.num_medusa_heads): - for l in range(args.num_medusa_layers): - w = state_dict[f"{h}.{l}.linear.weight"].clone().to( - torch_dtype) - - split_v = split(w, mapping.tp_size, mapping.tp_rank) - weights.update( - get_tllm_linear_weight( - split_v, - f'medusa_heads.{h}.medusa_layers.{l}.linear.', - None, args.use_weight_only, - plugin_weight_only_quant_type)) - - b = state_dict[f"{h}.{l}.linear.bias"].clone().to( - torch_dtype) - - weights[ - 'medusa_heads.{}.medusa_layers.{}.linear.bias'. 
- format(h, l)] = split(b, mapping.tp_size, - mapping.tp_rank) - - lm = state_dict[ - f"{h}.{args.num_medusa_layers}.weight"].clone().to( - torch_dtype) # LM Head - - weights['medusa_heads.{}.lm_head.weight'.format( - h)] = split(lm, mapping.tp_size, mapping.tp_rank) - - return weights - if args.medusa_model_dir is not None: config_file = Path(args.medusa_model_dir) / "config.json" with open(config_file) as fp: config = json.load(fp) - args.num_medusa_heads = config.get('medusa_num_heads', - args.num_medusa_heads) + num_medusa_heads_from_config = config.get( + 'medusa_num_heads', args.num_medusa_heads) args.num_medusa_layers = config.get('medusa_num_layers', args.num_medusa_layers) - if args.fixed_num_medusa_heads is not None and args.fixed_num_medusa_heads != args.num_medusa_heads: - logger.info( - f"fixing num_medusa_heads from {args.num_medusa_heads} to {args.fixed_num_medusa_heads}" - ) - args.num_medusa_heads = args.fixed_num_medusa_heads + if args.num_medusa_heads is None: + args.num_medusa_heads = num_medusa_heads_from_config assert args.max_medusa_token_len > 0, "should have max_medusa_token_len > 0" - medusa_weights = load_medusa_hf(args.medusa_model_dir, - mapping, - dtype=args.dtype) + medusa_weights = load_medusa_hf( + medusa_path=args.medusa_model_dir, + num_medusa_heads=args.num_medusa_heads, + num_medusa_layers=args.num_medusa_layers, + mapping=mapping, + dtype=args.dtype, + use_weight_only=args.use_weight_only, + plugin_weight_only_quant_type= + plugin_weight_only_quant_type) weights.update(medusa_weights) safetensors.torch.save_file( diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index b29584c99..f396bbdcc 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 9eb7892b0..6a15339a6 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/mmlu.py b/examples/mmlu.py index 18ea6d4a5..0ce98a7be 100644 --- a/examples/mmlu.py +++ b/examples/mmlu.py @@ -363,11 +363,13 @@ def main(): if args.test_trt_llm: assert not args.test_hf, "Cannot test both TRT-LLM and HF" - runner_cls = ModelRunner if (args.debug_mode - or not PYTHON_BINDINGS) else ModelRunnerCpp + runner_cls = ModelRunner if not PYTHON_BINDINGS else ModelRunnerCpp + runner_kwargs = {} + if PYTHON_BINDINGS: + runner_kwargs.update(max_beam_width=1) model = runner_cls.from_dir(args.engine_dir, rank=runtime_rank, - debug_mode=args.debug_mode) + **runner_kwargs) else: assert args.test_hf, "Must test either TRT-LLM or HF" if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 58af19bd7..9cd6ac378 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index 88383335b..6e9c7e8f7 100644 --- 
a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 transformers==4.40.2 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index 58af19bd7..9cd6ac378 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/README.md b/examples/phi/README.md index 5dd4f7491..a9ee4f2c4 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -15,10 +15,9 @@ models using TensorRT-LLM and run on a single GPU. ## Overview -The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There are two files: +The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file: * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format -* [`postprocess_quant_checkpoint.py`](./postprocess_quant_checkpoint.py) to post-process FP8 or INT8 SmoothQuant quantized checkpoints for Phi-3-small variants. In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation: @@ -29,18 +28,19 @@ In addition, there are two shared files in the parent folder [`examples`](../) f * FP16 * BF16 * FP8 + * INT8 * Tensor Parallel ## Support Matrix -| Model Name | FP16 | BF16 | FP8 | TP | -| :--------------: | :---: | :---: | :---: | :---: | -| phi-2 | Y | Y | | Y | -| Phi-3-mini-4k-instruct | Y | Y | | | -| Phi-3-mini-128k-instruct | Y | Y | | | -| Phi-3-small-8k-instruct | Y | Y | Y | Y | -| Phi-3-small-128k-instruct | Y | Y | Y | Y | -| Phi-3-medium-8k-instruct | Y | Y | | | Y | -| Phi-3-medium-128k-instruct | Y | Y | | | Y | +| Model Name | FP16 | BF16 | FP8 | INT8 | TP | +| :--------------: | :---: | :---: | :---: | :---: | :---: | +| phi-2 | Y | Y | | | Y | +| Phi-3-mini-4k-instruct | Y | Y | Y | Y | +| Phi-3-mini-128k-instruct | Y | Y | Y | Y | +| Phi-3-small-8k-instruct | Y | Y | Y | Y | Y | +| Phi-3-small-128k-instruct | Y | Y | Y | Y | Y | +| Phi-3-medium-8k-instruct | Y | Y | Y | Y | +| Phi-3-medium-128k-instruct | Y | Y | Y | Y | * Model Name: the name of the model, the same as the name on HuggingFace * TP: Tensor Parallel @@ -128,9 +128,9 @@ python3 ../summarize.py --engine_dir ./phi-engine-tp2 \ ``` -### 5. Quantization options for Phi-3-small +### 5. Quantization -Phi-3-small variants support post-training quantization to FP8 and INT8 SmoothQuant formats. +All Phi-3 variants support post-training quantization to FP8 and INT8 SmoothQuant formats. 
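Editorial note: the separate `postprocess_quant_checkpoint.py` pass that the old Phi-3-small instructions required is removed below because the same QKV re-shuffling now happens during checkpoint conversion itself. A minimal sketch of the equivalent logic, assuming the `shuffle_qkv_weights` helper is importable as `tensorrt_llm.models.phi3.split_weights` (absolute path inferred from the relative import added elsewhere in this patch):

```python
# Illustrative sketch only -- not the shipped implementation.
# Quantized Phi-3-small checkpoints no longer need a separate post-processing step;
# fused QKV tensors are reordered to the TRT-LLM layout while converting weights.
from tensorrt_llm.models.phi3.split_weights import shuffle_qkv_weights  # assumed module path


def shuffle_phi3_small_qkv(weights: dict, config: dict) -> dict:
    """Reorder fused QKV tensors from the custom Phi-3-small layout to TRT-LLM's."""
    for key, value in weights.items():
        if "qkv." in key:
            weights[key] = shuffle_qkv_weights(value, config)
    return weights
```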
FP8 checkpoints can be built as follows: @@ -141,8 +141,6 @@ python3 ../quantization/quantize.py \ --output_dir ./phi3-checkpoint \ --dtype $DTYPE \ --qformat fp8 --kv_cache_dtype fp8 - -python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint ``` INT8 checkpoints can be built as follows: @@ -154,8 +152,6 @@ python3 ../quantization/quantize.py \ --output_dir ./phi3-checkpoint \ --dtype $DTYPE \ --qformat int8_sq --kv_cache_dtype int8 - -python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint ``` The commands to [build TensorRT engines](#2-build-tensorrt-engines) from quantized checkpoints diff --git a/examples/phi/convert_checkpoint.py b/examples/phi/convert_checkpoint.py index 30fde7092..6f562952f 100644 --- a/examples/phi/convert_checkpoint.py +++ b/examples/phi/convert_checkpoint.py @@ -19,8 +19,7 @@ from transformers import AutoConfig import tensorrt_llm -from tensorrt_llm.models import (Phi3ForCausalLM, Phi3SmallForCausalLM, - PhiForCausalLM) +from tensorrt_llm.models import Phi3ForCausalLM, PhiForCausalLM def parse_arguments(): @@ -81,16 +80,14 @@ def parse_arguments(): model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) model_type = model_config.architectures[0] - supported_model = { - 'PhiForCausalLM': PhiForCausalLM, - 'Phi3ForCausalLM': Phi3ForCausalLM, - 'Phi3VForCausalLM': Phi3ForCausalLM, - 'Phi3SmallForCausalLM': Phi3SmallForCausalLM - } + supported_models = [ + 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM', + 'Phi3SmallForCausalLM' + ] modelForCausalLM = None - if model_type not in supported_model: + if model_type not in supported_models: assert False, "Invalid model type" - modelForCausalLM = supported_model[model_type] + modelForCausalLM = PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM modelForCausalLM.convert_hf_checkpoint(args.model_dir, dtype=args.dtype, diff --git a/examples/phi/postprocess_quant_checkpoint.py b/examples/phi/postprocess_quant_checkpoint.py deleted file mode 100644 index 8c9cc28bb..000000000 --- a/examples/phi/postprocess_quant_checkpoint.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import json -import time - -import safetensors -from safetensors.torch import save_file - -import tensorrt_llm -from tensorrt_llm.models.phi3.phi3small.convert import shuffle_qkv_weights - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--checkpoint_dir', type=str, default=None) - args = parser.parse_args() - - return args - - -if __name__ == '__main__': - print(tensorrt_llm.__version__) - args = parse_arguments() - tensorrt_llm.logger.set_level('info') - - tik = time.time() - with open(f"{args.checkpoint_dir}/config.json", "r") as f: - config = json.load(f) - - weights = {} - with safetensors.safe_open(f"{args.checkpoint_dir}/rank0.safetensors", - framework="pt") as f: - for k in f.keys(): - weights[k] = f.get_tensor(k) - - # Transform QKV weights from custom Phi3Small format to TRT-LLM format - num_total_heads = config[ - 'num_attention_heads'] + 2 * config['num_key_value_heads'] - for key, value in weights.items(): - if "qkv." in key: - if 'scaling_factor' in key and value.shape[0] % num_total_heads != 0: - continue - weights[key] = shuffle_qkv_weights(value, config) - - save_file(weights, f'{args.checkpoint_dir}/rank0.safetensors') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - print(f'Total time of converting checkpoints: {t}') diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index 0bf746b21..07d57c486 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/quantize.py b/examples/quantization/quantize.py index f1ce69d41..f2fe6f3c9 100644 --- a/examples/quantization/quantize.py +++ b/examples/quantization/quantize.py @@ -90,6 +90,17 @@ help="KV Cache dtype.", default=None, choices=["int8", "fp8", None]) + # Medusa + parser.add_argument('--num_medusa_heads', type=int, default=4) + parser.add_argument('--num_medusa_layers', type=int, default=1) + parser.add_argument('--max_draft_len', type=int, default=63) + parser.add_argument('--medusa_hidden_act', type=str, default="silu") + parser.add_argument('--medusa_model_dir', type=str, default=None) + parser.add_argument('--quant_medusa_head', + default=False, + action='store_true', + help="whether to quantize the weights of medusa heads") + args = parser.parse_args() if args.model_dir is not None: @@ -108,7 +119,13 @@ tp_size=args.tp_size, pp_size=args.pp_size, seed=args.seed, - tokenizer_max_seq_length=args.tokenizer_max_seq_length) + tokenizer_max_seq_length=args.tokenizer_max_seq_length, + num_medusa_heads=args.num_medusa_heads, + num_medusa_layers=args.num_medusa_layers, + max_draft_len=args.max_draft_len, + medusa_hidden_act=args.medusa_hidden_act, + medusa_model_dir=args.medusa_model_dir, + quant_medusa_head=args.quant_medusa_head) elif args.nemo_ckpt_path is not None: quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path, decoder_type=args.decoder_type, diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index b904ad81d..98ab003b8 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git 
a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index f1d4accf8..80a4f7c70 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 09ee007be..0dc21bb2d 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 90d3ff974..6d8673322 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 git+https://github.com/google-deepmind/recurrentgemma.git flax>=0.8.2 jax~=0.4.23 diff --git a/examples/run.py b/examples/run.py index bf802cb9b..eb1313e57 100644 --- a/examples/run.py +++ b/examples/run.py @@ -247,7 +247,7 @@ def main(args): model_name, model_version = read_model_name( args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None: + if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: logger.warning( "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." ) diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index afc5cb1e1..a2bbe457e 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index f8e55c97d..814a8c824 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index a5f91efea..b2a1ae5bc 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 tiktoken datasets kaldialign diff --git a/requirements.txt b/requirements.txt index 259a73da8..d5d157897 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,12 +16,13 @@ pandas h5py==3.10.0 StrEnum sentencepiece>=0.1.99 -tensorrt==10.0.1 -# https://github.com/pytorch/pytorch/blob/v2.3.0/version.txt uses 2.3.0a0. -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 uses 2.3.0a0. -torch>=2.3.0a,<=2.3.0 +tensorrt==10.1.0 +# https://github.com/pytorch/pytorch/blob/v2.3.1/version.txt uses 2.3.0a0. +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 uses 2.4.0a0. 
+torch>=2.3.0a0,<=2.4.0a0 nvidia-modelopt~=0.11,<0.12 transformers>=4.38.2 +pillow==10.2.0 wheel optimum evaluate diff --git a/tensorrt_llm/auto_parallel/parallelization.py b/tensorrt_llm/auto_parallel/parallelization.py index 6883616db..44f2cd59d 100644 --- a/tensorrt_llm/auto_parallel/parallelization.py +++ b/tensorrt_llm/auto_parallel/parallelization.py @@ -1146,8 +1146,8 @@ def shard_gpt_attention(self, context: ShardContext): num_kv_heads = plugin_info.pfc_as_ndarray["num_kv_heads"].copy() tp_size = plugin_info.pfc_as_ndarray["tp_size"].copy() tp_rank = plugin_info.pfc_as_ndarray["tp_rank"].copy() - num_kv_heads = num_kv_heads // kv_partition - num_heads = num_heads // partition + num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1) + num_heads = np.maximum(num_heads // partition, 1) tp_size[0] = partition tp_rank[0] = index diff --git a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py index 42a2eeb8a..419308c7c 100644 --- a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py +++ b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py @@ -15,6 +15,11 @@ def __init__(self, layer): layer.name) layer.to_base_class() + def _collect_strategies(self, device_mesh): + raise NotImplementedError( + f"Auto parallel does not support {self.plugin_type} plugin right now." + ) + def _default_strategy(self, device_mesh): strategies_vector = StrategiesVector(self) dim_partition_dict_mapping = {} diff --git a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py index 86b334da9..a818e4153 100644 --- a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py +++ b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py @@ -1,5 +1,6 @@ from enum import Enum, auto +import numpy as np import torch from tensorrt_llm.functional import PositionEmbeddingType @@ -369,8 +370,8 @@ def _profile_sharding_cost(self, strategy, device_mesh): num_kv_heads = self.plugin_info.pfc_as_ndarray["num_kv_heads"].copy() tp_size = self.plugin_info.pfc_as_ndarray["tp_size"].copy() tp_rank = self.plugin_info.pfc_as_ndarray["tp_rank"].copy() - num_kv_heads = num_kv_heads // kv_partition - num_heads = num_heads // partition + num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1) + num_heads = np.maximum(num_heads // partition, 1) tp_size[0] = partition tp_rank[0] = 0 diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index 1c614c445..4384894f1 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -30,10 +30,9 @@ from ..logger import logger from ..lora_manager import LoraConfig, LoraManager from ..models import MODEL_MAP, PretrainedConfig -from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, QuantConfig, +from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, SpeculativeDecodingMode) from ..plugin import PluginConfig, add_plugin_argument -from ..quantization import QuantAlgo def parse_arguments(): @@ -133,10 +132,6 @@ def parse_arguments(): type=str, default=None, choices=['float16', 'float32']) - parser.add_argument('--weight_only_precision', - type=str, - default=None, - choices=['int8', 'int4']) parser.add_argument('--weight_sparsity', default=False, action='store_true') parser.add_argument( '--max_draft_len', @@ -270,14 +265,6 @@ def build_model(build_config: BuildConfig, if logits_dtype is not None: model_config.logits_dtype = 
logits_dtype - weight_only_precision = kwargs.get('weight_only_precision', None) - if not model_config.quant_mode.has_any_quant( - ) and weight_only_precision is not None: - if weight_only_precision == 'int4': - model_config.quantization = QuantConfig(QuantAlgo.W4A16) - else: - model_config.quantization = QuantConfig(QuantAlgo.W8A16) - architecture = model_config.architecture assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \ "StreamingLLM is only supported in the llama model." @@ -420,7 +407,6 @@ def main(): kwargs = { 'logits_dtype': args.logits_dtype, 'use_fused_mlp': args.use_fused_mlp, - 'weight_only_precision': args.weight_only_precision, 'tp_size': args.tp_size, 'pp_size': args.pp_size, 'lora_dir': args.lora_dir, diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index b09e1d505..aa5cb7a2d 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -40,7 +40,6 @@ from .mpt.model import MPTForCausalLM, MPTModel from .opt.model import OPTForCausalLM, OPTModel from .phi3.model import Phi3ForCausalLM, Phi3Model -from .phi3.phi3small.model import Phi3SmallForCausalLM, Phi3SmallModel from .phi.model import PhiForCausalLM, PhiModel from .qwen.model import QWenForCausalLM from .recurrentgemma.model import RecurrentGemmaForCausalLM @@ -70,10 +69,8 @@ 'GPTNeoXForCausalLM', 'PhiModel', 'Phi3Model', - 'Phi3SmallModel', 'PhiForCausalLM', 'Phi3ForCausalLM', - 'Phi3SmallForCausalLM', 'ChatGLMForCausalLM', 'ChatGLMModel', 'BaichuanForCausalLM', @@ -103,7 +100,7 @@ 'FalconForCausalLM': FalconForCausalLM, 'PhiForCausalLM': PhiForCausalLM, 'Phi3ForCausalLM': Phi3ForCausalLM, - 'Phi3SmallForCausalLM': Phi3SmallForCausalLM, + 'Phi3SmallForCausalLM': Phi3ForCausalLM, 'MambaForCausalLM': MambaForCausalLM, 'GPTNeoXForCausalLM': GPTNeoXForCausalLM, 'GPTJForCausalLM': GPTJForCausalLM, diff --git a/tensorrt_llm/models/gemma/model.py b/tensorrt_llm/models/gemma/model.py index 4c4753547..2e9d80459 100644 --- a/tensorrt_llm/models/gemma/model.py +++ b/tensorrt_llm/models/gemma/model.py @@ -292,6 +292,7 @@ def from_hugging_face(cls, return tllm_llama def check_config(self, config): + config.set_if_not_exist("share_embedding_table", True) config.set_if_not_exist('use_parallel_embedding', False) config.set_if_not_exist('embedding_sharding_dim', 0) config.set_if_not_exist('mlp_bias', False) diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index ab5a81b62..5f18ff321 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -62,6 +62,80 @@ def split_num_tokens_range(max_num_tokens): [split_point[-1], max_num_tokens, max_num_tokens]) return num_tokens_ranges + @staticmethod + def get_profiles_ranges( + *, + max_batch_size, + max_beam_width, + max_input_len, + max_num_tokens, + max_draft_len, + opt_batch_size, + opt_num_tokens, + enable_ctx_gen_opt_profiles, + multiple_profiles, + ): + default_range = GenerationMixin.default_range + if opt_batch_size: + bb_range_cxt = [1, opt_batch_size, max_batch_size] + bb_range_gen = [ + 1, opt_batch_size * max_beam_width, + max_batch_size * max_beam_width + ] + else: + bb_range_cxt = default_range(max_batch_size) + bb_range_gen = default_range(max_batch_size * max_beam_width) + tokens_per_engine_step = max_draft_len + 1 + tokens_per_engine_step_range = [ + 1, tokens_per_engine_step, tokens_per_engine_step + ] + bbd_range_ctx = [ + bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1) + for i in 
range(len(bb_range_cxt)) + ] + bbd_range_gen = [ + bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1) + for i in range(len(bb_range_gen)) + ] + inlen_range_cxt = default_range(max_input_len) + inlen_range_gen = [1, 1, tokens_per_engine_step] + if enable_ctx_gen_opt_profiles: + num_profiles = 2 + bb_range = [bb_range_cxt, bb_range_gen] + bbd_range = [bbd_range_ctx, bbd_range_gen] + inlen_range = [inlen_range_cxt, inlen_range_gen] + position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]] + num_tokens_range_ctx = default_range(max_batch_size * max_input_len) + # Draft tokens cannot be combined with beam search + num_tokens_range_gen = default_range( + max_batch_size * max(tokens_per_engine_step, max_beam_width)) + num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] + else: + if multiple_profiles: + num_tokens_range = GenerationMixin.split_num_tokens_range( + max_num_tokens) + else: + if opt_num_tokens is None: + opt_num_tokens = min(max_num_tokens, + max_batch_size * max_beam_width) + num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] + num_profiles = len(num_tokens_range) + bb_range = [bb_range_gen] * num_profiles + bbd_range = [bbd_range_gen] * num_profiles + inlen_range = [[1, 1, max_input_len]] * num_profiles + position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles + tokens_per_engine_step_range = [tokens_per_engine_step_range + ] * num_profiles + ranges = { + 'bb_range': bb_range, + 'bbd_range': bbd_range, + 'inlen_range': inlen_range, + 'position_ids_inlen_range': position_ids_inlen_range, + 'num_tokens_range': num_tokens_range, + 'tokens_per_engine_step_range': tokens_per_engine_step_range, + } + return num_profiles, ranges + def prepare_attention_inputs(self, *, max_batch_size, @@ -343,61 +417,26 @@ def prepare_basic_inputs( streamingllm: bool = False, opt_batch_size=None): - default_range = GenerationMixin.default_range - tokens_per_engine_step = max_draft_len + 1 - tokens_per_engine_step_range = [ - 1, tokens_per_engine_step, tokens_per_engine_step - ] - if opt_batch_size: - bb_range_cxt = [1, opt_batch_size, max_batch_size] - bb_range_gen = [ - 1, opt_batch_size * max_beam_width, - max_batch_size * max_beam_width - ] - else: - bb_range_cxt = default_range(max_batch_size) - bb_range_gen = default_range(max_batch_size * max_beam_width) - bbd_range_ctx = [ - bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1) - for i in range(len(bb_range_cxt)) - ] - bbd_range_gen = [ - bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1) - for i in range(len(bb_range_gen)) - ] - inlen_range_cxt = default_range(max_input_len) - inlen_range_gen = [1, 1, tokens_per_engine_step] - enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding, paged_kv_cache) - if enable_ctx_gen_opt_profiles: - num_profiles = 2 - bb_range = [bb_range_cxt, bb_range_gen] - bbd_range = [bbd_range_ctx, bbd_range_gen] - inlen_range = [inlen_range_cxt, inlen_range_gen] - position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]] - num_tokens_range_ctx = default_range(max_batch_size * max_input_len) - # Draft tokens cannot be combined with beam search - num_tokens_range_gen = default_range( - max_batch_size * max(tokens_per_engine_step, max_beam_width)) - num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] - else: - if multiple_profiles: - num_tokens_range = GenerationMixin.split_num_tokens_range( - max_num_tokens) - else: - if opt_num_tokens is None: - opt_num_tokens = min(max_num_tokens, - 
max_batch_size * max_beam_width) - num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] - num_profiles = len(num_tokens_range) - bb_range = [bb_range_gen] * num_profiles - bbd_range = [bbd_range_gen] * num_profiles - inlen_range = [[1, 1, max_input_len]] * num_profiles - position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles - tokens_per_engine_step_range = [tokens_per_engine_step_range - ] * num_profiles + + num_profiles, ranges = GenerationMixin.get_profiles_ranges( + max_batch_size=max_batch_size, + max_beam_width=max_beam_width, + max_input_len=max_input_len, + max_num_tokens=max_num_tokens, + max_draft_len=max_draft_len, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens, + enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, + multiple_profiles=multiple_profiles) + bb_range = ranges['bb_range'] + bbd_range = ranges['bbd_range'] + inlen_range = ranges['inlen_range'] + num_tokens_range = ranges['num_tokens_range'] + position_ids_inlen_range = ranges['position_ids_inlen_range'] + tokens_per_engine_step_range = ranges['tokens_per_engine_step_range'] position_ids_num_tokens_range = num_tokens_range input_ids = None @@ -597,12 +636,13 @@ def prepare_basic_inputs( spec_decoding_params = None # Use positional offsets and packed mask only when not in SpS spec decoding if speculative_decoding_draft_tokens_external == False and max_draft_len > 0: + tokens_per_engine_step = max_draft_len + 1 # 32 bits packed mask aligned. num_packed_masks = (tokens_per_engine_step + 32 - 1) // 32 packed_mask_len_range = [[0, 1, num_packed_masks]] * num_profiles # total number of spec decoding tokens for all sequences (sequence length can be variable). num_gen_tokens_range = [ - default_range( + GenerationMixin.default_range( max_batch_size * max_beam_width * tokens_per_engine_step, min_range=0) ] * num_profiles diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index 7d6209040..5f243b926 100644 --- a/tensorrt_llm/models/llama/convert.py +++ b/tensorrt_llm/models/llama/convert.py @@ -1535,6 +1535,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig): moe_config = config.moe + kv_tp_size = None + kv_tp_rank = None + if config.num_key_value_heads < mapping.tp_size: + kv_tp_size = config.num_key_value_heads + kv_tp_rank = mapping.tp_rank * kv_tp_size // mapping.tp_size + model_prefix = "model." 
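# Editorial note (illustrative numbers, not part of the patch): the kv_tp_size /
# kv_tp_rank computed above handle GQA checkpoints that have fewer KV heads than
# TP ranks. For example, num_key_value_heads=8 under tp_size=16 gives kv_tp_size=8
# and kv_tp_rank = tp_rank * 8 // 16 = tp_rank // 2, so each pair of ranks loads the
# same K/V shard instead of splitting a single head further.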
key_list = [ "embed_tokens.weight", # vocab_embedding @@ -1552,7 +1558,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig): torch_dtype = str_dtype_to_torch(dtype) - def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False): + def load(key, + tp_dim=-1, + no_prefix=0, + is_expert_weights=False, + tp_size=None, + tp_rank=None): if not no_prefix: key = model_prefix + key ptr_idx = safetensors_map[key] if key in safetensors_map else 0 @@ -1560,38 +1571,28 @@ def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False): if key not in safetensors_ptrs[ptr_idx].keys(): return None + tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key) + tensor_shape = tensor_slice.get_shape() if tp_dim == -1: - res = safetensors_ptrs[ptr_idx].get_tensor(key) - else: + res = tensor_slice[:] + elif tp_dim >= 0 and tp_dim < len(tensor_shape): if is_expert_weights: tp_size = mapping.moe_tp_size tp_rank = mapping.moe_tp_rank else: - tp_size = mapping.tp_size - tp_rank = mapping.tp_rank - tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key) - tensor_shape = tensor_slice.get_shape() - if len(tensor_shape) == 1: - if tp_dim == 0: - slice_width = tensor_shape[0] // tp_size - res = tensor_slice[slice_width * tp_rank:slice_width * - (tp_rank + 1)] - else: - res = tensor_slice[:] - else: - if tensor_shape[tp_dim] % tp_size != 0: - logger.error( - "Current weight shape is invalid for tp_size=" + - str(tp_size)) - slice_width = tensor_shape[tp_dim] // tp_size - if tp_dim == 0: - res = tensor_slice[slice_width * tp_rank:slice_width * - (tp_rank + 1), :] - elif tp_dim == 1: - res = tensor_slice[:, slice_width * tp_rank:slice_width * - (tp_rank + 1)] - else: - assert False, "Invalid TP dim" + tp_size = tp_size or mapping.tp_size + tp_rank = tp_rank or mapping.tp_rank + dim_size = tensor_shape[tp_dim] + if dim_size % tp_size != 0: + logger.error( + f"Current weight shape {tensor_shape} is invalid at dimension {tp_dim} for TP size {tp_size}" + ) + indices = [slice(None)] * len(tensor_shape) + indices[tp_dim] = slice(dim_size * tp_rank // tp_size, + dim_size * (tp_rank + 1) // tp_size) + res = tensor_slice[indices] + else: + raise ValueError(f"Invalid TP dim: {tp_dim}") return res.to(torch_dtype).contiguous( ) if "block_sparse_moe.gate" not in key else res.to(torch.float32) @@ -1632,11 +1633,19 @@ def load_and_set(target, # Attention qkv_list = [] for comp in ["q", "k", "v"]: - weight_part = load(prefix + key_list[3] + comp + key_list[4], 0) + tp_size = kv_tp_size if comp != "q" else None + tp_rank = kv_tp_rank if comp != "q" else None + weight_part = load(prefix + key_list[3] + comp + key_list[4], + 0, + tp_size=tp_size, + tp_rank=tp_rank) qkv_list.append(weight_part) bias_part = load( (prefix + key_list[3] + comp + key_list[4]).replace( - "weight", "bias"), 0) + "weight", "bias"), + 0, + tp_size=tp_size, + tp_rank=tp_rank) if bias_part is not None: qkv_list.append(bias_part) if len(qkv_list) == 3: diff --git a/tensorrt_llm/models/mamba/model.py b/tensorrt_llm/models/mamba/model.py index 2f835dd41..32b2e1cc6 100644 --- a/tensorrt_llm/models/mamba/model.py +++ b/tensorrt_llm/models/mamba/model.py @@ -23,6 +23,7 @@ gather_last_token_logits, shape, unsqueeze) from ...layers import Embedding, LayerNorm, Linear, Mamba, RmsNorm from ...module import Module, ModuleList +from ...plugin import current_all_reduce_helper from ..generation_mixin import GenerationMixin from ..modeling_utils import PretrainedConfig, PretrainedModel @@ -192,6 +193,7 @@ def forward(self, ssm_states, host_request_types, 
last_token_ids, + last_token_ids_for_logits, host_context_lengths, slot_mapping: Optional[Tensor] = None): hidden_states, present_convs, present_ssms = self.backbone( @@ -200,7 +202,7 @@ def forward(self, if not self.gather_context_logits: hidden_states = gather_last_token_logits( - hidden_states, last_token_ids, + hidden_states, last_token_ids_for_logits, default_net().plugin_config.remove_input_padding) lm_logits = self.lm_head(hidden_states) @@ -218,9 +220,9 @@ def prepare_inputs( max_batch_size, max_input_len, max_seq_len, + max_num_tokens, use_cache, max_beam_width: int = 1, - max_num_tokens: int = None, opt_num_tokens: int = None, opt_batch_size: int = 0, prompt_embedding_table_size: int = 0, @@ -235,56 +237,79 @@ def prepare_inputs( @return: a list contains values which can be fed into the self.forward() ''' assert speculative_decoding_draft_tokens_external == False, "Speculative decoding is not supported in Mamba" + assert max_beam_width == 1, "We don't support beam search for the Mamba model." + remove_input_padding = default_net().plugin_config.remove_input_padding + use_gemm_plugin = default_net().plugin_config.gemm_plugin + paged_state = default_net().plugin_config.paged_state + multiple_profiles = default_net().plugin_config.multiple_profiles use_mamba_conv1d_plugin = default_net( ).plugin_config.mamba_conv1d_plugin - batch_range = [GenerationMixin.default_range(max_batch_size)] + use_custom_all_reduce = default_net( + ).plugin_config.use_custom_all_reduce + self.gather_context_logits = gather_context_logits + mapping = self.config.mapping + + # basic inputs + enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( + True, use_gemm_plugin, remove_input_padding, paged_state) + + num_profiles, ranges = GenerationMixin.get_profiles_ranges( + max_batch_size=max_batch_size, + max_beam_width=max_beam_width, + max_input_len=max_input_len, + max_num_tokens=max_num_tokens, + max_draft_len=max_draft_len, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens, + enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, + multiple_profiles=multiple_profiles) + if remove_input_padding: assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding" - max_num_tokens = max( - max_input_len * max_batch_size, - max_beam_width * (max_draft_len + 1) * max_batch_size) - if opt_num_tokens is None: - opt_num_tokens = max_beam_width * (max_draft_len + - 1) * max_batch_size - num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] input_ids = Tensor(name='input_ids', dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ - ('num_tokens', num_tokens_range), + ('num_tokens', ranges['num_tokens_range']), ])) else: input_ids = Tensor(name='input_ids', dtype=trt.int32, shape=[-1, -1], dim_range=OrderedDict([ - ('batch_size', batch_range), - ('input_len', [[1, 1, max_input_len]]), + ('batch_size_beam_width', + ranges['bb_range']), + ('input_len', ranges['inlen_range']), ])) + if use_custom_all_reduce and mapping.tp_size > 1: + current_all_reduce_helper().set_workspace_tensor( + mapping, num_profiles) + + # recurrent inputs conv_states = [] ssm_states = [] if use_mamba_conv1d_plugin: conv_state_dim_range = OrderedDict([ - ('batch_size', batch_range), - ('kernel_size', [self.d_conv - 1]), - ('dim_size', [self.d_inner]), + ('batch_size', ranges['bb_range']), + ('kernel_size', [self.d_conv - 1] * num_profiles), + ('dim_size', [self.d_inner] * num_profiles), ]) else: conv_state_dim_range = OrderedDict([ - ('batch_size', batch_range), - ('dim_size', 
[self.d_inner]), - ('kernel_size', [self.d_conv - 1]), + ('batch_size', ranges['bb_range']), + ('dim_size', [self.d_inner] * num_profiles), + ('kernel_size', [self.d_conv - 1] * num_profiles), ]) ssm_state_dim_range = OrderedDict([ - ('batch_size', batch_range), - ('state_size', [self.d_state]), - ('dim_size', [self.d_inner]), + ('batch_size', ranges['bb_range']), + ('state_size', [self.d_state] * num_profiles), + ('dim_size', [self.d_inner] * num_profiles), ]) one_dim_range = OrderedDict([ - ('buffer_count', [1]), + ('buffer_count', [1] * num_profiles), ]) for i in range(self.config.num_hidden_layers): @@ -324,7 +349,7 @@ def prepare_inputs( name='host_request_types', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size', batch_range)]), + dim_range=OrderedDict([('batch_size', ranges['bb_range'])]), ) if use_mamba_conv1d_plugin and remove_input_padding: @@ -332,21 +357,22 @@ def prepare_inputs( name='host_context_lengths', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size', batch_range)]), + dim_range=OrderedDict([('batch_size', ranges['bb_range'])]), ) else: host_context_lengths = None - last_token_ids = None + last_token_ids = Tensor( + name='last_token_ids', + dtype=trt.int32, + shape=[-1], + dim_range=OrderedDict([ + ('batch_size', ranges['bbd_range']), + ]), + ) + last_token_ids_for_logits = None if not gather_context_logits: - last_token_ids = Tensor( - name='last_token_ids', - dtype=trt.int32, - shape=[-1], - dim_range=OrderedDict([ - ('batch_size', batch_range), - ]), - ) + last_token_ids_for_logits = last_token_ids return_dict = { 'input_ids': input_ids, @@ -354,6 +380,7 @@ def prepare_inputs( 'ssm_states': ssm_states, 'host_request_types': host_request_types, 'last_token_ids': last_token_ids, + 'last_token_ids_for_logits': last_token_ids_for_logits, 'host_context_lengths': host_context_lengths, } @@ -362,7 +389,7 @@ def prepare_inputs( name='slot_mapping', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size', batch_range)]), + dim_range=OrderedDict([('batch_size', ranges['bb_range'])]), ) return_dict['slot_mapping'] = slot_mapping diff --git a/tensorrt_llm/models/medusa/weight.py b/tensorrt_llm/models/medusa/weight.py index 251ea3edc..3173f2996 100644 --- a/tensorrt_llm/models/medusa/weight.py +++ b/tensorrt_llm/models/medusa/weight.py @@ -1,43 +1,80 @@ from pathlib import Path -import numpy as np import torch from tensorrt_llm import logger -from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import MedusaLM from tensorrt_llm.models.convert_utils import split +def get_tllm_linear_weight(weight, + prefix, + bias=None, + use_weight_only=False, + plugin_weight_only_quant_type=torch.int8, + postfix='weight'): + results = {} + if use_weight_only: + v = weight.t().contiguous().cpu() + processed_torch_weights, torch_weight_scales = \ + torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix( + v, plugin_weight_only_quant_type) + results[prefix + postfix] = processed_torch_weights + results[prefix + 'per_channel_scale'] = torch_weight_scales + else: + results[prefix + postfix] = weight.contiguous() + + if bias is not None: + results[prefix + 'bias'] = bias + + return results + + def load_medusa_hf(medusa_path: str, - trt_llm_medusa: MedusaLM, + num_medusa_heads: int, + num_medusa_layers: int, mapping=Mapping(), - dtype='float32'): + dtype='float32', + use_weight_only=False, + 
plugin_weight_only_quant_type=None): logger.info("Loading Medusa heads' weights ...") + is_ckpt_safetensors = False + ckpt_file = Path(medusa_path) / "medusa_lm_head.pt" - state_dict = torch.load(ckpt_file, map_location="cpu") + if not ckpt_file.exists(): + ckpt_file = Path(medusa_path) / "medusa_lm_head.safetensors" + is_ckpt_safetensors = True + + if is_ckpt_safetensors: + logger.info("Safetensors Found ...") + from safetensors.torch import load_file + state_dict = load_file(ckpt_file) + else: + state_dict = torch.load(ckpt_file, map_location="cpu") + torch_dtype = str_dtype_to_torch(dtype) - for h in range(trt_llm_medusa.num_medusa_heads): - for l in range(trt_llm_medusa.num_medusa_layers): - w = state_dict[f"{h}.{l}.linear.weight"].clone() - w = torch_to_numpy(w.to(torch_dtype).detach().cpu()) - trt_llm_medusa.medusa_heads[h].medusa_layers[ - l].linear.weight.value = np.ascontiguousarray( - split(w, mapping.tp_size, mapping.tp_rank)) - if trt_llm_medusa.medusa_heads[h].medusa_layers[ - l].linear.bias is not None: - # print(f"Setting bias for {h} {l}") - b = state_dict[f"{h}.{l}.linear.bias"].clone() - b = torch_to_numpy(b.to(torch_dtype).detach().cpu()) - trt_llm_medusa.medusa_heads[h].medusa_layers[ - l].linear.bias.value = np.ascontiguousarray( - np.split(b, mapping.tp_size, - axis=0)[mapping.tp_rank].copy()) - lm = state_dict[f"{h}.{trt_llm_medusa.num_medusa_layers}.weight"].clone( - ) # LM Head - lm = torch_to_numpy(lm.to(torch_dtype).detach().cpu()) - trt_llm_medusa.medusa_heads[ - h].lm_head.weight.value = np.ascontiguousarray( - split(lm, mapping.tp_size, mapping.tp_rank)) - return + weights = {} + + for h in range(num_medusa_heads): + for l in range(num_medusa_layers): + w = state_dict[f"{h}.{l}.linear.weight"].clone().to(torch_dtype) + + split_v = split(w, mapping.tp_size, mapping.tp_rank) + weights.update( + get_tllm_linear_weight( + split_v, f'medusa_heads.{h}.medusa_layers.{l}.linear.', + None, use_weight_only, plugin_weight_only_quant_type)) + + b = state_dict[f"{h}.{l}.linear.bias"].clone().to(torch_dtype) + + weights['medusa_heads.{}.medusa_layers.{}.linear.bias'.format( + h, l)] = split(b, mapping.tp_size, mapping.tp_rank) + + lm = state_dict[f"{h}.{num_medusa_layers}.weight"].clone().to( + torch_dtype) # LM Head + + weights['medusa_heads.{}.lm_head.weight'.format(h)] = split( + lm, mapping.tp_size, mapping.tp_rank) + + return weights diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index c042e7440..15f076d75 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -27,6 +27,10 @@ from ..module import Module, ModuleList from ..parameter import Parameter from ..quantization import QuantMode +from ..quantization.layers import (WeightOnlyGroupwiseQuantLinear, + WeightOnlyGroupwiseQuantRowLinear, + WeightOnlyQuantLinear, + WeightOnlyQuantRowLinear) from ..quantization.mode import W8A8_SQ_PLUGIN_LIST, QuantAlgo from ..top_model_mixin import TopModelMixin from .convert_utils import weight_only_quantize_dict @@ -841,7 +845,8 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel: continue qkv_params = get_init_params(layer.qkv, ColumnLinear) qkv_params["bias"] = qkv_params["bias"] is not None - qkv_params["strict_dtype"] = qkv_params["strict_dtype"] is not None + qkv_params["strict_dtype"] = qkv_params.get( + "strict_dtype") is not None q = ColumnLinear( **{ **qkv_params, @@ -866,20 +871,34 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel: q = quantize(q, 
model.config.quantization) k = quantize(k, model.config.quantization) v = quantize(v, model.config.quantization) + out_features = q.out_features + k.out_features + v.out_features + if isinstance(layer.qkv, ( + WeightOnlyQuantLinear, + WeightOnlyQuantRowLinear, + WeightOnlyGroupwiseQuantLinear, + WeightOnlyGroupwiseQuantRowLinear, + )): + out_dim = 1 + else: + out_dim = 0 if layer.qkv.weight.is_inited(): qkv_weight = layer.qkv.weight.raw_value weights = np.split(qkv_weight, [ - q.out_features, - q.out_features + k.out_features, - ]) + qkv_weight.shape[out_dim] * q.out_features // out_features, + qkv_weight.shape[out_dim] * + (q.out_features + k.out_features) // out_features, + ], + axis=out_dim) for gemm, weight in zip([q, k, v], weights): gemm.weight.value = weight if layer.qkv.bias is not None and layer.qkv.bias.is_inited(): qkv_bias = layer.qkv.bias.raw_value biases = np.split(qkv_bias, [ - q.out_features, - q.out_features + k.out_features, - ]) + qkv_bias.shape[out_dim] * q.out_features // out_features, + qkv_bias.shape[out_dim] * + (q.out_features + k.out_features) // out_features, + ], + axis=out_dim) for gemm, bias in zip([q, k, v], biases): gemm.bias.value = bias for name, parameter in layer.qkv._parameters.items(): diff --git a/tensorrt_llm/models/phi3/convert.py b/tensorrt_llm/models/phi3/convert.py index dcc7e2686..da2940178 100644 --- a/tensorrt_llm/models/phi3/convert.py +++ b/tensorrt_llm/models/phi3/convert.py @@ -1,9 +1,12 @@ import torch +from tensorrt_llm.quantization import QuantAlgo + from ..._utils import str_dtype_to_torch +from .split_weights import shuffle_qkv_weights, split_weights_tp -def convert_hf_weights(hf_model, dtype, **kwargs): +def convert_hf_weights(hf_model, dtype, config, small_variant, args, rank): torch_dtype = str_dtype_to_torch(dtype) hf_state_dict = hf_model.state_dict() weights = {} @@ -15,13 +18,16 @@ def convert_hf_weights(hf_model, dtype, **kwargs): key = key.replace("model.layers.", "transformer.layers.") #Attention key = key.replace("self_attn.", "attention.") + key = key.replace("query_key_value.", "qkv.") # small key = key.replace("Wqkv.weight", "qkv.weight") key = key.replace("qkv_proj.", "qkv.") #128k #MLP key = key.replace("mlp.fc1.", "mlp.fc.") key = key.replace("mlp.fc2.", "mlp.proj.") key = key.replace("mlp.gate_up_proj.", "mlp.fc.") - key = key.replace("mlp.up_proj.", "mlp.gate.") #128k + key = key.replace( + "mlp.up_proj.", + "mlp.fc." if small_variant else "mlp.gate.") #128k key = key.replace("mlp.down_proj.", "mlp.proj.") #128k key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k key = key.replace("o_proj.", "dense.") #128k @@ -53,27 +59,67 @@ def convert_hf_weights(hf_model, dtype, **kwargs): key = key.replace("q_proj.weight", "qkv.weight") elif "k_proj" in key or "v_proj" in key: continue + weights[key] = value.to(torch_dtype).cpu() + if small_variant: + weights['lm_head.weight'] = weights[ + 'transformer.vocab_embedding.weight'].clone() + + # Transform QKV weights from custom Phi3Small format to TRT-LLM format + for key, value in weights.items(): + if "qkv." 
in key: + weights[key] = shuffle_qkv_weights(weights[key], config) + + weights = split_weights_tp(config, weights, args, rank, torch_dtype) + return weights -def convert_hf_config(hf_config, dtype, **kwargs): +def convert_small_hf_config(hf_config): + return { + 'architecture': "Phi3SmallForCausalLM", + 'rotary_base': hf_config.rope_embedding_base, + 'gegelu_limit': hf_config.gegelu_limit, + 'mup_attn_multiplier': hf_config.mup_attn_multiplier, + 'mup_embedding_multiplier': hf_config.mup_embedding_multiplier, + 'mup_use_scaling': hf_config.mup_use_scaling, + 'mup_width_multiplier': hf_config.mup_width_multiplier, + 'blocksparse_block_size': hf_config.blocksparse_block_size, + 'blocksparse_homo_head_pattern': + hf_config.blocksparse_homo_head_pattern, + 'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks, + 'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride, + 'dense_attention_every_n_layers': + hf_config.dense_attention_every_n_layers, + } + + +def convert_hf_config(hf_config, dtype, args): config = { 'architecture': "Phi3ForCausalLM", 'dtype': dtype, 'num_hidden_layers': hf_config.num_hidden_layers, 'num_attention_heads': hf_config.num_attention_heads, 'num_key_value_heads': hf_config.num_key_value_heads, - 'rope_theta': hf_config.rope_theta, 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, 'max_position_embeddings': hf_config.max_position_embeddings, 'hidden_act': hf_config.hidden_act, 'share_embedding_table': False, - 'norm_epsilon': hf_config.rms_norm_eps, } + + small_variant = hf_config.architectures[0] == "Phi3SmallForCausalLM" + if small_variant: + config.update(convert_small_hf_config(hf_config)) + else: + config.update({ + 'rotary_base': hf_config.rope_theta, + 'norm_epsilon': hf_config.rms_norm_eps, + }) + + # Long-context variants if hf_config.max_position_embeddings >= 128000: config.update({ 'original_max_position_embeddings': @@ -83,6 +129,31 @@ def convert_hf_config(hf_config, dtype, **kwargs): 'longrope_scaling_long_factors': hf_config.rope_scaling["long_factor"] }) + + if small_variant: + config.update({ + 'longrope_long_mscale': + hf_config.rope_scaling["long_mscale"], + 'longrope_short_mscale': + hf_config.rope_scaling["short_mscale"] + }) + if config["hidden_act"] == "silu": config["hidden_act"] = "swiglu" + + # Tensor parallelism and weight-only quantization + if args is not None: + config.update({ + 'mapping': { + 'world_size': args.tp_size * args.pp_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + } + }) + + if args.use_weight_only and args.weight_only_precision == 'int8': + config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}}) + elif args.use_weight_only and args.weight_only_precision == 'int4': + config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}}) + return config diff --git a/tensorrt_llm/models/phi3/model.py b/tensorrt_llm/models/phi3/model.py index 451791169..683ad2a5a 100644 --- a/tensorrt_llm/models/phi3/model.py +++ b/tensorrt_llm/models/phi3/model.py @@ -1,15 +1,20 @@ +import json +import os +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Optional import numpy as np +import safetensors from transformers import AutoModelForCausalLM from ..._utils import pad_vocab_size from ...functional import PositionEmbeddingType, Tensor -from ...layers import (MLP, Attention, AttentionMaskType, Embedding, - ParallelLMHead, RmsNorm) +from ...layers import (MLP, Attention, 
AttentionMaskType, BlockSparseAttnParams, + Embedding, LayerNorm, ParallelLMHead, RmsNorm) from ...module import Module from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, - PretrainedConfig, save_checkpoint) + PretrainedConfig) from .convert import convert_hf_config, convert_hf_weights @@ -22,28 +27,65 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): tp_group = config.mapping.tp_group tp_size = config.mapping.tp_size - self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) - self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) + attention_mask_type = AttentionMaskType.causal + block_sparse_attn_params = BlockSparseAttnParams() + q_scaling = 1.0 + self.gegelu_limit = None + + self.small_variant = config.architecture == "Phi3SmallForCausalLM" + if self.small_variant: + self.gegelu_limit = config.gegelu_limit + + # MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size)) + # We achieve this using q_scaling = sqrt(attention_head_size) + hidden_size = config.hidden_size + num_attention_heads = config.num_attention_heads + attention_head_size = hidden_size / num_attention_heads + q_scaling = attention_head_size**.5 + + block_sparse = ( + (layer_idx + 1) % config.dense_attention_every_n_layers) != 0 + attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal + + block_sparse_attn_params = BlockSparseAttnParams( + config.blocksparse_block_size, + config.blocksparse_homo_head_pattern, + config.blocksparse_num_local_blocks, + config.blocksparse_vertical_stride) + + self.input_layernorm = LayerNorm( + normalized_shape=config.hidden_size, dtype=config.dtype) + self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size, + dtype=config.dtype) + else: + self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) layers_range = config.mapping.pp_layers(config.num_hidden_layers) local_layer_idx = layer_idx - layers_range[0] position_embedding_type = PositionEmbeddingType.rope_gpt_neox - rope_scaling_short_factors = 1.0 - rope_scaling_long_factors = 1.0 + rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0 + rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0 original_max_position_embeddings = config.max_position_embeddings + if hasattr(config, "longrope_scaling_short_factors"): rope_scaling_short_factors = np.asarray( config.longrope_scaling_short_factors).astype(np.float32) rope_scaling_long_factors = np.asarray( config.longrope_scaling_long_factors).astype(np.float32) + original_max_position_embeddings = config.original_max_position_embeddings position_embedding_type = PositionEmbeddingType.long_rope + if self.small_variant: + rope_scaling_short_mscale = config.longrope_short_mscale + rope_scaling_long_mscale = config.longrope_long_mscale + self.attention = Attention( local_layer_idx=local_layer_idx, hidden_size=config.hidden_size, @@ -53,15 +95,18 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): rotary_embedding_base=config.rotary_base, max_position_embeddings=config.max_position_embeddings, dtype=config.dtype, - attention_mask_type=AttentionMaskType.causal, - bias=False, + attention_mask_type=attention_mask_type, + bias=self.small_variant, + 
q_scaling=q_scaling, tp_group=tp_group, tp_size=tp_size, quant_mode=config.quant_mode, rope_scaling_short_factors=rope_scaling_short_factors, rope_scaling_long_factors=rope_scaling_long_factors, + rope_scaling_short_mscale=rope_scaling_short_mscale, + rope_scaling_long_mscale=rope_scaling_long_mscale, original_max_position_embeddings=original_max_position_embeddings, - ) + block_sparse_params=block_sparse_attn_params) self.mlp = MLP(hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, @@ -70,7 +115,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): tp_group=tp_group, tp_size=tp_size, quant_mode=config.quant_mode, - bias=False) + bias=self.small_variant) def forward( self, @@ -88,7 +133,7 @@ def forward( use_cache=use_cache, kv_cache_params=kv_cache_params, attention_params=attention_params, - norm_before_bmm1=True, + norm_before_bmm1=not self.small_variant, ) if use_cache: @@ -96,7 +141,8 @@ def forward( post_attention_input = hidden_states + attention_output post_attention_output = self.post_layernorm(post_attention_input) - feed_forward_hidden_states = self.mlp(post_attention_output, ) + feed_forward_hidden_states = self.mlp(post_attention_output, + gegelu_limit=self.gegelu_limit) hidden_states = post_attention_input + feed_forward_hidden_states if use_cache: return (hidden_states, presents) @@ -112,9 +158,15 @@ def __init__(self, config: PretrainedConfig): dtype=config.dtype) self.layers = DecoderLayerList(Phi3DecoderLayer, config) - self.ln_f = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) + self.small_variant = config.architecture == "Phi3SmallForCausalLM" + if self.small_variant: + self.ln_f = LayerNorm(normalized_shape=config.hidden_size, + dtype=config.dtype) + self.mup_embedding_multiplier = config.mup_embedding_multiplier + else: + self.ln_f = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) def forward( self, @@ -132,6 +184,9 @@ def forward( ] if prompt_embedding_table is not None else [] hidden_states = self.vocab_embedding(input_ids, *args) + if self.small_variant and self.mup_embedding_multiplier > 0.0: + hidden_states = hidden_states * self.mup_embedding_multiplier + hidden_states = self.layers( hidden_states, use_cache=use_cache, @@ -152,7 +207,6 @@ def forward( class Phi3ForCausalLM(DecoderModelForCausalLM): def __init__(self, config: PretrainedConfig): - self.check_config(config) transformer = Phi3Model(config) vocab_size_padded = pad_vocab_size(config.vocab_size, config.mapping.tp_size) @@ -167,25 +221,48 @@ def __init__(self, config: PretrainedConfig): super().__init__(config, transformer, lm_head) - def check_config(self, config): - config.set_if_not_exist('rotary_base', 10000.0) - @classmethod def convert_hf_checkpoint(cls, hf_model_dir: str, dtype: Optional[str] = "float16", output_dir: Optional[str] = None, - **kwargs): + args=None): ''' Convert Huggingface checkpoint to TRT-LLM checkpoint ''' + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_dir, torch_dtype="auto", trust_remote_code=True) - config = convert_hf_config(hf_model.config, dtype=dtype, **kwargs) - weights = convert_hf_weights(hf_model, dtype=dtype, **kwargs) + config = convert_hf_config(hf_model.config, dtype, args) + with open(os.path.join(output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + + small_variant = config['architecture'] == "Phi3SmallForCausalLM" - if output_dir: - save_checkpoint(output_dir, config=config, weights=weights) + def 
covert_and_save(rank): + weights = convert_hf_weights(hf_model, dtype, config, small_variant, + args, rank) + safetensors.torch.save_file( + weights, os.path.join(output_dir, f'rank{rank}.safetensors')) - return {"weights": weights, "config": config} + world_size = args.tp_size * args.pp_size + if args.workers == 1: + for rank in range(world_size): + covert_and_save(rank) + else: + with ThreadPoolExecutor(max_workers=args.workers) as p: + futures = [ + p.submit(covert_and_save, rank) + for rank in range(world_size) + ] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." diff --git a/tensorrt_llm/models/phi3/phi3small/__init__.py b/tensorrt_llm/models/phi3/phi3small/__init__.py deleted file mode 100644 index 71bf6d298..000000000 --- a/tensorrt_llm/models/phi3/phi3small/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tensorrt_llm/models/phi3/phi3small/model.py b/tensorrt_llm/models/phi3/phi3small/model.py deleted file mode 100644 index 2d0ff7d79..000000000 --- a/tensorrt_llm/models/phi3/phi3small/model.py +++ /dev/null @@ -1,257 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
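
The unified Phi3DecoderLayer above derives two quantities for the Phi-3-small variant: a MuP-compatible q_scaling and a per-layer choice between dense and block-sparse attention. The following is a minimal Python sketch of just that arithmetic; all numbers are illustrative placeholders, not values read from a real checkpoint.

# Sketch of the derived attention parameters used in the small-variant branch
# above; the config values below are illustrative placeholders only.
hidden_size = 4096
num_attention_heads = 32
dense_attention_every_n_layers = 2
num_hidden_layers = 8

# MuP normalizes attention scores by attention_head_size instead of
# sqrt(attention_head_size); per the comment in the diff, supplying
# q_scaling = sqrt(attention_head_size) on top of the kernel's default
# 1/sqrt(attention_head_size) factor yields the desired 1/attention_head_size.
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size ** 0.5

# Every dense_attention_every_n_layers-th layer (1-based) stays dense;
# the rest use the block-sparse attention mask.
for layer_idx in range(num_hidden_layers):
    block_sparse = ((layer_idx + 1) % dense_attention_every_n_layers) != 0
    print(f"layer {layer_idx}: {'blocksparse' if block_sparse else 'causal'}")
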
-import json -import os -import traceback -from concurrent.futures import ThreadPoolExecutor, as_completed - -import numpy as np -import safetensors -from transformers import AutoModelForCausalLM - -from ...._utils import pad_vocab_size -from ....functional import PositionEmbeddingType, Tensor -from ....layers import (MLP, Attention, AttentionMaskType, - BlockSparseAttnParams, Embedding, LayerNorm, - ParallelLMHead) -from ....module import Module -from ...modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, - PretrainedConfig) -from .convert import convert_hf_config, convert_hf_weights - - -class Phi3SmallDecoderLayer(Module): - - def __init__(self, config: PretrainedConfig, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - tp_group = config.mapping.tp_group - tp_size = config.mapping.tp_size - self.gegelu_limit = config.gegelu_limit - - self.input_layernorm = LayerNorm(normalized_shape=config.hidden_size, - dtype=config.dtype) - - # MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size)) - # We achieve this using q_scaling = sqrt(attention_head_size) - hidden_size = config.hidden_size - num_attention_heads = config.num_attention_heads - attention_head_size = hidden_size / num_attention_heads - q_scaling = attention_head_size**.5 - - block_sparse = ( - (layer_idx + 1) % config.dense_attention_every_n_layers) != 0 - attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal - - block_sparse_attn_params = BlockSparseAttnParams( - config.blocksparse_block_size, config.blocksparse_homo_head_pattern, - config.blocksparse_num_local_blocks, - config.blocksparse_vertical_stride) - - layers_range = config.mapping.pp_layers(config.num_hidden_layers) - local_layer_idx = layer_idx - layers_range[0] - - position_embedding_type = PositionEmbeddingType.rope_gpt_neox - original_max_position_embeddings = config.max_position_embeddings - - rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0 - rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0 - - if hasattr(config, "longrope_scaling_short_factors"): - rope_scaling_short_factors = np.asarray( - config.longrope_scaling_short_factors).astype(np.float32) - rope_scaling_long_factors = np.asarray( - config.longrope_scaling_long_factors).astype(np.float32) - rope_scaling_short_mscale = config.longrope_short_mscale - rope_scaling_long_mscale = config.longrope_long_mscale - - position_embedding_type = PositionEmbeddingType.long_rope - original_max_position_embeddings = config.original_max_position_embeddings - - self.attention = Attention( - local_layer_idx=local_layer_idx, - hidden_size=config.hidden_size, - num_attention_heads=config.num_attention_heads, - num_kv_heads=config.num_kv_heads, - position_embedding_type=position_embedding_type, - rotary_embedding_base=config.rotary_embedding_base, - max_position_embeddings=config.max_position_embeddings, - original_max_position_embeddings=original_max_position_embeddings, - dtype=config.dtype, - attention_mask_type=attention_mask_type, - bias=True, - q_scaling=q_scaling, - tp_group=tp_group, - tp_size=tp_size, - quant_mode=config.quant_mode, - rope_scaling_short_factors=rope_scaling_short_factors, - rope_scaling_long_factors=rope_scaling_long_factors, - rope_scaling_short_mscale=rope_scaling_short_mscale, - rope_scaling_long_mscale=rope_scaling_long_mscale, - block_sparse_params=block_sparse_attn_params) - - self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size, - 
dtype=config.dtype) - - self.mlp = MLP(hidden_size=config.hidden_size, - ffn_hidden_size=config.intermediate_size, - hidden_act=config.hidden_act, - dtype=config.dtype, - tp_group=tp_group, - tp_size=tp_size, - quant_mode=config.quant_mode) - - def forward( - self, - hidden_states: Tensor, - attention_mask=None, - use_cache=False, - kv_cache_params=None, - attention_params=None, - ): - residual = hidden_states - input_layernorm_output = self.input_layernorm(hidden_states) - - # Self attention - attention_output = self.attention( - input_layernorm_output, - attention_mask=attention_mask, - use_cache=use_cache, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - ) - - if use_cache: - attention_output, presents = attention_output - - hidden_states = residual + attention_output - - # Fully connected - residual = hidden_states - hidden_states = self.post_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states, gegelu_limit=self.gegelu_limit) - hidden_states = residual + hidden_states - - if use_cache: - return (hidden_states, presents) - return hidden_states - - -class Phi3SmallModel(Module): - - def __init__(self, config: PretrainedConfig): - super().__init__() - self.vocab_embedding = Embedding(num_embeddings=config.vocab_size, - embedding_dim=config.hidden_size, - dtype=config.dtype) - - self.layers = DecoderLayerList(Phi3SmallDecoderLayer, config) - self.ln_f = LayerNorm(normalized_shape=config.hidden_size, - dtype=config.dtype) - self.mup_embedding_multiplier = config.mup_embedding_multiplier - - def forward( - self, - input_ids: Tensor, - position_ids=None, - use_cache=False, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - prompt_embedding_table=None, - prompt_tasks=None, - prompt_vocab_size=None, - ): - args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size - ] if prompt_embedding_table is not None else [] - hidden_states = self.vocab_embedding(input_ids, *args) - - if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0: - hidden_states = hidden_states * self.mup_embedding_multiplier - - hidden_states = self.layers( - hidden_states, - use_cache=use_cache, - attention_mask=attention_mask, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - ) - if use_cache: - hidden_states, presents = hidden_states - - hidden_states = self.ln_f(hidden_states) - - if use_cache: - return (hidden_states, tuple(presents)) - return hidden_states - - -class Phi3SmallForCausalLM(DecoderModelForCausalLM): - - def __init__(self, config: PretrainedConfig): - transformer = Phi3SmallModel(config) - vocab_size_padded = pad_vocab_size(config.vocab_size, - config.mapping.tp_size) - - lm_head = ParallelLMHead(config.hidden_size, - vocab_size_padded, - bias=False, - dtype=config.dtype, - tp_group=config.mapping.tp_group, - tp_size=config.mapping.tp_size, - gather_output=True) - - super().__init__(config, transformer, lm_head) - - @classmethod - def convert_hf_checkpoint(cls, model_dir, dtype, output_dir, args=None): - ''' - Convert Huggingface checkpoint to TRT-LLM checkpoint - ''' - - hf_model = AutoModelForCausalLM.from_pretrained(model_dir, - torch_dtype="auto", - trust_remote_code=True) - - config = convert_hf_config(hf_model.config, dtype, args) - with open(os.path.join(output_dir, 'config.json'), 'w') as f: - json.dump(config, f, indent=4) - - def covert_and_save(rank): - weights = convert_hf_weights(hf_model, config, args, rank) - safetensors.torch.save_file( - weights, os.path.join(output_dir, 
f'rank{rank}.safetensors')) - - world_size = args.tp_size * args.pp_size - if args.workers == 1: - for rank in range(world_size): - covert_and_save(rank) - else: - with ThreadPoolExecutor(max_workers=args.workers) as p: - futures = [ - p.submit(covert_and_save, rank) - for rank in range(world_size) - ] - exceptions = [] - for future in as_completed(futures): - try: - future.result() - except Exception as e: - traceback.print_exc() - exceptions.append(e) - assert len( - exceptions - ) == 0, "Checkpoint conversion failed, please check error log." diff --git a/tensorrt_llm/models/phi3/phi3small/convert.py b/tensorrt_llm/models/phi3/split_weights.py similarity index 66% rename from tensorrt_llm/models/phi3/phi3small/convert.py rename to tensorrt_llm/models/phi3/split_weights.py index c11824876..72ccf0117 100644 --- a/tensorrt_llm/models/phi3/phi3small/convert.py +++ b/tensorrt_llm/models/phi3/split_weights.py @@ -15,10 +15,6 @@ import torch -from tensorrt_llm.quantization import QuantAlgo - -from ...._utils import str_dtype_to_torch - def shuffle_qkv_weights(weights, config): # Input weights are organized as @@ -29,8 +25,7 @@ def shuffle_qkv_weights(weights, config): # (q00, q01, ..., qnm), (k0, k1, .., kn), (v0, v1, .., vn) num_heads = config['num_attention_heads'] - num_kv_heads = config['num_kv_heads'] if 'num_kv_heads' in config.keys( - ) else config['num_key_value_heads'] + num_kv_heads = config['num_key_value_heads'] num_q_per_kv = num_heads // num_kv_heads hidden_size = config['hidden_size'] @@ -152,7 +147,7 @@ def get_tllm_linear_weight(weight, def split_weights_tp(config, weights, args, rank, dtype): num_heads = config['num_attention_heads'] - num_kv_heads = config['num_kv_heads'] + num_kv_heads = config['num_key_value_heads'] hidden_size = config['hidden_size'] mha_mode = num_heads == num_kv_heads @@ -228,96 +223,3 @@ def get_weight(weight, prefix, bias): dim=0) return weights - - -def convert_hf_weights(hf_model, config, args, rank): - torch_dtype = str_dtype_to_torch(args.dtype) - hf_state_dict = hf_model.state_dict() - weights = {} - - # replace key name - for key, value in hf_state_dict.items(): - # Decoder Layers - if "model.layers." in key: - key = key.replace("model.layers.", "transformer.layers.") - key = key.replace("self_attn.", "attention.") - key = key.replace("query_key_value.", "qkv.") - key = key.replace("mlp.up_proj.", "mlp.fc.") - key = key.replace("mlp.down_proj.", "mlp.proj.") - key = key.replace("post_attention_layernorm.", "post_layernorm.") - # Embedding - key = key.replace("model.embed_tokens.weight", - "transformer.vocab_embedding.weight") - # Final Layer norm - key = key.replace("model.final_layernorm.", "transformer.ln_f.") - weights[key] = value.to(torch_dtype).cpu() - - weights['lm_head.weight'] = weights[ - 'transformer.vocab_embedding.weight'].clone() - - # Transform QKV weights from custom Phi3Small format to TRT-LLM format - for key, value in weights.items(): - if "qkv." 
in key: - weights[key] = shuffle_qkv_weights(weights[key], config) - - weights = split_weights_tp(config, weights, args, rank, torch_dtype) - - return weights - - -def convert_hf_config(hf_config, dtype, args): - config = { - 'architecture': 'Phi3SmallForCausalLM', - 'dtype': dtype, - 'num_hidden_layers': hf_config.num_hidden_layers, - 'num_attention_heads': hf_config.num_attention_heads, - 'num_kv_heads': hf_config.num_key_value_heads, - 'rotary_embedding_base': hf_config.rope_embedding_base, - 'hidden_size': hf_config.hidden_size, - 'intermediate_size': hf_config.intermediate_size, - 'vocab_size': hf_config.vocab_size, - 'max_position_embeddings': hf_config.max_position_embeddings, - 'hidden_act': hf_config.hidden_act, - 'share_embedding_table': False, - 'gegelu_limit': hf_config.gegelu_limit, - 'mup_attn_multiplier': hf_config.mup_attn_multiplier, - 'mup_embedding_multiplier': hf_config.mup_embedding_multiplier, - 'mup_use_scaling': hf_config.mup_use_scaling, - 'mup_width_multiplier': hf_config.mup_width_multiplier, - 'blocksparse_block_size': hf_config.blocksparse_block_size, - 'blocksparse_homo_head_pattern': - hf_config.blocksparse_homo_head_pattern, - 'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks, - 'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride, - 'dense_attention_every_n_layers': - hf_config.dense_attention_every_n_layers, - } - - if args is not None: - config.update({ - 'mapping': { - 'world_size': args.tp_size * args.pp_size, - 'tp_size': args.tp_size, - 'pp_size': args.pp_size, - } - }) - - if args.use_weight_only and args.weight_only_precision == 'int8': - config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}}) - elif args.use_weight_only and args.weight_only_precision == 'int4': - config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}}) - - if hf_config.max_position_embeddings >= 128000: - config.update({ - 'original_max_position_embeddings': - hf_config.original_max_position_embeddings, - 'longrope_scaling_short_factors': - hf_config.rope_scaling["short_factor"], - 'longrope_scaling_long_factors': - hf_config.rope_scaling["long_factor"], - 'longrope_long_mscale': - hf_config.rope_scaling["long_mscale"], - 'longrope_short_mscale': - hf_config.rope_scaling["short_mscale"] - }) - return config diff --git a/tensorrt_llm/models/recurrentgemma/model.py b/tensorrt_llm/models/recurrentgemma/model.py index be11ed431..53a6c4502 100644 --- a/tensorrt_llm/models/recurrentgemma/model.py +++ b/tensorrt_llm/models/recurrentgemma/model.py @@ -398,9 +398,9 @@ def prepare_inputs( max_batch_size, max_input_len, max_seq_len, + max_num_tokens, use_cache, max_beam_width: int = 1, - max_num_tokens: int = None, opt_num_tokens: int = None, opt_batch_size: int = 0, prompt_embedding_table_size: int = 0, @@ -434,44 +434,20 @@ def prepare_inputs( self.gather_context_logits = gather_context_logits mapping = self.config.mapping - default_range = GenerationMixin.default_range - batch_range = default_range(max_batch_size) - bbd_range = [ - batch_range[i] * ((max_draft_len + 1) if i != 0 else 1) - for i in range(len(batch_range)) - ] - inlen_range_cxt = default_range(max_input_len) - inlen_range_gen = [1, 1, max_draft_len + 1] - # basic inputs enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding, paged_kv_cache) - if max_num_tokens is None: - max_num_tokens = max( - max_input_len * max_batch_size, - max_beam_width * (max_draft_len + 1) * max_batch_size) - if 
enable_ctx_gen_opt_profiles: - num_profiles = 2 - inlen_range = [inlen_range_cxt, inlen_range_gen] - num_tokens_range_ctx = default_range(max_num_tokens) - num_tokens_range_gen = default_range( - max_batch_size * (max_draft_len + 1) * max_beam_width) - num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] - position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]] - else: - max_bs_x_max_bw = max_batch_size * max_beam_width - if opt_num_tokens is None: - opt_num_tokens = max_bs_x_max_bw - if multiple_profiles: - num_tokens_range = GenerationMixin.split_num_tokens_range( - max_num_tokens) - else: - num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] - num_profiles = len(num_tokens_range) - inlen_range = [[1, 1, max_input_len]] * num_profiles - position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles - bb_range = [batch_range] * num_profiles + num_profiles, ranges = GenerationMixin.get_profiles_ranges( + max_batch_size=max_batch_size, + max_beam_width=max_beam_width, + max_input_len=max_input_len, + max_num_tokens=max_num_tokens, + max_draft_len=max_draft_len, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens, + enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, + multiple_profiles=multiple_profiles) if remove_input_padding: assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding" @@ -479,14 +455,14 @@ def prepare_inputs( dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ - ('num_tokens', num_tokens_range), + ('num_tokens', ranges['num_tokens_range']), ])) position_ids = Tensor(name='position_ids', dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ ('position_ids_num_tokens_range', - num_tokens_range), + ranges['num_tokens_range']), ])) else: input_ids = Tensor(name='input_ids', @@ -494,16 +470,17 @@ def prepare_inputs( shape=[-1, -1], dim_range=OrderedDict([ ('batch_size_beam_width', - [batch_range] * num_profiles), - ('input_len', inlen_range), + ranges['bb_range']), + ('input_len', ranges['inlen_range']), ])) position_ids = Tensor(name='position_ids', dtype=trt.int32, shape=[-1, -1], dim_range=OrderedDict([ - ('batch_size_beam_width', bb_range), + ('batch_size_beam_width', + ranges['bb_range']), ('position_ids_inlen_range', - position_ids_inlen_range), + ranges['position_ids_inlen_range']), ])) if use_custom_all_reduce and mapping.tp_size > 1: current_all_reduce_helper().set_workspace_tensor( @@ -559,7 +536,8 @@ def prepare_inputs( name='host_request_types', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size_beam_width', bb_range)]), + dim_range=OrderedDict([('batch_size_beam_width', + ranges['bb_range'])]), ) last_token_ids = Tensor( @@ -567,7 +545,7 @@ def prepare_inputs( dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ - ('batch_size_last_token_ids', [bbd_range] * num_profiles), + ('batch_size_last_token_ids', ranges['bbd_range']), ]), ) last_token_ids_for_logits = None @@ -581,7 +559,8 @@ def prepare_inputs( name='host_context_lengths', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size_beam_width', bb_range)]), + dim_range=OrderedDict([('batch_size_beam_width', + ranges['bb_range'])]), ) else: host_context_lengths = None diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py index 591dba9d5..b7273b962 100644 --- a/tensorrt_llm/quantization/layers.py +++ b/tensorrt_llm/quantization/layers.py @@ -883,7 +883,6 @@ def forward(self, x, lora_runtime_params=None): assert lora_runtime_params is None or default_net( 
).plugin_config.lora_plugin == self.dtype - lora_hidden_state = x if lora_runtime_params is not None else None if default_net().strongly_typed: assert is_same_dtype( x.dtype, @@ -894,8 +893,13 @@ def forward(self, x, lora_runtime_params=None): activation_scaling_factor = cast(activation_scaling_factor, self.dtype) if x.dtype != trt.fp8: quantized_out = quantize(x, activation_scaling_factor, 'fp8') + lora_hidden_state = x if lora_runtime_params is not None else None else: quantized_out = x + # TODO: add fp8 LoRA support + lora_hidden_state = dequantize( + x, activation_scaling_factor, -1, + self.dtype) if lora_runtime_params is not None else None weights_scaling_factor = constant( self.weights_scaling_factor.raw_value.copy()) @@ -956,14 +960,18 @@ def forward(self, x, lora_runtime_params=None, reduce_fusion_params=None): assert lora_runtime_params is None or default_net( ).plugin_config.lora_plugin == self.dtype - lora_hidden_state = x if lora_runtime_params is not None else None activation_scaling_factor = constant( self.activation_scaling_factor.raw_value.copy()) activation_scaling_factor = cast(activation_scaling_factor, self.dtype) if x.dtype != trt.fp8: quantized_out = quantize(x, activation_scaling_factor, 'fp8') + lora_hidden_state = x if lora_runtime_params is not None else None else: quantized_out = x + # TODO: add fp8 LoRA support + lora_hidden_state = dequantize( + x, activation_scaling_factor, -1, + self.dtype) if lora_runtime_params is not None else None weights_scaling_factor = constant( self.weights_scaling_factor.raw_value.copy()) diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py index 5a33799dd..d4e271749 100644 --- a/tensorrt_llm/quantization/quantize.py +++ b/tensorrt_llm/quantization/quantize.py @@ -57,7 +57,10 @@ def quantize_layers( if preprocess_init_params is not None: preprocess_init_params(init_params, name, module) quant_layer = quant_cls(**init_params) - setattr(parent, module_name, quant_layer) + if parent is not None: + setattr(parent, module_name, quant_layer) + else: + model = quant_layer setattr(model, 'quant_mode', quant_config.quant_mode) return model @@ -78,7 +81,7 @@ def preprocess_init_params(init_params, name, module): module_name = name.rsplit('.', 1)[-1] init_params["transb"] = module_name == "lm_head" - quantize_layers( + model = quantize_layers( model, quant_config, quant_map, @@ -102,7 +105,7 @@ def preprocess_init_params(init_params, name, module): init_params[ "use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ - quantize_layers( + model = quantize_layers( model, quant_config, quant_map, @@ -120,7 +123,7 @@ def smooth_quantize_ootb( RowLinear: Int8SmoothQuantRowLinear, } - quantize_layers( + model = quantize_layers( model, quant_config, quant_map, @@ -138,7 +141,7 @@ def smooth_quantize_plugin(model, quant_mode): } for name, layer, parent in model.named_modules_with_parent(): layer_name = name.rsplit('.', 1)[-1] - if layer_name in ['ln_f']: + if layer_name in ['ln_f', 'ln_embed']: continue quant_cls = None @@ -156,7 +159,10 @@ def smooth_quantize_plugin(model, quant_mode): init_params[ "num_attention_heads"] = layer.num_attention_heads * layer.tp_size quant_layer = quant_cls(**init_params) - setattr(parent, layer_name, quant_layer) + if parent is not None: + setattr(parent, layer_name, quant_layer) + else: + model = quant_layer setattr(model, 'quant_mode', quant_mode) return model @@ -178,7 +184,7 @@ def fp8_quantize(model, quant_config: QuantConfig): RowLinear: FP8RowLinear, } - 
quantize_layers( + model = quantize_layers( model, quant_config, quant_map, diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 86c2527f4..b71d7590c 100644 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -31,10 +31,12 @@ import safetensors import torch from datasets import load_dataset +from safetensors.torch import load_file, save_file from torch.utils.data import DataLoader from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from ..logger import logger +from ..mapping import Mapping from .mode import QuantAlgo EMPTY_CFG = { @@ -122,7 +124,8 @@ def quant_cfg_choices(): "Gemma": "gemma", "MixtralForCausalLM": "llama", "ArcticForCausalLM": "llama", - "Phi3SmallForCausalLM": "phi", + "Phi3SmallForCausalLM": "phi3small", + "Phi3ForCausalLM": "phi3", } @@ -263,10 +266,95 @@ def calibrate_loop(): return model -def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat, - kv_cache_dtype, calib_size, batch_size, - calib_max_seq_length, awq_block_size, output_dir, - tp_size, pp_size, seed, tokenizer_max_seq_length): +def combine_medusa_weight(tp_size, pp_size, base_model_output_dir, + num_medusa_heads, num_medusa_layers, max_draft_len, + medusa_hidden_act, medusa_model_dir, + quant_medusa_head): + + with open(f"{medusa_model_dir}/config.json", "r") as fp: + medusa_config = json.load(fp) + + num_medusa_heads_from_config = medusa_config.get('medusa_num_heads', + num_medusa_heads) + num_medusa_layers = medusa_config.get('medusa_num_layers', + num_medusa_layers) + if num_medusa_heads is None: + num_medusa_heads = num_medusa_heads_from_config + + assert max_draft_len > 0, "should have max_draft_len > 0" + + world_size = tp_size * pp_size + # Process for each rank + for rank in range(world_size): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=tp_size, + pp_size=pp_size) + # 1. Load medusa weight for each rank + from tensorrt_llm.models.medusa.weight import load_medusa_hf + medusa_weights = load_medusa_hf(medusa_path=medusa_model_dir, + num_medusa_heads=num_medusa_heads, + num_medusa_layers=num_medusa_layers, + mapping=mapping, + dtype="float16") + # 2. Load base model safetensors (after quant) + base_model_weights = load_file( + f"{base_model_output_dir}/rank{rank}.safetensors") + + # 3. Combine and save weight + base_model_weights.update(medusa_weights) + save_file(base_model_weights, + f"{base_model_output_dir}/rank{rank}.safetensors") + + # 4. 
Add medusa config into config.json + with open(f"{base_model_output_dir}/config.json", 'r') as f: + base_model_config = json.load(f) + f.close() + + with open(f"{base_model_output_dir}/config.json", 'w') as f: + base_model_config['architecture'] = "MedusaForCausalLM" + base_model_config['quantization']['exclude_modules'] = [ + 'lm_head', + '*router', + '*vocab_embedding', + '*position_embedding', + '*block_embedding', + ] + if not quant_medusa_head: + base_model_config['quantization']['exclude_modules'].append( + '*medusa_heads*') + + base_model_config['max_draft_len'] = max_draft_len + base_model_config['num_medusa_heads'] = num_medusa_heads + base_model_config['num_medusa_layers'] = num_medusa_layers + json.dump(base_model_config, f, indent=4) + + torch.cuda.empty_cache() + print("Combine medusa heads' weight, done.") + + +def quantize_and_export(*, + model_dir, + device, + calib_dataset, + dtype, + qformat, + kv_cache_dtype, + calib_size, + batch_size, + calib_max_seq_length, + awq_block_size, + output_dir, + tp_size, + pp_size, + seed, + tokenizer_max_seq_length, + num_medusa_heads=None, + num_medusa_layers=None, + max_draft_len=None, + medusa_hidden_act=None, + medusa_model_dir=None, + quant_medusa_head=None): ''' Load model from the model_dir, call Modelopt to quantize the model, and then export the quantized model as TRT-LLM checkpoint @@ -419,24 +507,16 @@ def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat, with open(f"{export_path}/config.json", "w") as f: json.dump(tensorrt_llm_config, f, indent=4) - if model_type == 'phi': - with open(f"{export_path}/config.json", "r") as f: - tensorrt_llm_config = json.load(f) - phi_config = AutoConfig.from_pretrained(model_dir, - trust_remote_code=True) - - from ..models.phi3.phi3small.convert import \ - convert_hf_config as phi_config_converter - phi_config = phi_config_converter(phi_config, dtype, None) - - for key, value in phi_config.items(): - tensorrt_llm_config[key] = value - - with open(f"{export_path}/config.json", "w") as f: - json.dump(tensorrt_llm_config, f, indent=4) - torch.cuda.empty_cache( ) # otherwise torch is keeping using GPU, other routine like build engine has less free GPU to use + + # Workaround for combining medusa head + # TODO: move these integration into modelopt to avoid redundant reading and writing + if medusa_model_dir is not None: + combine_medusa_weight(tp_size, pp_size, export_path, + num_medusa_heads, num_medusa_layers, + max_draft_len, medusa_hidden_act, + medusa_model_dir, quant_medusa_head) end_time = time.time() print( "Quantized model exported to {} \nTotal time used {:.2f} s.".format( diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 12206c450..cb051e10d 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
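
The combine_medusa_weight helper above operates directly on the exported checkpoint directory: for each rank it merges the Medusa head tensors into the quantized base weights, then rewrites config.json so the checkpoint is treated as a Medusa model. Below is a minimal sketch of that per-rank merge, assuming the Medusa tensors have already been loaded into a plain dict; the function names and paths are illustrative, not part of the library.

import json
import os

from safetensors.torch import load_file, save_file


def merge_medusa_into_rank(checkpoint_dir: str, rank: int, medusa_weights: dict) -> None:
    # Load the quantized base-model tensors for this rank, add the Medusa
    # head tensors on top, and write the combined file back in place.
    rank_file = os.path.join(checkpoint_dir, f"rank{rank}.safetensors")
    base_weights = load_file(rank_file)
    base_weights.update(medusa_weights)
    save_file(base_weights, rank_file)


def patch_medusa_config(checkpoint_dir: str, max_draft_len: int,
                        num_medusa_heads: int, num_medusa_layers: int) -> None:
    # Mark the checkpoint as a Medusa model so the engine builder picks up
    # the extra heads and the draft length.
    config_path = os.path.join(checkpoint_dir, "config.json")
    with open(config_path) as f:
        config = json.load(f)
    config["architecture"] = "MedusaForCausalLM"
    config["max_draft_len"] = max_draft_len
    config["num_medusa_heads"] = num_medusa_heads
    config["num_medusa_layers"] = num_medusa_layers
    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)
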
-__version__ = "0.11.0.dev2024061800" +__version__ = "0.11.0.dev2024062500" diff --git a/tests/model/test_mamba.py b/tests/model/test_mamba.py index ce5498361..91c1cdfe7 100644 --- a/tests/model/test_mamba.py +++ b/tests/model/test_mamba.py @@ -84,10 +84,12 @@ def _gen_tensorrt_llm_network(self, network, hf_config, hf_path, hf_mamba, hf_config, hf_path, hf_mamba, load_mode, dtype) with net_guard(network): network.set_named_parameters(tensorrt_llm_mamba.named_parameters()) - inputs = tensorrt_llm_mamba.prepare_inputs(batch_size, - input_len, - input_len + output_len, - use_cache=False) + inputs = tensorrt_llm_mamba.prepare_inputs( + batch_size, + input_len, + input_len + output_len, + max_num_tokens=batch_size * input_len, + use_cache=False) # Prepare tensorrt_llm_mamba(**inputs) return network diff --git a/tests/test_llama_conversion.sh b/tests/test_llama_conversion.sh index 8b55cd7a8..063fc8d14 100755 --- a/tests/test_llama_conversion.sh +++ b/tests/test_llama_conversion.sh @@ -80,7 +80,7 @@ test_gptq() { python convert_checkpoint.py --model_dir ${MODEL} \ --output_dir ./tllm_checkpoint/2gpu_gptq \ --dtype float16 \ - --modelopt_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \ + --quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \ --use_weight_only \ --weight_only_precision int4_gptq \ --per_group \
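
The mamba test update above mirrors the prepare_inputs signature change earlier in this diff: max_num_tokens is now supplied by the caller instead of being derived inside the model. As a small sketch, the helper below reproduces the default that the removed code used to compute; the function itself is hypothetical and not part of the codebase.

def default_max_num_tokens(max_batch_size: int,
                           max_input_len: int,
                           max_beam_width: int = 1,
                           max_draft_len: int = 0) -> int:
    # Mirrors the default removed from prepare_inputs: enough tokens for a
    # full context batch, or for a full generation step with draft tokens.
    context_tokens = max_input_len * max_batch_size
    generation_tokens = max_beam_width * (max_draft_len + 1) * max_batch_size
    return max(context_tokens, generation_tokens)


# The test above passes batch_size * input_len, which matches this default
# when beam width is 1 and no draft tokens are used.
print(default_max_num_tokens(4, 128))  # 512
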