diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index 02abc189b..eeb7da68d 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -165,6 +165,9 @@ struct BenchmarkParams // Weights offloading float gpuWeightsPercent{1.0}; + + // Decoding params + std::optional<std::vector<std::vector<SizeType32>>> medusaChoices; }; class InferenceRequestsSyncSend @@ -791,6 +794,10 @@ class ExecutorServer executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value()); } + executorConfig.setDecodingConfig(texec::DecodingConfig( + benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(), + std::nullopt, benchmarkParams.medusaChoices)); + mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig); if (logIterationData) @@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType optionalParams.maxBeamWidth = beamWidth; optionalParams.maxBatchSize = benchmarkParams.maxBatchSize; optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy}; + optionalParams.decodingConfig = texec::DecodingConfig( + benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(), + std::nullopt, benchmarkParams.medusaChoices); auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json"); SizeType32 deviceCount{0}; @@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m } } +std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input) +{ + std::vector<std::vector<SizeType32>> result; + std::regex outer_regex(R"(\[(.*?)\])"); + std::regex inner_regex(R"(\d+)"); + auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex); + auto outer_end = std::sregex_iterator(); + + for (std::sregex_iterator i = outer_begin; i != outer_end; ++i) + { + std::smatch match = *i; + std::string inner_str = match.str(1); + std::vector<SizeType32> inner_vec; + auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex); + auto inner_end = std::sregex_iterator(); + + for (std::sregex_iterator j = inner_begin; j != inner_end; ++j) + { + std::smatch inner_match = *j; + inner_vec.push_back(std::stoi(inner_match.str())); + } + result.push_back(inner_vec); + } + return result; +} + } // namespace int main(int argc, char* argv[]) @@ -1692,6 +1728,8 @@ int main(int argc, char* argv[]) options.add_options()("gpu_weights_percent", "Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).", cxxopts::value<float>()->default_value("1.0")); + options.add_options()( + "medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>()); auto result = options.parse(argc, argv); @@ -1823,6 +1861,12 @@ int main(int argc, char* argv[]) // Argument: If offloaded blocks should be onboarded to primary memory before they are reused. benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>(); + // Argument: Medusa choices for the Medusa speculative decoding.
+ if (result.count("medusa_choices")) + { + benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as()); + } + std::optional padId; // Argument: Padding token id if (result.count("pad_id")) diff --git a/benchmarks/python/build.py b/benchmarks/python/build.py index 07447aaa8..c5fb39888 100644 --- a/benchmarks/python/build.py +++ b/benchmarks/python/build.py @@ -944,6 +944,7 @@ def build_gpt(args): network = builder.create_network() network.trt_network.name = engine_name network.plugin_config.to_legacy_setting() + network.plugin_config.dtype = args.dtype # Plugins if args.mode in ['plugin', 'plugin-ifb']: diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 4c834e01a..fe7949a5b 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1 -size 3931604 +oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d +size 3931504 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 4c834e01a..8d15a4cc4 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1 -size 3931604 +oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca +size 3989220 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index a23bfa496..d1e552ba5 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a -93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a -c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file +f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a +dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a +265b039443334094026fbd8f396d52fe29c2d9d1 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 2438054cd..09e9e4105 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4 -size 3812158 +oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542 +size 3812960 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a 
b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index a88fe4a7f..effd33642 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439 -size 3773896 +oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84 +size 3772832 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index 2710c6005..04b00e8b8 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03 -size 22381766 +oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78 +size 22387798 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index 69936aaee..932e0f372 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe -size 1396404 +oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20 +size 1397814 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 69936aaee..926f354f0 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe -size 1396404 +oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855 +size 1423172 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index 698fe4058..fce860082 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a -7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a -c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file +e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a +f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a +265b039443334094026fbd8f396d52fe29c2d9d1 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index eee7ed862..702aee286 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ 
b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218 -size 1448772 +oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed +size 1450288 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index b6e842a0f..24277fcfa 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50 -size 1375430 +oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02 +size 1377018 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index f20e66567..05a1eccbc 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532 -size 14031364 +oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86 +size 14039826 diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h index 32fefcd22..3a0b72e5a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h @@ -424,7 +424,8 @@ std::vector MoeGemmRunner: template bool MoeGemmRunner::isHopperSpecialised() const { - bool config_is_sm90 = best_config_ && best_config_->is_sm90; + TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config"); + bool config_is_sm90 = best_config_->is_sm90; return supportsHopperSpecialisation() && config_is_sm90; } @@ -440,7 +441,7 @@ int MoeGemmRunner::getSM() const return this->sm_; } -// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction +// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction template bool MoeGemmRunner::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const { diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index 236936439..a44f5448b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so -c0bd2b69c932257678a2aad9bd8baba4b291795e commit \ No newline at end of file +265b039443334094026fbd8f396d52fe29c2d9d1 commit \ No newline at end of file diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll index 964f3f9b0..8d3409955 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3 +oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7 size 1080832 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16 .cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu similarity index 100% rename from cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16 .cu rename to cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu index 7b37b25ea..f016d2876 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu @@ -1072,10 +1072,38 @@ std::vector CutlassMoeFCRunner::getWo size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0; size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node); - std::vector workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size, - total_rows_before_expert_size, softmax_out_size, glu_inter_size, + // We do some overlapping of the large workspace buffers. 
Although we could overlap some of the other buffers, they + // are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother + // in the case of fused activation we overlap permuted_data and fc2_result + // in the case of unfused activation we overlap permuted_data and fc1_result + // we need to calculate the max possible size, so use the max of all three + size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size); + // When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case + if (glu_inter_elems > 0) + { + overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size); + } + + // if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself + size_t overlapped_gemm1_gemm2_outputs = fc1_result_size; + if (glu_inter_elems > 0) + { + overlapped_gemm1_gemm2_outputs + = std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs); + } + + std::vector workspace{ // + source_rows_size, // + permuted_rows_size, // + permuted_experts_size, // + total_rows_before_expert_size, // + softmax_out_size, // + sorter_size, // // These pointers reuse the same memory - std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size}; + overlapped_gemm1_gemm2_inputs, // + overlapped_gemm1_gemm2_outputs, // + hopper_size, // + gemm_workspace_size}; return workspace; } @@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(i TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size"); auto workspace = getWorkspaceBufferSizes( num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type); - return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size()); + auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size()); + TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f); + return ws_size; } template @@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner::configureWsPtrs(char source_rows_ = (int*) ws_sliced[0]; permuted_rows_ = (int*) ws_sliced[1]; permuted_experts_ = (int*) ws_sliced[2]; - permuted_data_ = (T*) ws_sliced[3]; - total_rows_before_expert_ = (int64_t*) ws_sliced[4]; + total_rows_before_expert_ = (int64_t*) ws_sliced[3]; softmax_out_ = nullptr; bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - softmax_out_ = (float*) ws_sliced[5]; + softmax_out_ = (float*) ws_sliced[4]; } - glu_inter_result_ = (T*) ws_sliced[6]; + sorter_ws_ = (char*) ws_sliced[5]; - // These pointers are aliased. 
Since the sort ws can be overwritten after it is finished - sorter_ws_ = (char*) ws_sliced[7]; - fc1_result_ = (T*) ws_sliced[7]; + // Always 6, but overlapped with either fc1_result_ or fc2_result_ + permuted_data_ = (T*) ws_sliced[6]; - fc2_result_ = (T*) ws_sliced[8]; + bool const is_gated_activation = isGatedActivation(activation_type); + bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size); + bool const using_hopper = moe_gemm_runner_.isHopperSpecialised(); + bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation); + bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation; + bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu; + // Always 7, ignored if not needed + glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr; + + // fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped + fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7]; + fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6]; hopper_grouped_gemm_input_ = {}; if (moe_gemm_runner_.isHopperSpecialised()) { - hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]); + hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]); } } @@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner::runMoe(void const* i } else { + // Run the GEMM with activation function overridden with `Identity`, we do the activation separately ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity; T* gemm_result = (use_fused_moe) ? 
fc1_result_ : static_cast(glu_inter_result_); diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index 9f7799dee..379eb7642 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m) &tle::DecodingConfig::setLookaheadDecoding) .def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices); - auto executorConfigGetState = [&](tle::ExecutorConfig const& self) + auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate, + ¶llelConfigGetstate](tle::ExecutorConfig const& self) { py::object peftCacheConfigState = py::none(); @@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m) peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(), self.getDecodingConfig(), self.getGpuWeightsPercent()); }; - auto executorConfigSetState = [&](py::tuple state) + auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate, + ¶llelConfigSetstate](py::tuple state) { if (state.size() != 15) { diff --git a/cpp/tensorrt_llm/runtime/medusaModule.cpp b/cpp/tensorrt_llm/runtime/medusaModule.cpp index 2730977c3..d889dcdfa 100644 --- a/cpp/tensorrt_llm/runtime/medusaModule.cpp +++ b/cpp/tensorrt_llm/runtime/medusaModule.cpp @@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st if (curDepth != depth) { TLLM_CHECK(depth + 1 == curDepth); + TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(), + "Medusa choices require more Medusa heads than the engine was built with."); // Save TopK topKs[depth - 1] = maxTopK; + // Accumulate TopK for global indexing in tree globalNodeInTreeIdx += maxTopK; diff --git a/cpp/tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/kernels/mixtureOfExpertsTest.cu index a23ce3d3e..27e6a46ce 100644 --- a/cpp/tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/kernels/mixtureOfExpertsTest.cu @@ -257,7 +257,7 @@ protected: template T* allocBuffer(size_t size) { - managed_buffers.emplace_back(mBufferManager->managed(size * sizeof(T))); + managed_buffers.emplace_back(mBufferManager->gpu(size * sizeof(T))); EXPECT_EQ(cudaGetLastError(), cudaSuccess) << "Error allocating buffer of size: " << size; T* ptr = static_cast(managed_buffers.back()->data()); return ptr; @@ -268,15 +268,27 @@ protected: this->managed_buffers.clear(); // Make sure all the previous buffers are freed check_cuda_error(cudaDeviceSynchronize()); // Sync to make sure all previous operations are resolved - size_t weight_size = hidden_size * hidden_size * 4 * num_experts * sizeof(WeightType); - // Skip the test if the GPU does not have enough memory - size_t workspace_size = this->mMoERunner.getWorkspaceSize( + // Calculate the size contributions for all the large buffers to check if the GPU has enough space + bool const is_gated = tensorrt_llm::isGatedActivation(mActType); + size_t const num_gemms = 2 + is_gated; + // Expert weights + size_t const weight_size = hidden_size * (hidden_size * 4) * num_experts * sizeof(WeightStorage) * num_gemms; + // Workspace size + size_t const workspace_size = this->mMoERunner.getWorkspaceSize( num_tokens, hidden_size, hidden_size * 4, num_experts, k, this->mActType, {}); + // The input/output buffers + size_t const in_out_size = 2 * num_tokens * hidden_size * sizeof(DataType); - size_t total_size = workspace_size + 
weight_size * 2; + // This should be correct to within 100MiB (on tests with 30GiB total) + size_t const total_size = workspace_size + weight_size + in_out_size; + size_t const memory_pool_free_mem_size = mBufferManager->memoryPoolFree(); auto const [freeMem, totalMem] = tensorrt_llm::common::getDeviceMemoryInfo(false); - return freeMem >= total_size; + float const freeMemBuffer = 0.9f; // Add some buffer so we aren't completely pushing the limits + std::cout << "Free memory is: " << freeMem << ", memory pool size is: " << memory_pool_free_mem_size + << ", required memory is: " << total_size << ", device total memory capacity: " << totalMem + << std::endl; + return (freeMem + memory_pool_free_mem_size) * freeMemBuffer >= total_size; } void initBuffersPermute(std::vector> h_hidden_states, @@ -362,7 +374,10 @@ protected: initFP8Scales(mMaxInput); } - mTpExpertScratch = allocBuffer(mTpExpertScratchSize); + if (parallelism_config.tp_size > 1 || parallelism_config.ep_size > 1) + { + mTpExpertScratch = allocBuffer(mTpExpertScratchSize); + } mActiveRows = mTotalTokens; mFinished = nullptr; @@ -475,10 +490,18 @@ protected: ASSERT_NE(mExpertFP8Scale1, nullptr); ASSERT_NE(mExpertFP8Scale2, nullptr); ASSERT_NE(mExpertFP8Scale3, nullptr); + // Dequant values for each expert are 1/(w_i*a_i) calculated above - std::fill_n(mExpertFP8Scale1, mNumExperts, 1.f / (scaleW1 * scaleAct1)); - std::fill_n(mExpertFP8Scale3, mNumExperts, 1.f / (scaleW2 * scaleAct2)); - *mExpertFP8Scale2 = scaleAct2; + std::vector scales_1(mNumExperts, 1.f / (scaleW1 * scaleAct1)); + std::vector scales_2(1, scaleAct2); + std::vector scales_3(mNumExperts, 1.f / (scaleW2 * scaleAct2)); + + check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale1, scales_1.data(), scales_1.size() * sizeof(float), + cudaMemcpyHostToDevice, mStream->get())); + check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale2, scales_2.data(), scales_2.size() * sizeof(float), + cudaMemcpyHostToDevice, mStream->get())); + check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale3, scales_3.data(), scales_3.size() * sizeof(float), + cudaMemcpyHostToDevice, mStream->get())); check_cuda_error(cudaStreamSynchronize(mStream->get())); } @@ -561,6 +584,13 @@ protected: void* ep_scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2; void* ep_scale_3 = FP8 ? 
mExpertFP8Scale3 : nullptr; + // Handle the case with no parallelism to not require the extra alloc + if (parallelism_config.tp_size == 1 && parallelism_config.ep_size == 1) + { + return std::tuple{ + mExpertWeight1, mExpertWeight2, mExpertBias1, mExpertBias2, ep_scale_1, ep_scale_2, ep_scale_3}; + } + // Slice weights for EP size_t const gated_inter = mInterSize * mGatedMultiplier; size_t const experts_per_node = mNumExperts / parallelism_config.ep_size; diff --git a/cpp/tests/resources/scripts/build_medusa_engines.py b/cpp/tests/resources/scripts/build_medusa_engines.py index 18eccf932..a8c000d2f 100755 --- a/cpp/tests/resources/scripts/build_medusa_engines.py +++ b/cpp/tests/resources/scripts/build_medusa_engines.py @@ -28,7 +28,7 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path, covert_cmd = [_sys.executable, "examples/medusa/convert_checkpoint.py"] + ( ['--model_dir', str(weight_dir)] if weight_dir else []) + [ '--medusa_model_dir', str(medusa_dir), \ - '--output_dir', str(engine_dir), '--dtype=float16', '--fixed_num_medusa_heads=4' + '--output_dir', str(engine_dir), '--dtype=float16', '--num_medusa_heads=4' ] + list(args) run_command(covert_cmd) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 4458be56d..db3509bd2 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,6 +1,6 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=24.04-py3 +ARG BASE_TAG=24.05-py3 ARG DEVEL_IMAGE=devel FROM ${BASE_IMAGE}:${BASE_TAG} as base diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 9a683314d..d12a1b9f5 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,8 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 -TORCH_VERSION="2.3.0" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 +TORCH_VERSION="2.3.1" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index bc2ea72da..3649b612c 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,12 +2,12 @@ set -ex -TRT_VER="10.0.1.6" +TRT_VER="10.1.0.27" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 CUDA_VER="12.4" # 12.4.1 # Keep the installation for cuDNN if users want to install PyTorch with source codes. -# PyTorch 2.3.0 can compile with cuDNN v9. +# PyTorch 2.3.x can compile with cuDNN v9. 
CUDNN_VER="9.1.0.70-1" NCCL_VER="2.21.5-1+cuda12.4" CUBLAS_VER="12.4.5.8-1" @@ -86,8 +86,7 @@ install_tensorrt() { if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi - RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz - + RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index d3d96a128..3ad64739d 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -44,9 +44,9 @@ The following table shows the supported software for TensorRT-LLM. * - - Software Compatibility * - Container - - [24.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) + - [24.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) * - TensorRT - - [10.0](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) + - [10.1](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) * - Precision - - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4 diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index 79e4be3a4..99ba44037 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -23,10 +23,11 @@ All published functionality in the Release Notes has been fully tested and verif - TBD ### Infrastructure changes - - Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.04-py3`. - - Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.04-py3`. + - Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.05-py3`. + - Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.05-py3`. + - The dependent TensorRT version is updated to 10.1.0. - The dependent CUDA version is updated to 12.4.1. - - The dependent PyTorch version is updated to 2.3.0. + - The dependent PyTorch version is updated to 2.3.1. ## TensorRT-LLM Release 0.10.0 diff --git a/docs/source/speculative_decoding.md b/docs/source/speculative_decoding.md index 5359e2193..8864e0511 100644 --- a/docs/source/speculative_decoding.md +++ b/docs/source/speculative_decoding.md @@ -15,10 +15,10 @@ If the first assumption holds true, the latency of speculative decoding will no The combination of both these allows speculative decoding to result in reduced latency. TensorRT-LLM supports several approaches for generating draft tokens, including: + 1. Utilizing a smaller, auxiliary model, known as the draft model approach. For more information, refer to the [Fast Inference from Transformers via Speculative Decoding paper](https://arxiv.org/pdf/2211.17192.pdf). 2. Implementing additional language model heads that predict tokens for future positions, as detailed in the [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774). 
- ## Performance Improvements It's important to note that the effectiveness of speculative decoding techniques is highly dependent @@ -56,6 +56,166 @@ it is advisable to enable KV cache reuse for both models. This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command and setting `enableBlockReuse=true` in the `KVCacheConfig`. +## Using Draft model approach with Triton Inference Server + ++ The draft model approach has been supported since TensorRT-LLM-0.7.0 (using two separate Triton servers to serve the draft and target models respectively), and was significantly optimized in TensorRT-LLM-0.10.0 (using a single Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS). ++ The source file of the draft model approach with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py). ++ This example is based on TensorRT-LLM-0.10.0 and TRTLLM-backend-0.10.0, using the Docker image `nvcr.io/nvidia/tritonserver:24.05-trtllm-py3`. ++ Llama-7B-hf and Llama-30B-hf are used as the draft and target models respectively in this example, assuming the paths to the model repositories are `DRAFT_MODEL_PATH` and `TARGET_MODEL_PATH`. ++ The maximum number of draft tokens is set to 10 in this example. + +1. Prepare TensorRT engine for inference + + Here are the commands to build the draft / target engines in FP16 or FP8. All combinations of data types (Draft-FP16/FP8 + Target-FP16/FP8) are supported. + + `--remove_input_padding=enable --paged_kv_cache=enable` are necessary for inflight-batching. + + `--context_fmha=enable --use_paged_context_fmha=enable` are optional, but recommended for performance. + + `--gather_generation_logits` is necessary if using generation logits for selecting tokens in the target model. + + `--tp_size` can be set if using TP mode for the draft / target model. + + A `--max_batch_size` greater than 1 is acceptable in general usage, but we use 1 in this example.
+ + ```bash + export MAX_DRAFT_LENGTH=10 + export COMMON_COMMAND="--max_batch_size=1 --max_input_len=2048 --max_output_len=1024 --gpt_attention_plugin=float16 --gemm_plugin=float16 --remove_input_padding=enable --paged_kv_cache=enable --context_fmha=enable --use_paged_context_fmha=enable --gather_generation_logits" + export DRAFT_COMMAND_FP16="$COMMON_COMMAND" + export TARGET_COMMAND_FP16="$DRAFT_COMMAND_FP16 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external" + export DRAFT_COMMAND_FP8="$COMMON_COMMAND --strongly_typed --use_fp8_context_fmha=enable" + export TARGET_COMMAND_FP8="$DRAFT_COMMAND_FP8 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external" + + # Build checkpoints and engines in tensorrt_llm/examples/llama/ + # FP16 mode + export DRAFT_NAME=llama-7b-fp16-tp1 + export TARGET_NAME=llama-30b-fp16-tp1 + python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1 + python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1 + trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP16 + trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP16 + export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME + export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME + + # FP8 mode + export DRAFT_NAME=llama-7b-fp8-tp1 + export TARGET_NAME=llama-30b-fp8-tp1 + python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1 + python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1 + trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP8 + trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP8 + export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME + export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME + ``` + +2. Edit Triton configuration + + If both the draft and target models can be placed on one GPU (for example, llama-7B-FP8 + llama-30B-FP8, roughly 40GiB in total on one H100-80GiB GPU), `DRAFT_GPU_DEVICE_IDS` and `TARGET_GPU_DEVICE_IDS` can be the same, for example `0`. This appears to perform better than placing the models on two separate GPUs. + + Otherwise, the draft and target models can be placed on different GPUs, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1"`. + + Furthermore, if TP mode is used, the value of `GPU_DEVICE_IDS` can be a list, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1,2,3,4"`. + + For more details on configuring and launching models with the Triton server, please visit the [TensorRT-LLM Backend repo](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/README.md).
+ + ```bash + ACCUMULATE_TOKEN="false" + BACKEND="tensorrtllm" + BATCH_SCHEDULER_POLICY="guaranteed_no_evict" + BATCHING_STRATEGY="inflight_fused_batching" + BLS_INSTANCE_COUNT="1" + DECODING_MODE="top_k_top_p" + DECOUPLED_MODE="False" + DRAFT_GPU_DEVICE_IDS="0" + E2E_MODEL_NAME="ensemble" + ENABLE_KV_CACHE_REUSE="true" + ENGINE_PATH=$TARGET_ENGINE_PATH + EXCLUDE_INPUT_IN_OUTPUT="false" + KV_CACHE_FREE_GPU_MEM_FRACTION="0.8" + MAX_ATTENTION_WINDOW_SIZE="" + MAX_BEAM_WIDTH="1" + MAX_QUEUE_DELAY_MICROSECONDS="0" + MAX_TOKENS_IN_KV_CACHE="" + NORMALIZE_LOG_PROBS="true" + POSTPROCESSING_INSTANCE_COUNT="1" + PREPROCESSING_INSTANCE_COUNT="1" + TARGET_GPU_DEVICE_IDS="1" + TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft" + TENSORRT_LLM_MODEL_NAME="tensorrt_llm" + TOKENIZER_PATH=$DRAFT_MODEL_PATH + TOKENIZER_TYPE=llama + TRITON_GRPC_PORT="8001" + TRITON_HTTP_PORT="8000" + TRITON_MAX_BATCH_SIZE="4" + TRITON_METRICS_PORT="8002" + TRITON_REPO="triton_repo" + USE_DRAFT_LOGITS="false" + + # Make a copy of triton repo and replace the fields in the configuration files + cd /tensorrtllm_backend/ + apt-get update && apt-get install -y build-essential cmake git-lfs + pip3 install git-lfs tritonclient grpcio + rm -rf ${TRITON_REPO} + cp -R all_models/inflight_batcher_llm ${TRITON_REPO} + python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE} + python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT} + python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT} + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME} + + # Make a copy of tensorrt_llm as configurations of draft / target models. 
+ cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft + sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${TARGET_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE} + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${DRAFT_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE} + ``` + +3. Launch Triton server + + `--multi-model` is necessary if TP mode is used for target model. + + ```bash + python3 scripts/launch_triton_server.py \ + --model_repo=${TRITON_REPO} \ + --tensorrt_llm_model_name "tensorrt_llm,tensorrt_llm_draft" \ + --multi-model \ + --log & + ``` + + + Verbose log will be written in to file `triton_log.txt`. Triton server launches successfully if you see the output below in the file: + + ```txt + Started HTTPService at 0.0.0.0:8000 + Started GRPCInferenceService at 0.0.0.0:8001 + Started Metrics Service at 0.0.0.0:8002 + ``` + +4. Send Requests + + Prepare a JSON file `input_data.json` containing input data as below (more requests are acceptable). + + ```json + [ + { + "input": "James Best, best known for his ", + "instruction": "Continue writing the following story:", + "output": " " + } + ] + ``` + + + Use command below to launch requests for inference. + + `--num-draft-tokens` can be modified by runtime draft lengths, 4 is used in this example. + + ```bash + python3 tools/inflight_batcher_llm/speculative_decoding_test.py \ + --max-input-len 2048 \ + --dataset=input_data.json \ + --url-target=localhost:8001 \ + --url-draft=localhost:8001 \ + --draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \ + --target-tensorrt-llm-model-name="${TENSORRT_LLM_MODEL_NAME}" \ + --bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \ + --execute-bls-speculative-decoding \ + --disable-output-comparison \ + --num-draft-tokens=4 \ + --verbose + ``` + +5. 
Kill Tritonserver after finishing inference + + ```bash + pkill -9 -f trtllmExecutorWorker + pkill -9 -f tritonserver + ``` + # Medusa This approach leverages a single model to both generate and verify draft tokens. diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index f51033e19..89b722b3e 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index 047fb0e86..5626d3984 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index 58286c4c4..3236169a0 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/cogvlm/convert_checkpoint.py b/examples/cogvlm/convert_checkpoint.py index fec672fe3..a52df63d7 100644 --- a/examples/cogvlm/convert_checkpoint.py +++ b/examples/cogvlm/convert_checkpoint.py @@ -119,7 +119,7 @@ def parse_arguments(): 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' ) parser.add_argument( - '--modelopt_quant_ckpt_path', + '--quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint in .npz format') @@ -438,7 +438,7 @@ def covert_and_save(rank): if args.use_weight_only and args.weight_only_precision == 'int4_gptq': weights = load_weights_from_gptq( - args.modelopt_quant_ckpt_path, + args.quant_ckpt_path, PretrainedConfig.from_dict(copy.deepcopy(config)), ) diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index cd04908b5..5de3a3747 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 567d1d89d..77d72b907 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/convert_checkpoint.py b/examples/gemma/convert_checkpoint.py index 38895306f..2e3e3ba41 100644 --- a/examples/gemma/convert_checkpoint.py +++ b/examples/gemma/convert_checkpoint.py @@ -71,7 +71,7 @@ def parse_arguments(): "By default, we use dtype for KV cache. 
fp8_kv_cache chooses fp8 quantization for KV", ) parser.add_argument( - "--modelopt_quant_ckpt_path", + "--quant_ckpt_path", default=None, help= "Path of a directory to quantized model checkpoints in .safetensors format or \ @@ -944,7 +944,7 @@ def convert(worker_rank, args, convert_kwargs): weight_scales = quantize_fp8_weights( weights, trt_llm_config.num_hidden_layers, trt_llm_config.mapping) - scales = load_from_fp8_gemma(args.modelopt_quant_ckpt_path, + scales = load_from_fp8_gemma(args.quant_ckpt_path, trt_llm_config.num_hidden_layers, trt_llm_config.mapping, args.fp8_kv_cache, weight_scales) diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index 9a2afb89d..296bc6255 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,7 +3,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 954293a71..03735ad42 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index 58af19bd7..9cd6ac378 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/README.md b/examples/gptneox/README.md index d7c48483f..9d77d8d58 100644 --- a/examples/gptneox/README.md +++ b/examples/gptneox/README.md @@ -167,7 +167,7 @@ sh gptq_convert.sh ### 3. Convert weights from HF Transformers to TensorRT-LLM format To apply groupwise quantization GPTQ, addition command-line flags need to be passed to `convert_checkpoint.py`: -Here `--modelopt_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script. +Here `--quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script. 
```bash # Single GPU @@ -175,7 +175,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \ --dtype float16 \ --use_weight_only \ --weight_only_precision int4_gptq \ - --modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ + --quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ --output_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/ # With 2-way Tensor Parallel python3 convert_checkpoint.py --model_dir ./gptneox_model \ @@ -184,7 +184,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \ --weight_only_precision int4_gptq \ --tp_size 2 \ --workers 2 \ - --modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ + --quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \ --output_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/ ``` diff --git a/examples/gptneox/convert_checkpoint.py b/examples/gptneox/convert_checkpoint.py index d4064f981..67bf72772 100644 --- a/examples/gptneox/convert_checkpoint.py +++ b/examples/gptneox/convert_checkpoint.py @@ -50,7 +50,7 @@ def parse_arguments(): 'Define the precision for the weights when using weight-only quantization.' 'You must also use --use_weight_only for that argument to have an impact.' ) - parser.add_argument('--modelopt_quant_ckpt_path', + parser.add_argument('--quant_ckpt_path', type=str, default=None, help='Path of a quantized model checkpoint') @@ -708,8 +708,7 @@ def convert_hf_gptneox(hf_model, 'has_zero_point': True, 'group_size': - get_gptq_gptneox_group_size(args.modelopt_quant_ckpt_path, - hf_config) + get_gptq_gptneox_group_size(args.quant_ckpt_path, hf_config) }) with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: @@ -723,7 +722,7 @@ def covert_and_save(rank): if args.use_weight_only and args.weight_only_precision == 'int4_gptq': weights = load_from_gptq_gptneox( - args.modelopt_quant_ckpt_path, + args.quant_ckpt_path, hf_config, use_parallel_embedding=args.use_parallel_embedding, sharding_dim=args.embedding_sharding_dim, diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index 8edbc46e6..b6d9a8b15 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index b6e249ae7..7480cc654 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,6 +1,6 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/high-level-api/requirements.txt b/examples/high-level-api/requirements.txt index 182e2ccc1..7833f8eee 100644 --- a/examples/high-level-api/requirements.txt +++ b/examples/high-level-api/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 6c49a765f..e86e7fd2d 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.5 rouge_score~=0.1.2 
sentencepiece~=0.1.99 diff --git a/examples/llama/README.md b/examples/llama/README.md index 96c9cf8e9..84ebc94c1 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -644,7 +644,7 @@ One can enable AWQ/GPTQ INT4 weight only quantization with these options when bu - `--use_weight_only` enables weight only GEMMs in the network. - `--per_group` enable groupwise weight only quantization, for GPT-J example, we support AWQ with the group size default as 128. - `--weight_only_precision` should specify the weight only quantization format. Supported formats are `int4_awq` or `int4_gptq`. -- `--modelopt_quant_ckpt_path` passes the quantized checkpoint to build the engine. +- `--quant_ckpt_path` passes the quantized checkpoint to build the engine. AWQ/GPTQ examples below involves 2 steps: 1. Weight quantization @@ -700,7 +700,7 @@ To run the GPTQ LLaMa example, the following steps are required: python convert_checkpoint.py --model_dir /tmp/llama-7b-hf \ --output_dir ./tllm_checkpoint_2gpu_gptq \ --dtype float16 \ - --modelopt_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \ + --quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \ --use_weight_only \ --weight_only_precision int4_gptq \ --per_group \ diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index f8e55c97d..814a8c824 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/mamba/README.md b/examples/mamba/README.md index 710bb10f0..d3cfd78d4 100644 --- a/examples/mamba/README.md +++ b/examples/mamba/README.md @@ -29,9 +29,6 @@ Please install required packages first and setup `git-lfs`: ```bash pip install -r requirements.txt -pip install "transformers>=4.39.0" - -# Setup git-lfs git lfs install ``` @@ -98,8 +95,7 @@ The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpo # mamba-2.8b trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin bfloat16 \ - --mamba_conv1d_plugin bfloat16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -108,8 +104,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \ # mamba-1.4b trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -118,8 +113,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \ # mamba-790m trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -128,8 +122,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \ # mamba-370m trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ @@ -138,8 +131,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \ # mamba-130m trtllm-build --checkpoint_dir 
./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ \ --paged_kv_cache disable \ - --gemm_plugin float16 \ - --mamba_conv1d_plugin float16 \ + --gemm_plugin auto \ --max_batch_size 8 \ --max_input_len 924 \ --max_seq_len 1024 \ diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 5d3b4823e..c64cfbf6a 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 +transformers>=4.39.0 datasets~=2.14.5 evaluate rouge_score diff --git a/examples/medusa/README.md b/examples/medusa/README.md index 587363966..1ef2d4377 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -13,6 +13,7 @@ For more info about Medusa visit [speculative decoding documentation](../../docs * GPU Compute Capability >= 8.0 (Ampere or newer) * FP16 * BF16 + * FP8 (base model) * PAGED_KV_CACHE * Tensor Parallel @@ -32,7 +33,7 @@ https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3 ``` We use `convert_checkpoint.py` script to convert the model for Medusa decoding into TensorRT-LLM checkpoint format. -Here we also add `--fixed_num_medusa_heads 4` as `medusa_num_heads` is 2 in `config.json` of `medusa-vicuna-7b-v1.3` but it actually has 4. +We could use `--num_medusa_heads` to set the number of medusa heads that we want to use. If not, `num_medusa_heads` will be set according to the `medusa_num_heads` from medusa weights' `config.json`. Here is the example: ```bash @@ -41,20 +42,21 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ --medusa_model_dir medusa-vicuna-7b-v1.3 \ --output_dir ./tllm_checkpoint_1gpu_medusa \ --dtype float16 \ - --fixed_num_medusa_heads 4 + --num_medusa_heads 4 +# Note: Increasing the batch size may have a negative impact on performance trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \ --output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \ --gemm_plugin float16 \ --speculative_decoding_mode medusa \ - --max_batch_size 8 + --max_batch_size 4 # Convert and Build Medusa decoding support for vicuna-13b-v1.3 with 4-way tensor parallelism. python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \ --medusa_model_dir medusa-vicuna-7b-v1.3 \ --output_dir ./tllm_checkpoint_1gpu_medusa \ --dtype float16 \ - --fixed_num_medusa_heads 4 \ + --num_medusa_heads 4 \ --tp_size 4 \ --workers 4 @@ -62,7 +64,30 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \ --output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \ --gemm_plugin float16 \ --speculative_decoding_mode medusa \ - --max_batch_size 8 + --max_batch_size 4 +``` + +### FP8 Post-Training Quantization for Base Model +The example below quantizes the base model to FP8, while keeping the weight of the medusa head non-quantize. 
+```bash +# Quantize base model into FP8 and export trtllm checkpoint +python ../quantization/quantize.py --model_dir /path/to/base-model-hf/ \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \ + --calib_size 512 \ + --tp_size 1 \ + --medusa_model_dir /path/to/medusa_head/ \ + --num_medusa_heads 4 + +# Build trtllm engines from the trtllm checkpoint +trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \ + --output_dir ./trt_engine_1gpu_base_model_fp8_medusa_fp16 \ + --gemm_plugin float16 \ + --gpt_attention_plugin float16 \ + --speculative_decoding_mode medusa \ + --max_batch_size 4 ``` ### Run diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index b369facfd..86dc8b986 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -19,12 +19,13 @@ from transformers.pytorch_utils import Conv1D import tensorrt_llm -from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import PretrainedConfig from tensorrt_llm.models.convert_utils import load_calib_dataset from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard +from tensorrt_llm.models.medusa.weight import (get_tllm_linear_weight, + load_medusa_hf) from tensorrt_llm.quantization import QuantAlgo try: @@ -108,11 +109,6 @@ def parse_arguments(): help= 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' ) - parser.add_argument( - '--modelopt_quant_ckpt_path', - type=str, - default=None, - help='Path of a quantized model checkpoint in .npz format') parser.add_argument( '--per_group', @@ -182,13 +178,6 @@ def parse_arguments(): help='The number of workers for converting checkpoint in parallel') parser.add_argument('--num_medusa_heads', type=int, default=4) - parser.add_argument( - '--fixed_num_medusa_heads', - type=int, - default=None, - help="If exist, fix medusa_num_heads from config.json." 
- "num_medusa_heads < medusa_num_heads in config.json < fixed_num_medusa_heads" - ) parser.add_argument('--num_medusa_layers', type=int, default=1) parser.add_argument('--max_medusa_token_len', type=int, default=63) parser.add_argument('--medusa_hidden_act', type=str, default="silu") @@ -570,29 +559,6 @@ def get_weight_and_bias(config, prefix, dtype): return get_weight(config, prefix, dtype), get_bias(config, prefix, dtype) -def get_tllm_linear_weight(weight, - prefix, - bias=None, - use_weight_only=False, - plugin_weight_only_quant_type=torch.int8, - postfix='weight'): - results = {} - if use_weight_only: - v = weight.t().contiguous().cpu() - processed_torch_weights, torch_weight_scales = \ - torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix( - v, plugin_weight_only_quant_type) - results[prefix + postfix] = processed_torch_weights - results[prefix + 'per_channel_scale'] = torch_weight_scales - else: - results[prefix + postfix] = weight.contiguous() - - if bias is not None: - results[prefix + 'bias'] = bias - - return results - - def dup_kv_weight(v, num_head, tp_size): assert tp_size % num_head == 0 reps = tp_size // num_head @@ -1189,77 +1155,28 @@ def covert_and_save(rank, convert_args): qkv_para=convert_args['llama_qkv_para'], smoother=convert_args['llama_smoother']) - def load_medusa_hf(medusa_path: str, - mapping=Mapping(), - dtype='float32'): - logger.info("Loading Medusa heads' weights ...") - is_ckpt_safetensors = False - - ckpt_file = Path(medusa_path) / "medusa_lm_head.pt" - if not ckpt_file.exists(): - ckpt_file = Path( - medusa_path) / "medusa_lm_head.safetensors" - is_ckpt_safetensors = True - - if is_ckpt_safetensors: - logger.info("Safetensors Found ...") - from safetensors.torch import load_file - state_dict = load_file(ckpt_file) - else: - state_dict = torch.load(ckpt_file, map_location="cpu") - - torch_dtype = str_dtype_to_torch(dtype) - weights = {} - - for h in range(args.num_medusa_heads): - for l in range(args.num_medusa_layers): - w = state_dict[f"{h}.{l}.linear.weight"].clone().to( - torch_dtype) - - split_v = split(w, mapping.tp_size, mapping.tp_rank) - weights.update( - get_tllm_linear_weight( - split_v, - f'medusa_heads.{h}.medusa_layers.{l}.linear.', - None, args.use_weight_only, - plugin_weight_only_quant_type)) - - b = state_dict[f"{h}.{l}.linear.bias"].clone().to( - torch_dtype) - - weights[ - 'medusa_heads.{}.medusa_layers.{}.linear.bias'. 
- format(h, l)] = split(b, mapping.tp_size, - mapping.tp_rank) - - lm = state_dict[ - f"{h}.{args.num_medusa_layers}.weight"].clone().to( - torch_dtype) # LM Head - - weights['medusa_heads.{}.lm_head.weight'.format( - h)] = split(lm, mapping.tp_size, mapping.tp_rank) - - return weights - if args.medusa_model_dir is not None: config_file = Path(args.medusa_model_dir) / "config.json" with open(config_file) as fp: config = json.load(fp) - args.num_medusa_heads = config.get('medusa_num_heads', - args.num_medusa_heads) + num_medusa_heads_from_config = config.get( + 'medusa_num_heads', args.num_medusa_heads) args.num_medusa_layers = config.get('medusa_num_layers', args.num_medusa_layers) - if args.fixed_num_medusa_heads is not None and args.fixed_num_medusa_heads != args.num_medusa_heads: - logger.info( - f"fixing num_medusa_heads from {args.num_medusa_heads} to {args.fixed_num_medusa_heads}" - ) - args.num_medusa_heads = args.fixed_num_medusa_heads + if args.num_medusa_heads is None: + args.num_medusa_heads = num_medusa_heads_from_config assert args.max_medusa_token_len > 0, "should have max_medusa_token_len > 0" - medusa_weights = load_medusa_hf(args.medusa_model_dir, - mapping, - dtype=args.dtype) + medusa_weights = load_medusa_hf( + medusa_path=args.medusa_model_dir, + num_medusa_heads=args.num_medusa_heads, + num_medusa_layers=args.num_medusa_layers, + mapping=mapping, + dtype=args.dtype, + use_weight_only=args.use_weight_only, + plugin_weight_only_quant_type= + plugin_weight_only_quant_type) weights.update(medusa_weights) safetensors.torch.save_file( diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index b29584c99..f396bbdcc 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 9eb7892b0..6a15339a6 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/mmlu.py b/examples/mmlu.py index 18ea6d4a5..0ce98a7be 100644 --- a/examples/mmlu.py +++ b/examples/mmlu.py @@ -363,11 +363,13 @@ def main(): if args.test_trt_llm: assert not args.test_hf, "Cannot test both TRT-LLM and HF" - runner_cls = ModelRunner if (args.debug_mode - or not PYTHON_BINDINGS) else ModelRunnerCpp + runner_cls = ModelRunner if not PYTHON_BINDINGS else ModelRunnerCpp + runner_kwargs = {} + if PYTHON_BINDINGS: + runner_kwargs.update(max_beam_width=1) model = runner_cls.from_dir(args.engine_dir, rank=runtime_rank, - debug_mode=args.debug_mode) + **runner_kwargs) else: assert args.test_hf, "Must test either TRT-LLM or HF" if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 58af19bd7..9cd6ac378 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index 88383335b..6e9c7e8f7 100644 --- 
a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 transformers==4.40.2 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index 58af19bd7..9cd6ac378 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/README.md b/examples/phi/README.md index 5dd4f7491..a9ee4f2c4 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -15,10 +15,9 @@ models using TensorRT-LLM and run on a single GPU. ## Overview -The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There are two files: +The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file: * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format -* [`postprocess_quant_checkpoint.py`](./postprocess_quant_checkpoint.py) to post-process FP8 or INT8 SmoothQuant quantized checkpoints for Phi-3-small variants. In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation: @@ -29,18 +28,19 @@ In addition, there are two shared files in the parent folder [`examples`](../) f * FP16 * BF16 * FP8 + * INT8 * Tensor Parallel ## Support Matrix -| Model Name | FP16 | BF16 | FP8 | TP | -| :--------------: | :---: | :---: | :---: | :---: | -| phi-2 | Y | Y | | Y | -| Phi-3-mini-4k-instruct | Y | Y | | | -| Phi-3-mini-128k-instruct | Y | Y | | | -| Phi-3-small-8k-instruct | Y | Y | Y | Y | -| Phi-3-small-128k-instruct | Y | Y | Y | Y | -| Phi-3-medium-8k-instruct | Y | Y | | | Y | -| Phi-3-medium-128k-instruct | Y | Y | | | Y | +| Model Name | FP16 | BF16 | FP8 | INT8 | TP | +| :--------------: | :---: | :---: | :---: | :---: | :---: | +| phi-2 | Y | Y | | | Y | +| Phi-3-mini-4k-instruct | Y | Y | Y | Y | +| Phi-3-mini-128k-instruct | Y | Y | Y | Y | +| Phi-3-small-8k-instruct | Y | Y | Y | Y | Y | +| Phi-3-small-128k-instruct | Y | Y | Y | Y | Y | +| Phi-3-medium-8k-instruct | Y | Y | Y | Y | +| Phi-3-medium-128k-instruct | Y | Y | Y | Y | * Model Name: the name of the model, the same as the name on HuggingFace * TP: Tensor Parallel @@ -128,9 +128,9 @@ python3 ../summarize.py --engine_dir ./phi-engine-tp2 \ ``` -### 5. Quantization options for Phi-3-small +### 5. Quantization -Phi-3-small variants support post-training quantization to FP8 and INT8 SmoothQuant formats. +All Phi-3 variants support post-training quantization to FP8 and INT8 SmoothQuant formats. 
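Editorial note: the separate `postprocess_quant_checkpoint.py` pass that the old Phi-3-small instructions required is removed below because the same QKV re-shuffling now happens during checkpoint conversion itself. A minimal sketch of the equivalent logic, assuming the `shuffle_qkv_weights` helper is importable as `tensorrt_llm.models.phi3.split_weights` (absolute path inferred from the relative import added elsewhere in this patch):

```python
# Illustrative sketch only -- not the shipped implementation.
# Quantized Phi-3-small checkpoints no longer need a separate post-processing step;
# fused QKV tensors are reordered to the TRT-LLM layout while converting weights.
from tensorrt_llm.models.phi3.split_weights import shuffle_qkv_weights  # assumed module path


def shuffle_phi3_small_qkv(weights: dict, config: dict) -> dict:
    """Reorder fused QKV tensors from the custom Phi-3-small layout to TRT-LLM's."""
    for key, value in weights.items():
        if "qkv." in key:
            weights[key] = shuffle_qkv_weights(value, config)
    return weights
```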
FP8 checkpoints can be built as follows: @@ -141,8 +141,6 @@ python3 ../quantization/quantize.py \ --output_dir ./phi3-checkpoint \ --dtype $DTYPE \ --qformat fp8 --kv_cache_dtype fp8 - -python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint ``` INT8 checkpoints can be built as follows: @@ -154,8 +152,6 @@ python3 ../quantization/quantize.py \ --output_dir ./phi3-checkpoint \ --dtype $DTYPE \ --qformat int8_sq --kv_cache_dtype int8 - -python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint ``` The commands to [build TensorRT engines](#2-build-tensorrt-engines) from quantized checkpoints diff --git a/examples/phi/convert_checkpoint.py b/examples/phi/convert_checkpoint.py index 30fde7092..6f562952f 100644 --- a/examples/phi/convert_checkpoint.py +++ b/examples/phi/convert_checkpoint.py @@ -19,8 +19,7 @@ from transformers import AutoConfig import tensorrt_llm -from tensorrt_llm.models import (Phi3ForCausalLM, Phi3SmallForCausalLM, - PhiForCausalLM) +from tensorrt_llm.models import Phi3ForCausalLM, PhiForCausalLM def parse_arguments(): @@ -81,16 +80,14 @@ def parse_arguments(): model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) model_type = model_config.architectures[0] - supported_model = { - 'PhiForCausalLM': PhiForCausalLM, - 'Phi3ForCausalLM': Phi3ForCausalLM, - 'Phi3VForCausalLM': Phi3ForCausalLM, - 'Phi3SmallForCausalLM': Phi3SmallForCausalLM - } + supported_models = [ + 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM', + 'Phi3SmallForCausalLM' + ] modelForCausalLM = None - if model_type not in supported_model: + if model_type not in supported_models: assert False, "Invalid model type" - modelForCausalLM = supported_model[model_type] + modelForCausalLM = PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM modelForCausalLM.convert_hf_checkpoint(args.model_dir, dtype=args.dtype, diff --git a/examples/phi/postprocess_quant_checkpoint.py b/examples/phi/postprocess_quant_checkpoint.py deleted file mode 100644 index 8c9cc28bb..000000000 --- a/examples/phi/postprocess_quant_checkpoint.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import json -import time - -import safetensors -from safetensors.torch import save_file - -import tensorrt_llm -from tensorrt_llm.models.phi3.phi3small.convert import shuffle_qkv_weights - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--checkpoint_dir', type=str, default=None) - args = parser.parse_args() - - return args - - -if __name__ == '__main__': - print(tensorrt_llm.__version__) - args = parse_arguments() - tensorrt_llm.logger.set_level('info') - - tik = time.time() - with open(f"{args.checkpoint_dir}/config.json", "r") as f: - config = json.load(f) - - weights = {} - with safetensors.safe_open(f"{args.checkpoint_dir}/rank0.safetensors", - framework="pt") as f: - for k in f.keys(): - weights[k] = f.get_tensor(k) - - # Transform QKV weights from custom Phi3Small format to TRT-LLM format - num_total_heads = config[ - 'num_attention_heads'] + 2 * config['num_key_value_heads'] - for key, value in weights.items(): - if "qkv." in key: - if 'scaling_factor' in key and value.shape[0] % num_total_heads != 0: - continue - weights[key] = shuffle_qkv_weights(value, config) - - save_file(weights, f'{args.checkpoint_dir}/rank0.safetensors') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - print(f'Total time of converting checkpoints: {t}') diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index 0bf746b21..07d57c486 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/quantize.py b/examples/quantization/quantize.py index f1ce69d41..f2fe6f3c9 100644 --- a/examples/quantization/quantize.py +++ b/examples/quantization/quantize.py @@ -90,6 +90,17 @@ help="KV Cache dtype.", default=None, choices=["int8", "fp8", None]) + # Medusa + parser.add_argument('--num_medusa_heads', type=int, default=4) + parser.add_argument('--num_medusa_layers', type=int, default=1) + parser.add_argument('--max_draft_len', type=int, default=63) + parser.add_argument('--medusa_hidden_act', type=str, default="silu") + parser.add_argument('--medusa_model_dir', type=str, default=None) + parser.add_argument('--quant_medusa_head', + default=False, + action='store_true', + help="whether to quantize the weights of medusa heads") + args = parser.parse_args() if args.model_dir is not None: @@ -108,7 +119,13 @@ tp_size=args.tp_size, pp_size=args.pp_size, seed=args.seed, - tokenizer_max_seq_length=args.tokenizer_max_seq_length) + tokenizer_max_seq_length=args.tokenizer_max_seq_length, + num_medusa_heads=args.num_medusa_heads, + num_medusa_layers=args.num_medusa_layers, + max_draft_len=args.max_draft_len, + medusa_hidden_act=args.medusa_hidden_act, + medusa_model_dir=args.medusa_model_dir, + quant_medusa_head=args.quant_medusa_head) elif args.nemo_ckpt_path is not None: quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path, decoder_type=args.decoder_type, diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index b904ad81d..98ab003b8 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git 
a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index f1d4accf8..80a4f7c70 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 09ee007be..0dc21bb2d 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 90d3ff974..6d8673322 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 git+https://github.com/google-deepmind/recurrentgemma.git flax>=0.8.2 jax~=0.4.23 diff --git a/examples/run.py b/examples/run.py index bf802cb9b..eb1313e57 100644 --- a/examples/run.py +++ b/examples/run.py @@ -247,7 +247,7 @@ def main(args): model_name, model_version = read_model_name( args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None: + if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: logger.warning( "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." ) diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index afc5cb1e1..a2bbe457e 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index f8e55c97d..814a8c824 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index a5f91efea..b2a1ae5bc 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.11.0.dev2024061800 +tensorrt_llm==0.11.0.dev2024062500 tiktoken datasets kaldialign diff --git a/requirements.txt b/requirements.txt index 259a73da8..d5d157897 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,12 +16,13 @@ pandas h5py==3.10.0 StrEnum sentencepiece>=0.1.99 -tensorrt==10.0.1 -# https://github.com/pytorch/pytorch/blob/v2.3.0/version.txt uses 2.3.0a0. -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 uses 2.3.0a0. -torch>=2.3.0a,<=2.3.0 +tensorrt==10.1.0 +# https://github.com/pytorch/pytorch/blob/v2.3.1/version.txt uses 2.3.0a0. +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 uses 2.4.0a0. 
+torch>=2.3.0a0,<=2.4.0a0 nvidia-modelopt~=0.11,<0.12 transformers>=4.38.2 +pillow==10.2.0 wheel optimum evaluate diff --git a/tensorrt_llm/auto_parallel/parallelization.py b/tensorrt_llm/auto_parallel/parallelization.py index 6883616db..44f2cd59d 100644 --- a/tensorrt_llm/auto_parallel/parallelization.py +++ b/tensorrt_llm/auto_parallel/parallelization.py @@ -1146,8 +1146,8 @@ def shard_gpt_attention(self, context: ShardContext): num_kv_heads = plugin_info.pfc_as_ndarray["num_kv_heads"].copy() tp_size = plugin_info.pfc_as_ndarray["tp_size"].copy() tp_rank = plugin_info.pfc_as_ndarray["tp_rank"].copy() - num_kv_heads = num_kv_heads // kv_partition - num_heads = num_heads // partition + num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1) + num_heads = np.maximum(num_heads // partition, 1) tp_size[0] = partition tp_rank[0] = index diff --git a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py index 42a2eeb8a..419308c7c 100644 --- a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py +++ b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_node.py @@ -15,6 +15,11 @@ def __init__(self, layer): layer.name) layer.to_base_class() + def _collect_strategies(self, device_mesh): + raise NotImplementedError( + f"Auto parallel does not support {self.plugin_type} plugin right now." + ) + def _default_strategy(self, device_mesh): strategies_vector = StrategiesVector(self) dim_partition_dict_mapping = {} diff --git a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py index 86b334da9..a818e4153 100644 --- a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py +++ b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py @@ -1,5 +1,6 @@ from enum import Enum, auto +import numpy as np import torch from tensorrt_llm.functional import PositionEmbeddingType @@ -369,8 +370,8 @@ def _profile_sharding_cost(self, strategy, device_mesh): num_kv_heads = self.plugin_info.pfc_as_ndarray["num_kv_heads"].copy() tp_size = self.plugin_info.pfc_as_ndarray["tp_size"].copy() tp_rank = self.plugin_info.pfc_as_ndarray["tp_rank"].copy() - num_kv_heads = num_kv_heads // kv_partition - num_heads = num_heads // partition + num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1) + num_heads = np.maximum(num_heads // partition, 1) tp_size[0] = partition tp_rank[0] = 0 diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index 1c614c445..4384894f1 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -30,10 +30,9 @@ from ..logger import logger from ..lora_manager import LoraConfig, LoraManager from ..models import MODEL_MAP, PretrainedConfig -from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, QuantConfig, +from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, SpeculativeDecodingMode) from ..plugin import PluginConfig, add_plugin_argument -from ..quantization import QuantAlgo def parse_arguments(): @@ -133,10 +132,6 @@ def parse_arguments(): type=str, default=None, choices=['float16', 'float32']) - parser.add_argument('--weight_only_precision', - type=str, - default=None, - choices=['int8', 'int4']) parser.add_argument('--weight_sparsity', default=False, action='store_true') parser.add_argument( '--max_draft_len', @@ -270,14 +265,6 @@ def build_model(build_config: BuildConfig, if logits_dtype is not None: model_config.logits_dtype = 
logits_dtype - weight_only_precision = kwargs.get('weight_only_precision', None) - if not model_config.quant_mode.has_any_quant( - ) and weight_only_precision is not None: - if weight_only_precision == 'int4': - model_config.quantization = QuantConfig(QuantAlgo.W4A16) - else: - model_config.quantization = QuantConfig(QuantAlgo.W8A16) - architecture = model_config.architecture assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \ "StreamingLLM is only supported in the llama model." @@ -420,7 +407,6 @@ def main(): kwargs = { 'logits_dtype': args.logits_dtype, 'use_fused_mlp': args.use_fused_mlp, - 'weight_only_precision': args.weight_only_precision, 'tp_size': args.tp_size, 'pp_size': args.pp_size, 'lora_dir': args.lora_dir, diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index b09e1d505..aa5cb7a2d 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -40,7 +40,6 @@ from .mpt.model import MPTForCausalLM, MPTModel from .opt.model import OPTForCausalLM, OPTModel from .phi3.model import Phi3ForCausalLM, Phi3Model -from .phi3.phi3small.model import Phi3SmallForCausalLM, Phi3SmallModel from .phi.model import PhiForCausalLM, PhiModel from .qwen.model import QWenForCausalLM from .recurrentgemma.model import RecurrentGemmaForCausalLM @@ -70,10 +69,8 @@ 'GPTNeoXForCausalLM', 'PhiModel', 'Phi3Model', - 'Phi3SmallModel', 'PhiForCausalLM', 'Phi3ForCausalLM', - 'Phi3SmallForCausalLM', 'ChatGLMForCausalLM', 'ChatGLMModel', 'BaichuanForCausalLM', @@ -103,7 +100,7 @@ 'FalconForCausalLM': FalconForCausalLM, 'PhiForCausalLM': PhiForCausalLM, 'Phi3ForCausalLM': Phi3ForCausalLM, - 'Phi3SmallForCausalLM': Phi3SmallForCausalLM, + 'Phi3SmallForCausalLM': Phi3ForCausalLM, 'MambaForCausalLM': MambaForCausalLM, 'GPTNeoXForCausalLM': GPTNeoXForCausalLM, 'GPTJForCausalLM': GPTJForCausalLM, diff --git a/tensorrt_llm/models/gemma/model.py b/tensorrt_llm/models/gemma/model.py index 4c4753547..2e9d80459 100644 --- a/tensorrt_llm/models/gemma/model.py +++ b/tensorrt_llm/models/gemma/model.py @@ -292,6 +292,7 @@ def from_hugging_face(cls, return tllm_llama def check_config(self, config): + config.set_if_not_exist("share_embedding_table", True) config.set_if_not_exist('use_parallel_embedding', False) config.set_if_not_exist('embedding_sharding_dim', 0) config.set_if_not_exist('mlp_bias', False) diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index ab5a81b62..5f18ff321 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -62,6 +62,80 @@ def split_num_tokens_range(max_num_tokens): [split_point[-1], max_num_tokens, max_num_tokens]) return num_tokens_ranges + @staticmethod + def get_profiles_ranges( + *, + max_batch_size, + max_beam_width, + max_input_len, + max_num_tokens, + max_draft_len, + opt_batch_size, + opt_num_tokens, + enable_ctx_gen_opt_profiles, + multiple_profiles, + ): + default_range = GenerationMixin.default_range + if opt_batch_size: + bb_range_cxt = [1, opt_batch_size, max_batch_size] + bb_range_gen = [ + 1, opt_batch_size * max_beam_width, + max_batch_size * max_beam_width + ] + else: + bb_range_cxt = default_range(max_batch_size) + bb_range_gen = default_range(max_batch_size * max_beam_width) + tokens_per_engine_step = max_draft_len + 1 + tokens_per_engine_step_range = [ + 1, tokens_per_engine_step, tokens_per_engine_step + ] + bbd_range_ctx = [ + bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1) + for i in 
range(len(bb_range_cxt)) + ] + bbd_range_gen = [ + bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1) + for i in range(len(bb_range_gen)) + ] + inlen_range_cxt = default_range(max_input_len) + inlen_range_gen = [1, 1, tokens_per_engine_step] + if enable_ctx_gen_opt_profiles: + num_profiles = 2 + bb_range = [bb_range_cxt, bb_range_gen] + bbd_range = [bbd_range_ctx, bbd_range_gen] + inlen_range = [inlen_range_cxt, inlen_range_gen] + position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]] + num_tokens_range_ctx = default_range(max_batch_size * max_input_len) + # Draft tokens cannot be combined with beam search + num_tokens_range_gen = default_range( + max_batch_size * max(tokens_per_engine_step, max_beam_width)) + num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] + else: + if multiple_profiles: + num_tokens_range = GenerationMixin.split_num_tokens_range( + max_num_tokens) + else: + if opt_num_tokens is None: + opt_num_tokens = min(max_num_tokens, + max_batch_size * max_beam_width) + num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] + num_profiles = len(num_tokens_range) + bb_range = [bb_range_gen] * num_profiles + bbd_range = [bbd_range_gen] * num_profiles + inlen_range = [[1, 1, max_input_len]] * num_profiles + position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles + tokens_per_engine_step_range = [tokens_per_engine_step_range + ] * num_profiles + ranges = { + 'bb_range': bb_range, + 'bbd_range': bbd_range, + 'inlen_range': inlen_range, + 'position_ids_inlen_range': position_ids_inlen_range, + 'num_tokens_range': num_tokens_range, + 'tokens_per_engine_step_range': tokens_per_engine_step_range, + } + return num_profiles, ranges + def prepare_attention_inputs(self, *, max_batch_size, @@ -343,61 +417,26 @@ def prepare_basic_inputs( streamingllm: bool = False, opt_batch_size=None): - default_range = GenerationMixin.default_range - tokens_per_engine_step = max_draft_len + 1 - tokens_per_engine_step_range = [ - 1, tokens_per_engine_step, tokens_per_engine_step - ] - if opt_batch_size: - bb_range_cxt = [1, opt_batch_size, max_batch_size] - bb_range_gen = [ - 1, opt_batch_size * max_beam_width, - max_batch_size * max_beam_width - ] - else: - bb_range_cxt = default_range(max_batch_size) - bb_range_gen = default_range(max_batch_size * max_beam_width) - bbd_range_ctx = [ - bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1) - for i in range(len(bb_range_cxt)) - ] - bbd_range_gen = [ - bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1) - for i in range(len(bb_range_gen)) - ] - inlen_range_cxt = default_range(max_input_len) - inlen_range_gen = [1, 1, tokens_per_engine_step] - enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding, paged_kv_cache) - if enable_ctx_gen_opt_profiles: - num_profiles = 2 - bb_range = [bb_range_cxt, bb_range_gen] - bbd_range = [bbd_range_ctx, bbd_range_gen] - inlen_range = [inlen_range_cxt, inlen_range_gen] - position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]] - num_tokens_range_ctx = default_range(max_batch_size * max_input_len) - # Draft tokens cannot be combined with beam search - num_tokens_range_gen = default_range( - max_batch_size * max(tokens_per_engine_step, max_beam_width)) - num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] - else: - if multiple_profiles: - num_tokens_range = GenerationMixin.split_num_tokens_range( - max_num_tokens) - else: - if opt_num_tokens is None: - opt_num_tokens = min(max_num_tokens, - 
max_batch_size * max_beam_width) - num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] - num_profiles = len(num_tokens_range) - bb_range = [bb_range_gen] * num_profiles - bbd_range = [bbd_range_gen] * num_profiles - inlen_range = [[1, 1, max_input_len]] * num_profiles - position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles - tokens_per_engine_step_range = [tokens_per_engine_step_range - ] * num_profiles + + num_profiles, ranges = GenerationMixin.get_profiles_ranges( + max_batch_size=max_batch_size, + max_beam_width=max_beam_width, + max_input_len=max_input_len, + max_num_tokens=max_num_tokens, + max_draft_len=max_draft_len, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens, + enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, + multiple_profiles=multiple_profiles) + bb_range = ranges['bb_range'] + bbd_range = ranges['bbd_range'] + inlen_range = ranges['inlen_range'] + num_tokens_range = ranges['num_tokens_range'] + position_ids_inlen_range = ranges['position_ids_inlen_range'] + tokens_per_engine_step_range = ranges['tokens_per_engine_step_range'] position_ids_num_tokens_range = num_tokens_range input_ids = None @@ -597,12 +636,13 @@ def prepare_basic_inputs( spec_decoding_params = None # Use positional offsets and packed mask only when not in SpS spec decoding if speculative_decoding_draft_tokens_external == False and max_draft_len > 0: + tokens_per_engine_step = max_draft_len + 1 # 32 bits packed mask aligned. num_packed_masks = (tokens_per_engine_step + 32 - 1) // 32 packed_mask_len_range = [[0, 1, num_packed_masks]] * num_profiles # total number of spec decoding tokens for all sequences (sequence length can be variable). num_gen_tokens_range = [ - default_range( + GenerationMixin.default_range( max_batch_size * max_beam_width * tokens_per_engine_step, min_range=0) ] * num_profiles diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index 7d6209040..5f243b926 100644 --- a/tensorrt_llm/models/llama/convert.py +++ b/tensorrt_llm/models/llama/convert.py @@ -1535,6 +1535,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig): moe_config = config.moe + kv_tp_size = None + kv_tp_rank = None + if config.num_key_value_heads < mapping.tp_size: + kv_tp_size = config.num_key_value_heads + kv_tp_rank = mapping.tp_rank * kv_tp_size // mapping.tp_size + model_prefix = "model." 
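# Editorial note (illustrative numbers, not part of the patch): the kv_tp_size /
# kv_tp_rank computed above handle GQA checkpoints that have fewer KV heads than
# TP ranks. For example, num_key_value_heads=8 under tp_size=16 gives kv_tp_size=8
# and kv_tp_rank = tp_rank * 8 // 16 = tp_rank // 2, so each pair of ranks loads the
# same K/V shard instead of splitting a single head further.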
key_list = [ "embed_tokens.weight", # vocab_embedding @@ -1552,7 +1558,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig): torch_dtype = str_dtype_to_torch(dtype) - def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False): + def load(key, + tp_dim=-1, + no_prefix=0, + is_expert_weights=False, + tp_size=None, + tp_rank=None): if not no_prefix: key = model_prefix + key ptr_idx = safetensors_map[key] if key in safetensors_map else 0 @@ -1560,38 +1571,28 @@ def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False): if key not in safetensors_ptrs[ptr_idx].keys(): return None + tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key) + tensor_shape = tensor_slice.get_shape() if tp_dim == -1: - res = safetensors_ptrs[ptr_idx].get_tensor(key) - else: + res = tensor_slice[:] + elif tp_dim >= 0 and tp_dim < len(tensor_shape): if is_expert_weights: tp_size = mapping.moe_tp_size tp_rank = mapping.moe_tp_rank else: - tp_size = mapping.tp_size - tp_rank = mapping.tp_rank - tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key) - tensor_shape = tensor_slice.get_shape() - if len(tensor_shape) == 1: - if tp_dim == 0: - slice_width = tensor_shape[0] // tp_size - res = tensor_slice[slice_width * tp_rank:slice_width * - (tp_rank + 1)] - else: - res = tensor_slice[:] - else: - if tensor_shape[tp_dim] % tp_size != 0: - logger.error( - "Current weight shape is invalid for tp_size=" + - str(tp_size)) - slice_width = tensor_shape[tp_dim] // tp_size - if tp_dim == 0: - res = tensor_slice[slice_width * tp_rank:slice_width * - (tp_rank + 1), :] - elif tp_dim == 1: - res = tensor_slice[:, slice_width * tp_rank:slice_width * - (tp_rank + 1)] - else: - assert False, "Invalid TP dim" + tp_size = tp_size or mapping.tp_size + tp_rank = tp_rank or mapping.tp_rank + dim_size = tensor_shape[tp_dim] + if dim_size % tp_size != 0: + logger.error( + f"Current weight shape {tensor_shape} is invalid at dimension {tp_dim} for TP size {tp_size}" + ) + indices = [slice(None)] * len(tensor_shape) + indices[tp_dim] = slice(dim_size * tp_rank // tp_size, + dim_size * (tp_rank + 1) // tp_size) + res = tensor_slice[indices] + else: + raise ValueError(f"Invalid TP dim: {tp_dim}") return res.to(torch_dtype).contiguous( ) if "block_sparse_moe.gate" not in key else res.to(torch.float32) @@ -1632,11 +1633,19 @@ def load_and_set(target, # Attention qkv_list = [] for comp in ["q", "k", "v"]: - weight_part = load(prefix + key_list[3] + comp + key_list[4], 0) + tp_size = kv_tp_size if comp != "q" else None + tp_rank = kv_tp_rank if comp != "q" else None + weight_part = load(prefix + key_list[3] + comp + key_list[4], + 0, + tp_size=tp_size, + tp_rank=tp_rank) qkv_list.append(weight_part) bias_part = load( (prefix + key_list[3] + comp + key_list[4]).replace( - "weight", "bias"), 0) + "weight", "bias"), + 0, + tp_size=tp_size, + tp_rank=tp_rank) if bias_part is not None: qkv_list.append(bias_part) if len(qkv_list) == 3: diff --git a/tensorrt_llm/models/mamba/model.py b/tensorrt_llm/models/mamba/model.py index 2f835dd41..32b2e1cc6 100644 --- a/tensorrt_llm/models/mamba/model.py +++ b/tensorrt_llm/models/mamba/model.py @@ -23,6 +23,7 @@ gather_last_token_logits, shape, unsqueeze) from ...layers import Embedding, LayerNorm, Linear, Mamba, RmsNorm from ...module import Module, ModuleList +from ...plugin import current_all_reduce_helper from ..generation_mixin import GenerationMixin from ..modeling_utils import PretrainedConfig, PretrainedModel @@ -192,6 +193,7 @@ def forward(self, ssm_states, host_request_types, 
last_token_ids, + last_token_ids_for_logits, host_context_lengths, slot_mapping: Optional[Tensor] = None): hidden_states, present_convs, present_ssms = self.backbone( @@ -200,7 +202,7 @@ def forward(self, if not self.gather_context_logits: hidden_states = gather_last_token_logits( - hidden_states, last_token_ids, + hidden_states, last_token_ids_for_logits, default_net().plugin_config.remove_input_padding) lm_logits = self.lm_head(hidden_states) @@ -218,9 +220,9 @@ def prepare_inputs( max_batch_size, max_input_len, max_seq_len, + max_num_tokens, use_cache, max_beam_width: int = 1, - max_num_tokens: int = None, opt_num_tokens: int = None, opt_batch_size: int = 0, prompt_embedding_table_size: int = 0, @@ -235,56 +237,79 @@ def prepare_inputs( @return: a list contains values which can be fed into the self.forward() ''' assert speculative_decoding_draft_tokens_external == False, "Speculative decoding is not supported in Mamba" + assert max_beam_width == 1, "We don't support beam search for the Mamba model." + remove_input_padding = default_net().plugin_config.remove_input_padding + use_gemm_plugin = default_net().plugin_config.gemm_plugin + paged_state = default_net().plugin_config.paged_state + multiple_profiles = default_net().plugin_config.multiple_profiles use_mamba_conv1d_plugin = default_net( ).plugin_config.mamba_conv1d_plugin - batch_range = [GenerationMixin.default_range(max_batch_size)] + use_custom_all_reduce = default_net( + ).plugin_config.use_custom_all_reduce + self.gather_context_logits = gather_context_logits + mapping = self.config.mapping + + # basic inputs + enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( + True, use_gemm_plugin, remove_input_padding, paged_state) + + num_profiles, ranges = GenerationMixin.get_profiles_ranges( + max_batch_size=max_batch_size, + max_beam_width=max_beam_width, + max_input_len=max_input_len, + max_num_tokens=max_num_tokens, + max_draft_len=max_draft_len, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens, + enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, + multiple_profiles=multiple_profiles) + if remove_input_padding: assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding" - max_num_tokens = max( - max_input_len * max_batch_size, - max_beam_width * (max_draft_len + 1) * max_batch_size) - if opt_num_tokens is None: - opt_num_tokens = max_beam_width * (max_draft_len + - 1) * max_batch_size - num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] input_ids = Tensor(name='input_ids', dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ - ('num_tokens', num_tokens_range), + ('num_tokens', ranges['num_tokens_range']), ])) else: input_ids = Tensor(name='input_ids', dtype=trt.int32, shape=[-1, -1], dim_range=OrderedDict([ - ('batch_size', batch_range), - ('input_len', [[1, 1, max_input_len]]), + ('batch_size_beam_width', + ranges['bb_range']), + ('input_len', ranges['inlen_range']), ])) + if use_custom_all_reduce and mapping.tp_size > 1: + current_all_reduce_helper().set_workspace_tensor( + mapping, num_profiles) + + # recurrent inputs conv_states = [] ssm_states = [] if use_mamba_conv1d_plugin: conv_state_dim_range = OrderedDict([ - ('batch_size', batch_range), - ('kernel_size', [self.d_conv - 1]), - ('dim_size', [self.d_inner]), + ('batch_size', ranges['bb_range']), + ('kernel_size', [self.d_conv - 1] * num_profiles), + ('dim_size', [self.d_inner] * num_profiles), ]) else: conv_state_dim_range = OrderedDict([ - ('batch_size', batch_range), - ('dim_size', 
[self.d_inner]), - ('kernel_size', [self.d_conv - 1]), + ('batch_size', ranges['bb_range']), + ('dim_size', [self.d_inner] * num_profiles), + ('kernel_size', [self.d_conv - 1] * num_profiles), ]) ssm_state_dim_range = OrderedDict([ - ('batch_size', batch_range), - ('state_size', [self.d_state]), - ('dim_size', [self.d_inner]), + ('batch_size', ranges['bb_range']), + ('state_size', [self.d_state] * num_profiles), + ('dim_size', [self.d_inner] * num_profiles), ]) one_dim_range = OrderedDict([ - ('buffer_count', [1]), + ('buffer_count', [1] * num_profiles), ]) for i in range(self.config.num_hidden_layers): @@ -324,7 +349,7 @@ def prepare_inputs( name='host_request_types', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size', batch_range)]), + dim_range=OrderedDict([('batch_size', ranges['bb_range'])]), ) if use_mamba_conv1d_plugin and remove_input_padding: @@ -332,21 +357,22 @@ def prepare_inputs( name='host_context_lengths', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size', batch_range)]), + dim_range=OrderedDict([('batch_size', ranges['bb_range'])]), ) else: host_context_lengths = None - last_token_ids = None + last_token_ids = Tensor( + name='last_token_ids', + dtype=trt.int32, + shape=[-1], + dim_range=OrderedDict([ + ('batch_size', ranges['bbd_range']), + ]), + ) + last_token_ids_for_logits = None if not gather_context_logits: - last_token_ids = Tensor( - name='last_token_ids', - dtype=trt.int32, - shape=[-1], - dim_range=OrderedDict([ - ('batch_size', batch_range), - ]), - ) + last_token_ids_for_logits = last_token_ids return_dict = { 'input_ids': input_ids, @@ -354,6 +380,7 @@ def prepare_inputs( 'ssm_states': ssm_states, 'host_request_types': host_request_types, 'last_token_ids': last_token_ids, + 'last_token_ids_for_logits': last_token_ids_for_logits, 'host_context_lengths': host_context_lengths, } @@ -362,7 +389,7 @@ def prepare_inputs( name='slot_mapping', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size', batch_range)]), + dim_range=OrderedDict([('batch_size', ranges['bb_range'])]), ) return_dict['slot_mapping'] = slot_mapping diff --git a/tensorrt_llm/models/medusa/weight.py b/tensorrt_llm/models/medusa/weight.py index 251ea3edc..3173f2996 100644 --- a/tensorrt_llm/models/medusa/weight.py +++ b/tensorrt_llm/models/medusa/weight.py @@ -1,43 +1,80 @@ from pathlib import Path -import numpy as np import torch from tensorrt_llm import logger -from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import MedusaLM from tensorrt_llm.models.convert_utils import split +def get_tllm_linear_weight(weight, + prefix, + bias=None, + use_weight_only=False, + plugin_weight_only_quant_type=torch.int8, + postfix='weight'): + results = {} + if use_weight_only: + v = weight.t().contiguous().cpu() + processed_torch_weights, torch_weight_scales = \ + torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix( + v, plugin_weight_only_quant_type) + results[prefix + postfix] = processed_torch_weights + results[prefix + 'per_channel_scale'] = torch_weight_scales + else: + results[prefix + postfix] = weight.contiguous() + + if bias is not None: + results[prefix + 'bias'] = bias + + return results + + def load_medusa_hf(medusa_path: str, - trt_llm_medusa: MedusaLM, + num_medusa_heads: int, + num_medusa_layers: int, mapping=Mapping(), - dtype='float32'): + dtype='float32', + use_weight_only=False, + 
plugin_weight_only_quant_type=None): logger.info("Loading Medusa heads' weights ...") + is_ckpt_safetensors = False + ckpt_file = Path(medusa_path) / "medusa_lm_head.pt" - state_dict = torch.load(ckpt_file, map_location="cpu") + if not ckpt_file.exists(): + ckpt_file = Path(medusa_path) / "medusa_lm_head.safetensors" + is_ckpt_safetensors = True + + if is_ckpt_safetensors: + logger.info("Safetensors Found ...") + from safetensors.torch import load_file + state_dict = load_file(ckpt_file) + else: + state_dict = torch.load(ckpt_file, map_location="cpu") + torch_dtype = str_dtype_to_torch(dtype) - for h in range(trt_llm_medusa.num_medusa_heads): - for l in range(trt_llm_medusa.num_medusa_layers): - w = state_dict[f"{h}.{l}.linear.weight"].clone() - w = torch_to_numpy(w.to(torch_dtype).detach().cpu()) - trt_llm_medusa.medusa_heads[h].medusa_layers[ - l].linear.weight.value = np.ascontiguousarray( - split(w, mapping.tp_size, mapping.tp_rank)) - if trt_llm_medusa.medusa_heads[h].medusa_layers[ - l].linear.bias is not None: - # print(f"Setting bias for {h} {l}") - b = state_dict[f"{h}.{l}.linear.bias"].clone() - b = torch_to_numpy(b.to(torch_dtype).detach().cpu()) - trt_llm_medusa.medusa_heads[h].medusa_layers[ - l].linear.bias.value = np.ascontiguousarray( - np.split(b, mapping.tp_size, - axis=0)[mapping.tp_rank].copy()) - lm = state_dict[f"{h}.{trt_llm_medusa.num_medusa_layers}.weight"].clone( - ) # LM Head - lm = torch_to_numpy(lm.to(torch_dtype).detach().cpu()) - trt_llm_medusa.medusa_heads[ - h].lm_head.weight.value = np.ascontiguousarray( - split(lm, mapping.tp_size, mapping.tp_rank)) - return + weights = {} + + for h in range(num_medusa_heads): + for l in range(num_medusa_layers): + w = state_dict[f"{h}.{l}.linear.weight"].clone().to(torch_dtype) + + split_v = split(w, mapping.tp_size, mapping.tp_rank) + weights.update( + get_tllm_linear_weight( + split_v, f'medusa_heads.{h}.medusa_layers.{l}.linear.', + None, use_weight_only, plugin_weight_only_quant_type)) + + b = state_dict[f"{h}.{l}.linear.bias"].clone().to(torch_dtype) + + weights['medusa_heads.{}.medusa_layers.{}.linear.bias'.format( + h, l)] = split(b, mapping.tp_size, mapping.tp_rank) + + lm = state_dict[f"{h}.{num_medusa_layers}.weight"].clone().to( + torch_dtype) # LM Head + + weights['medusa_heads.{}.lm_head.weight'.format(h)] = split( + lm, mapping.tp_size, mapping.tp_rank) + + return weights diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index c042e7440..15f076d75 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -27,6 +27,10 @@ from ..module import Module, ModuleList from ..parameter import Parameter from ..quantization import QuantMode +from ..quantization.layers import (WeightOnlyGroupwiseQuantLinear, + WeightOnlyGroupwiseQuantRowLinear, + WeightOnlyQuantLinear, + WeightOnlyQuantRowLinear) from ..quantization.mode import W8A8_SQ_PLUGIN_LIST, QuantAlgo from ..top_model_mixin import TopModelMixin from .convert_utils import weight_only_quantize_dict @@ -841,7 +845,8 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel: continue qkv_params = get_init_params(layer.qkv, ColumnLinear) qkv_params["bias"] = qkv_params["bias"] is not None - qkv_params["strict_dtype"] = qkv_params["strict_dtype"] is not None + qkv_params["strict_dtype"] = qkv_params.get( + "strict_dtype") is not None q = ColumnLinear( **{ **qkv_params, @@ -866,20 +871,34 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel: q = quantize(q, 
model.config.quantization) k = quantize(k, model.config.quantization) v = quantize(v, model.config.quantization) + out_features = q.out_features + k.out_features + v.out_features + if isinstance(layer.qkv, ( + WeightOnlyQuantLinear, + WeightOnlyQuantRowLinear, + WeightOnlyGroupwiseQuantLinear, + WeightOnlyGroupwiseQuantRowLinear, + )): + out_dim = 1 + else: + out_dim = 0 if layer.qkv.weight.is_inited(): qkv_weight = layer.qkv.weight.raw_value weights = np.split(qkv_weight, [ - q.out_features, - q.out_features + k.out_features, - ]) + qkv_weight.shape[out_dim] * q.out_features // out_features, + qkv_weight.shape[out_dim] * + (q.out_features + k.out_features) // out_features, + ], + axis=out_dim) for gemm, weight in zip([q, k, v], weights): gemm.weight.value = weight if layer.qkv.bias is not None and layer.qkv.bias.is_inited(): qkv_bias = layer.qkv.bias.raw_value biases = np.split(qkv_bias, [ - q.out_features, - q.out_features + k.out_features, - ]) + qkv_bias.shape[out_dim] * q.out_features // out_features, + qkv_bias.shape[out_dim] * + (q.out_features + k.out_features) // out_features, + ], + axis=out_dim) for gemm, bias in zip([q, k, v], biases): gemm.bias.value = bias for name, parameter in layer.qkv._parameters.items(): diff --git a/tensorrt_llm/models/phi3/convert.py b/tensorrt_llm/models/phi3/convert.py index dcc7e2686..da2940178 100644 --- a/tensorrt_llm/models/phi3/convert.py +++ b/tensorrt_llm/models/phi3/convert.py @@ -1,9 +1,12 @@ import torch +from tensorrt_llm.quantization import QuantAlgo + from ..._utils import str_dtype_to_torch +from .split_weights import shuffle_qkv_weights, split_weights_tp -def convert_hf_weights(hf_model, dtype, **kwargs): +def convert_hf_weights(hf_model, dtype, config, small_variant, args, rank): torch_dtype = str_dtype_to_torch(dtype) hf_state_dict = hf_model.state_dict() weights = {} @@ -15,13 +18,16 @@ def convert_hf_weights(hf_model, dtype, **kwargs): key = key.replace("model.layers.", "transformer.layers.") #Attention key = key.replace("self_attn.", "attention.") + key = key.replace("query_key_value.", "qkv.") # small key = key.replace("Wqkv.weight", "qkv.weight") key = key.replace("qkv_proj.", "qkv.") #128k #MLP key = key.replace("mlp.fc1.", "mlp.fc.") key = key.replace("mlp.fc2.", "mlp.proj.") key = key.replace("mlp.gate_up_proj.", "mlp.fc.") - key = key.replace("mlp.up_proj.", "mlp.gate.") #128k + key = key.replace( + "mlp.up_proj.", + "mlp.fc." if small_variant else "mlp.gate.") #128k key = key.replace("mlp.down_proj.", "mlp.proj.") #128k key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k key = key.replace("o_proj.", "dense.") #128k @@ -53,27 +59,67 @@ def convert_hf_weights(hf_model, dtype, **kwargs): key = key.replace("q_proj.weight", "qkv.weight") elif "k_proj" in key or "v_proj" in key: continue + weights[key] = value.to(torch_dtype).cpu() + if small_variant: + weights['lm_head.weight'] = weights[ + 'transformer.vocab_embedding.weight'].clone() + + # Transform QKV weights from custom Phi3Small format to TRT-LLM format + for key, value in weights.items(): + if "qkv." 
in key: + weights[key] = shuffle_qkv_weights(weights[key], config) + + weights = split_weights_tp(config, weights, args, rank, torch_dtype) + return weights -def convert_hf_config(hf_config, dtype, **kwargs): +def convert_small_hf_config(hf_config): + return { + 'architecture': "Phi3SmallForCausalLM", + 'rotary_base': hf_config.rope_embedding_base, + 'gegelu_limit': hf_config.gegelu_limit, + 'mup_attn_multiplier': hf_config.mup_attn_multiplier, + 'mup_embedding_multiplier': hf_config.mup_embedding_multiplier, + 'mup_use_scaling': hf_config.mup_use_scaling, + 'mup_width_multiplier': hf_config.mup_width_multiplier, + 'blocksparse_block_size': hf_config.blocksparse_block_size, + 'blocksparse_homo_head_pattern': + hf_config.blocksparse_homo_head_pattern, + 'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks, + 'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride, + 'dense_attention_every_n_layers': + hf_config.dense_attention_every_n_layers, + } + + +def convert_hf_config(hf_config, dtype, args): config = { 'architecture': "Phi3ForCausalLM", 'dtype': dtype, 'num_hidden_layers': hf_config.num_hidden_layers, 'num_attention_heads': hf_config.num_attention_heads, 'num_key_value_heads': hf_config.num_key_value_heads, - 'rope_theta': hf_config.rope_theta, 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, 'max_position_embeddings': hf_config.max_position_embeddings, 'hidden_act': hf_config.hidden_act, 'share_embedding_table': False, - 'norm_epsilon': hf_config.rms_norm_eps, } + + small_variant = hf_config.architectures[0] == "Phi3SmallForCausalLM" + if small_variant: + config.update(convert_small_hf_config(hf_config)) + else: + config.update({ + 'rotary_base': hf_config.rope_theta, + 'norm_epsilon': hf_config.rms_norm_eps, + }) + + # Long-context variants if hf_config.max_position_embeddings >= 128000: config.update({ 'original_max_position_embeddings': @@ -83,6 +129,31 @@ def convert_hf_config(hf_config, dtype, **kwargs): 'longrope_scaling_long_factors': hf_config.rope_scaling["long_factor"] }) + + if small_variant: + config.update({ + 'longrope_long_mscale': + hf_config.rope_scaling["long_mscale"], + 'longrope_short_mscale': + hf_config.rope_scaling["short_mscale"] + }) + if config["hidden_act"] == "silu": config["hidden_act"] = "swiglu" + + # Tensor parallelism and weight-only quantization + if args is not None: + config.update({ + 'mapping': { + 'world_size': args.tp_size * args.pp_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + } + }) + + if args.use_weight_only and args.weight_only_precision == 'int8': + config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}}) + elif args.use_weight_only and args.weight_only_precision == 'int4': + config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}}) + return config diff --git a/tensorrt_llm/models/phi3/model.py b/tensorrt_llm/models/phi3/model.py index 451791169..683ad2a5a 100644 --- a/tensorrt_llm/models/phi3/model.py +++ b/tensorrt_llm/models/phi3/model.py @@ -1,15 +1,20 @@ +import json +import os +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Optional import numpy as np +import safetensors from transformers import AutoModelForCausalLM from ..._utils import pad_vocab_size from ...functional import PositionEmbeddingType, Tensor -from ...layers import (MLP, Attention, AttentionMaskType, Embedding, - ParallelLMHead, RmsNorm) +from ...layers import (MLP, Attention, 
AttentionMaskType, BlockSparseAttnParams, + Embedding, LayerNorm, ParallelLMHead, RmsNorm) from ...module import Module from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, - PretrainedConfig, save_checkpoint) + PretrainedConfig) from .convert import convert_hf_config, convert_hf_weights @@ -22,28 +27,65 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): tp_group = config.mapping.tp_group tp_size = config.mapping.tp_size - self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) - self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) + attention_mask_type = AttentionMaskType.causal + block_sparse_attn_params = BlockSparseAttnParams() + q_scaling = 1.0 + self.gegelu_limit = None + + self.small_variant = config.architecture == "Phi3SmallForCausalLM" + if self.small_variant: + self.gegelu_limit = config.gegelu_limit + + # MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size)) + # We achieve this using q_scaling = sqrt(attention_head_size) + hidden_size = config.hidden_size + num_attention_heads = config.num_attention_heads + attention_head_size = hidden_size / num_attention_heads + q_scaling = attention_head_size**.5 + + block_sparse = ( + (layer_idx + 1) % config.dense_attention_every_n_layers) != 0 + attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal + + block_sparse_attn_params = BlockSparseAttnParams( + config.blocksparse_block_size, + config.blocksparse_homo_head_pattern, + config.blocksparse_num_local_blocks, + config.blocksparse_vertical_stride) + + self.input_layernorm = LayerNorm( + normalized_shape=config.hidden_size, dtype=config.dtype) + self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size, + dtype=config.dtype) + else: + self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) layers_range = config.mapping.pp_layers(config.num_hidden_layers) local_layer_idx = layer_idx - layers_range[0] position_embedding_type = PositionEmbeddingType.rope_gpt_neox - rope_scaling_short_factors = 1.0 - rope_scaling_long_factors = 1.0 + rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0 + rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0 original_max_position_embeddings = config.max_position_embeddings + if hasattr(config, "longrope_scaling_short_factors"): rope_scaling_short_factors = np.asarray( config.longrope_scaling_short_factors).astype(np.float32) rope_scaling_long_factors = np.asarray( config.longrope_scaling_long_factors).astype(np.float32) + original_max_position_embeddings = config.original_max_position_embeddings position_embedding_type = PositionEmbeddingType.long_rope + if self.small_variant: + rope_scaling_short_mscale = config.longrope_short_mscale + rope_scaling_long_mscale = config.longrope_long_mscale + self.attention = Attention( local_layer_idx=local_layer_idx, hidden_size=config.hidden_size, @@ -53,15 +95,18 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): rotary_embedding_base=config.rotary_base, max_position_embeddings=config.max_position_embeddings, dtype=config.dtype, - attention_mask_type=AttentionMaskType.causal, - bias=False, + attention_mask_type=attention_mask_type, + bias=self.small_variant, + 
q_scaling=q_scaling, tp_group=tp_group, tp_size=tp_size, quant_mode=config.quant_mode, rope_scaling_short_factors=rope_scaling_short_factors, rope_scaling_long_factors=rope_scaling_long_factors, + rope_scaling_short_mscale=rope_scaling_short_mscale, + rope_scaling_long_mscale=rope_scaling_long_mscale, original_max_position_embeddings=original_max_position_embeddings, - ) + block_sparse_params=block_sparse_attn_params) self.mlp = MLP(hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, @@ -70,7 +115,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): tp_group=tp_group, tp_size=tp_size, quant_mode=config.quant_mode, - bias=False) + bias=self.small_variant) def forward( self, @@ -88,7 +133,7 @@ def forward( use_cache=use_cache, kv_cache_params=kv_cache_params, attention_params=attention_params, - norm_before_bmm1=True, + norm_before_bmm1=not self.small_variant, ) if use_cache: @@ -96,7 +141,8 @@ def forward( post_attention_input = hidden_states + attention_output post_attention_output = self.post_layernorm(post_attention_input) - feed_forward_hidden_states = self.mlp(post_attention_output, ) + feed_forward_hidden_states = self.mlp(post_attention_output, + gegelu_limit=self.gegelu_limit) hidden_states = post_attention_input + feed_forward_hidden_states if use_cache: return (hidden_states, presents) @@ -112,9 +158,15 @@ def __init__(self, config: PretrainedConfig): dtype=config.dtype) self.layers = DecoderLayerList(Phi3DecoderLayer, config) - self.ln_f = RmsNorm(normalized_shape=config.hidden_size, - eps=config.norm_epsilon, - dtype=config.dtype) + self.small_variant = config.architecture == "Phi3SmallForCausalLM" + if self.small_variant: + self.ln_f = LayerNorm(normalized_shape=config.hidden_size, + dtype=config.dtype) + self.mup_embedding_multiplier = config.mup_embedding_multiplier + else: + self.ln_f = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) def forward( self, @@ -132,6 +184,9 @@ def forward( ] if prompt_embedding_table is not None else [] hidden_states = self.vocab_embedding(input_ids, *args) + if self.small_variant and self.mup_embedding_multiplier > 0.0: + hidden_states = hidden_states * self.mup_embedding_multiplier + hidden_states = self.layers( hidden_states, use_cache=use_cache, @@ -152,7 +207,6 @@ def forward( class Phi3ForCausalLM(DecoderModelForCausalLM): def __init__(self, config: PretrainedConfig): - self.check_config(config) transformer = Phi3Model(config) vocab_size_padded = pad_vocab_size(config.vocab_size, config.mapping.tp_size) @@ -167,25 +221,48 @@ def __init__(self, config: PretrainedConfig): super().__init__(config, transformer, lm_head) - def check_config(self, config): - config.set_if_not_exist('rotary_base', 10000.0) - @classmethod def convert_hf_checkpoint(cls, hf_model_dir: str, dtype: Optional[str] = "float16", output_dir: Optional[str] = None, - **kwargs): + args=None): ''' Convert Huggingface checkpoint to TRT-LLM checkpoint ''' + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_dir, torch_dtype="auto", trust_remote_code=True) - config = convert_hf_config(hf_model.config, dtype=dtype, **kwargs) - weights = convert_hf_weights(hf_model, dtype=dtype, **kwargs) + config = convert_hf_config(hf_model.config, dtype, args) + with open(os.path.join(output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + + small_variant = config['architecture'] == "Phi3SmallForCausalLM" - if output_dir: - save_checkpoint(output_dir, config=config, weights=weights) + def 
covert_and_save(rank): + weights = convert_hf_weights(hf_model, dtype, config, small_variant, + args, rank) + safetensors.torch.save_file( + weights, os.path.join(output_dir, f'rank{rank}.safetensors')) - return {"weights": weights, "config": config} + world_size = args.tp_size * args.pp_size + if args.workers == 1: + for rank in range(world_size): + covert_and_save(rank) + else: + with ThreadPoolExecutor(max_workers=args.workers) as p: + futures = [ + p.submit(covert_and_save, rank) + for rank in range(world_size) + ] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." diff --git a/tensorrt_llm/models/phi3/phi3small/__init__.py b/tensorrt_llm/models/phi3/phi3small/__init__.py deleted file mode 100644 index 71bf6d298..000000000 --- a/tensorrt_llm/models/phi3/phi3small/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tensorrt_llm/models/phi3/phi3small/model.py b/tensorrt_llm/models/phi3/phi3small/model.py deleted file mode 100644 index 2d0ff7d79..000000000 --- a/tensorrt_llm/models/phi3/phi3small/model.py +++ /dev/null @@ -1,257 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
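
The unified Phi3DecoderLayer above derives two quantities for the Phi-3-small variant: a MuP-compatible q_scaling and a per-layer choice between dense and block-sparse attention. The following is a minimal Python sketch of just that arithmetic; all numbers are illustrative placeholders, not values read from a real checkpoint.

# Sketch of the derived attention parameters used in the small-variant branch
# above; the config values below are illustrative placeholders only.
hidden_size = 4096
num_attention_heads = 32
dense_attention_every_n_layers = 2
num_hidden_layers = 8

# MuP normalizes attention scores by attention_head_size instead of
# sqrt(attention_head_size); per the comment in the diff, supplying
# q_scaling = sqrt(attention_head_size) on top of the kernel's default
# 1/sqrt(attention_head_size) factor yields the desired 1/attention_head_size.
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size ** 0.5

# Every dense_attention_every_n_layers-th layer (1-based) stays dense;
# the rest use the block-sparse attention mask.
for layer_idx in range(num_hidden_layers):
    block_sparse = ((layer_idx + 1) % dense_attention_every_n_layers) != 0
    print(f"layer {layer_idx}: {'blocksparse' if block_sparse else 'causal'}")
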
-import json -import os -import traceback -from concurrent.futures import ThreadPoolExecutor, as_completed - -import numpy as np -import safetensors -from transformers import AutoModelForCausalLM - -from ...._utils import pad_vocab_size -from ....functional import PositionEmbeddingType, Tensor -from ....layers import (MLP, Attention, AttentionMaskType, - BlockSparseAttnParams, Embedding, LayerNorm, - ParallelLMHead) -from ....module import Module -from ...modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, - PretrainedConfig) -from .convert import convert_hf_config, convert_hf_weights - - -class Phi3SmallDecoderLayer(Module): - - def __init__(self, config: PretrainedConfig, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - tp_group = config.mapping.tp_group - tp_size = config.mapping.tp_size - self.gegelu_limit = config.gegelu_limit - - self.input_layernorm = LayerNorm(normalized_shape=config.hidden_size, - dtype=config.dtype) - - # MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size)) - # We achieve this using q_scaling = sqrt(attention_head_size) - hidden_size = config.hidden_size - num_attention_heads = config.num_attention_heads - attention_head_size = hidden_size / num_attention_heads - q_scaling = attention_head_size**.5 - - block_sparse = ( - (layer_idx + 1) % config.dense_attention_every_n_layers) != 0 - attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal - - block_sparse_attn_params = BlockSparseAttnParams( - config.blocksparse_block_size, config.blocksparse_homo_head_pattern, - config.blocksparse_num_local_blocks, - config.blocksparse_vertical_stride) - - layers_range = config.mapping.pp_layers(config.num_hidden_layers) - local_layer_idx = layer_idx - layers_range[0] - - position_embedding_type = PositionEmbeddingType.rope_gpt_neox - original_max_position_embeddings = config.max_position_embeddings - - rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0 - rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0 - - if hasattr(config, "longrope_scaling_short_factors"): - rope_scaling_short_factors = np.asarray( - config.longrope_scaling_short_factors).astype(np.float32) - rope_scaling_long_factors = np.asarray( - config.longrope_scaling_long_factors).astype(np.float32) - rope_scaling_short_mscale = config.longrope_short_mscale - rope_scaling_long_mscale = config.longrope_long_mscale - - position_embedding_type = PositionEmbeddingType.long_rope - original_max_position_embeddings = config.original_max_position_embeddings - - self.attention = Attention( - local_layer_idx=local_layer_idx, - hidden_size=config.hidden_size, - num_attention_heads=config.num_attention_heads, - num_kv_heads=config.num_kv_heads, - position_embedding_type=position_embedding_type, - rotary_embedding_base=config.rotary_embedding_base, - max_position_embeddings=config.max_position_embeddings, - original_max_position_embeddings=original_max_position_embeddings, - dtype=config.dtype, - attention_mask_type=attention_mask_type, - bias=True, - q_scaling=q_scaling, - tp_group=tp_group, - tp_size=tp_size, - quant_mode=config.quant_mode, - rope_scaling_short_factors=rope_scaling_short_factors, - rope_scaling_long_factors=rope_scaling_long_factors, - rope_scaling_short_mscale=rope_scaling_short_mscale, - rope_scaling_long_mscale=rope_scaling_long_mscale, - block_sparse_params=block_sparse_attn_params) - - self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size, - 
dtype=config.dtype) - - self.mlp = MLP(hidden_size=config.hidden_size, - ffn_hidden_size=config.intermediate_size, - hidden_act=config.hidden_act, - dtype=config.dtype, - tp_group=tp_group, - tp_size=tp_size, - quant_mode=config.quant_mode) - - def forward( - self, - hidden_states: Tensor, - attention_mask=None, - use_cache=False, - kv_cache_params=None, - attention_params=None, - ): - residual = hidden_states - input_layernorm_output = self.input_layernorm(hidden_states) - - # Self attention - attention_output = self.attention( - input_layernorm_output, - attention_mask=attention_mask, - use_cache=use_cache, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - ) - - if use_cache: - attention_output, presents = attention_output - - hidden_states = residual + attention_output - - # Fully connected - residual = hidden_states - hidden_states = self.post_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states, gegelu_limit=self.gegelu_limit) - hidden_states = residual + hidden_states - - if use_cache: - return (hidden_states, presents) - return hidden_states - - -class Phi3SmallModel(Module): - - def __init__(self, config: PretrainedConfig): - super().__init__() - self.vocab_embedding = Embedding(num_embeddings=config.vocab_size, - embedding_dim=config.hidden_size, - dtype=config.dtype) - - self.layers = DecoderLayerList(Phi3SmallDecoderLayer, config) - self.ln_f = LayerNorm(normalized_shape=config.hidden_size, - dtype=config.dtype) - self.mup_embedding_multiplier = config.mup_embedding_multiplier - - def forward( - self, - input_ids: Tensor, - position_ids=None, - use_cache=False, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - prompt_embedding_table=None, - prompt_tasks=None, - prompt_vocab_size=None, - ): - args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size - ] if prompt_embedding_table is not None else [] - hidden_states = self.vocab_embedding(input_ids, *args) - - if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0: - hidden_states = hidden_states * self.mup_embedding_multiplier - - hidden_states = self.layers( - hidden_states, - use_cache=use_cache, - attention_mask=attention_mask, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - ) - if use_cache: - hidden_states, presents = hidden_states - - hidden_states = self.ln_f(hidden_states) - - if use_cache: - return (hidden_states, tuple(presents)) - return hidden_states - - -class Phi3SmallForCausalLM(DecoderModelForCausalLM): - - def __init__(self, config: PretrainedConfig): - transformer = Phi3SmallModel(config) - vocab_size_padded = pad_vocab_size(config.vocab_size, - config.mapping.tp_size) - - lm_head = ParallelLMHead(config.hidden_size, - vocab_size_padded, - bias=False, - dtype=config.dtype, - tp_group=config.mapping.tp_group, - tp_size=config.mapping.tp_size, - gather_output=True) - - super().__init__(config, transformer, lm_head) - - @classmethod - def convert_hf_checkpoint(cls, model_dir, dtype, output_dir, args=None): - ''' - Convert Huggingface checkpoint to TRT-LLM checkpoint - ''' - - hf_model = AutoModelForCausalLM.from_pretrained(model_dir, - torch_dtype="auto", - trust_remote_code=True) - - config = convert_hf_config(hf_model.config, dtype, args) - with open(os.path.join(output_dir, 'config.json'), 'w') as f: - json.dump(config, f, indent=4) - - def covert_and_save(rank): - weights = convert_hf_weights(hf_model, config, args, rank) - safetensors.torch.save_file( - weights, os.path.join(output_dir, 
f'rank{rank}.safetensors')) - - world_size = args.tp_size * args.pp_size - if args.workers == 1: - for rank in range(world_size): - covert_and_save(rank) - else: - with ThreadPoolExecutor(max_workers=args.workers) as p: - futures = [ - p.submit(covert_and_save, rank) - for rank in range(world_size) - ] - exceptions = [] - for future in as_completed(futures): - try: - future.result() - except Exception as e: - traceback.print_exc() - exceptions.append(e) - assert len( - exceptions - ) == 0, "Checkpoint conversion failed, please check error log." diff --git a/tensorrt_llm/models/phi3/phi3small/convert.py b/tensorrt_llm/models/phi3/split_weights.py similarity index 66% rename from tensorrt_llm/models/phi3/phi3small/convert.py rename to tensorrt_llm/models/phi3/split_weights.py index c11824876..72ccf0117 100644 --- a/tensorrt_llm/models/phi3/phi3small/convert.py +++ b/tensorrt_llm/models/phi3/split_weights.py @@ -15,10 +15,6 @@ import torch -from tensorrt_llm.quantization import QuantAlgo - -from ...._utils import str_dtype_to_torch - def shuffle_qkv_weights(weights, config): # Input weights are organized as @@ -29,8 +25,7 @@ def shuffle_qkv_weights(weights, config): # (q00, q01, ..., qnm), (k0, k1, .., kn), (v0, v1, .., vn) num_heads = config['num_attention_heads'] - num_kv_heads = config['num_kv_heads'] if 'num_kv_heads' in config.keys( - ) else config['num_key_value_heads'] + num_kv_heads = config['num_key_value_heads'] num_q_per_kv = num_heads // num_kv_heads hidden_size = config['hidden_size'] @@ -152,7 +147,7 @@ def get_tllm_linear_weight(weight, def split_weights_tp(config, weights, args, rank, dtype): num_heads = config['num_attention_heads'] - num_kv_heads = config['num_kv_heads'] + num_kv_heads = config['num_key_value_heads'] hidden_size = config['hidden_size'] mha_mode = num_heads == num_kv_heads @@ -228,96 +223,3 @@ def get_weight(weight, prefix, bias): dim=0) return weights - - -def convert_hf_weights(hf_model, config, args, rank): - torch_dtype = str_dtype_to_torch(args.dtype) - hf_state_dict = hf_model.state_dict() - weights = {} - - # replace key name - for key, value in hf_state_dict.items(): - # Decoder Layers - if "model.layers." in key: - key = key.replace("model.layers.", "transformer.layers.") - key = key.replace("self_attn.", "attention.") - key = key.replace("query_key_value.", "qkv.") - key = key.replace("mlp.up_proj.", "mlp.fc.") - key = key.replace("mlp.down_proj.", "mlp.proj.") - key = key.replace("post_attention_layernorm.", "post_layernorm.") - # Embedding - key = key.replace("model.embed_tokens.weight", - "transformer.vocab_embedding.weight") - # Final Layer norm - key = key.replace("model.final_layernorm.", "transformer.ln_f.") - weights[key] = value.to(torch_dtype).cpu() - - weights['lm_head.weight'] = weights[ - 'transformer.vocab_embedding.weight'].clone() - - # Transform QKV weights from custom Phi3Small format to TRT-LLM format - for key, value in weights.items(): - if "qkv." 
in key: - weights[key] = shuffle_qkv_weights(weights[key], config) - - weights = split_weights_tp(config, weights, args, rank, torch_dtype) - - return weights - - -def convert_hf_config(hf_config, dtype, args): - config = { - 'architecture': 'Phi3SmallForCausalLM', - 'dtype': dtype, - 'num_hidden_layers': hf_config.num_hidden_layers, - 'num_attention_heads': hf_config.num_attention_heads, - 'num_kv_heads': hf_config.num_key_value_heads, - 'rotary_embedding_base': hf_config.rope_embedding_base, - 'hidden_size': hf_config.hidden_size, - 'intermediate_size': hf_config.intermediate_size, - 'vocab_size': hf_config.vocab_size, - 'max_position_embeddings': hf_config.max_position_embeddings, - 'hidden_act': hf_config.hidden_act, - 'share_embedding_table': False, - 'gegelu_limit': hf_config.gegelu_limit, - 'mup_attn_multiplier': hf_config.mup_attn_multiplier, - 'mup_embedding_multiplier': hf_config.mup_embedding_multiplier, - 'mup_use_scaling': hf_config.mup_use_scaling, - 'mup_width_multiplier': hf_config.mup_width_multiplier, - 'blocksparse_block_size': hf_config.blocksparse_block_size, - 'blocksparse_homo_head_pattern': - hf_config.blocksparse_homo_head_pattern, - 'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks, - 'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride, - 'dense_attention_every_n_layers': - hf_config.dense_attention_every_n_layers, - } - - if args is not None: - config.update({ - 'mapping': { - 'world_size': args.tp_size * args.pp_size, - 'tp_size': args.tp_size, - 'pp_size': args.pp_size, - } - }) - - if args.use_weight_only and args.weight_only_precision == 'int8': - config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}}) - elif args.use_weight_only and args.weight_only_precision == 'int4': - config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}}) - - if hf_config.max_position_embeddings >= 128000: - config.update({ - 'original_max_position_embeddings': - hf_config.original_max_position_embeddings, - 'longrope_scaling_short_factors': - hf_config.rope_scaling["short_factor"], - 'longrope_scaling_long_factors': - hf_config.rope_scaling["long_factor"], - 'longrope_long_mscale': - hf_config.rope_scaling["long_mscale"], - 'longrope_short_mscale': - hf_config.rope_scaling["short_mscale"] - }) - return config diff --git a/tensorrt_llm/models/recurrentgemma/model.py b/tensorrt_llm/models/recurrentgemma/model.py index be11ed431..53a6c4502 100644 --- a/tensorrt_llm/models/recurrentgemma/model.py +++ b/tensorrt_llm/models/recurrentgemma/model.py @@ -398,9 +398,9 @@ def prepare_inputs( max_batch_size, max_input_len, max_seq_len, + max_num_tokens, use_cache, max_beam_width: int = 1, - max_num_tokens: int = None, opt_num_tokens: int = None, opt_batch_size: int = 0, prompt_embedding_table_size: int = 0, @@ -434,44 +434,20 @@ def prepare_inputs( self.gather_context_logits = gather_context_logits mapping = self.config.mapping - default_range = GenerationMixin.default_range - batch_range = default_range(max_batch_size) - bbd_range = [ - batch_range[i] * ((max_draft_len + 1) if i != 0 else 1) - for i in range(len(batch_range)) - ] - inlen_range_cxt = default_range(max_input_len) - inlen_range_gen = [1, 1, max_draft_len + 1] - # basic inputs enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles( use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding, paged_kv_cache) - if max_num_tokens is None: - max_num_tokens = max( - max_input_len * max_batch_size, - max_beam_width * (max_draft_len + 1) * max_batch_size) - if 
enable_ctx_gen_opt_profiles: - num_profiles = 2 - inlen_range = [inlen_range_cxt, inlen_range_gen] - num_tokens_range_ctx = default_range(max_num_tokens) - num_tokens_range_gen = default_range( - max_batch_size * (max_draft_len + 1) * max_beam_width) - num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen] - position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]] - else: - max_bs_x_max_bw = max_batch_size * max_beam_width - if opt_num_tokens is None: - opt_num_tokens = max_bs_x_max_bw - if multiple_profiles: - num_tokens_range = GenerationMixin.split_num_tokens_range( - max_num_tokens) - else: - num_tokens_range = [[1, opt_num_tokens, max_num_tokens]] - num_profiles = len(num_tokens_range) - inlen_range = [[1, 1, max_input_len]] * num_profiles - position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles - bb_range = [batch_range] * num_profiles + num_profiles, ranges = GenerationMixin.get_profiles_ranges( + max_batch_size=max_batch_size, + max_beam_width=max_beam_width, + max_input_len=max_input_len, + max_num_tokens=max_num_tokens, + max_draft_len=max_draft_len, + opt_batch_size=opt_batch_size, + opt_num_tokens=opt_num_tokens, + enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, + multiple_profiles=multiple_profiles) if remove_input_padding: assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding" @@ -479,14 +455,14 @@ def prepare_inputs( dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ - ('num_tokens', num_tokens_range), + ('num_tokens', ranges['num_tokens_range']), ])) position_ids = Tensor(name='position_ids', dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ ('position_ids_num_tokens_range', - num_tokens_range), + ranges['num_tokens_range']), ])) else: input_ids = Tensor(name='input_ids', @@ -494,16 +470,17 @@ def prepare_inputs( shape=[-1, -1], dim_range=OrderedDict([ ('batch_size_beam_width', - [batch_range] * num_profiles), - ('input_len', inlen_range), + ranges['bb_range']), + ('input_len', ranges['inlen_range']), ])) position_ids = Tensor(name='position_ids', dtype=trt.int32, shape=[-1, -1], dim_range=OrderedDict([ - ('batch_size_beam_width', bb_range), + ('batch_size_beam_width', + ranges['bb_range']), ('position_ids_inlen_range', - position_ids_inlen_range), + ranges['position_ids_inlen_range']), ])) if use_custom_all_reduce and mapping.tp_size > 1: current_all_reduce_helper().set_workspace_tensor( @@ -559,7 +536,8 @@ def prepare_inputs( name='host_request_types', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size_beam_width', bb_range)]), + dim_range=OrderedDict([('batch_size_beam_width', + ranges['bb_range'])]), ) last_token_ids = Tensor( @@ -567,7 +545,7 @@ def prepare_inputs( dtype=trt.int32, shape=[-1], dim_range=OrderedDict([ - ('batch_size_last_token_ids', [bbd_range] * num_profiles), + ('batch_size_last_token_ids', ranges['bbd_range']), ]), ) last_token_ids_for_logits = None @@ -581,7 +559,8 @@ def prepare_inputs( name='host_context_lengths', dtype=trt.int32, shape=[-1], - dim_range=OrderedDict([('batch_size_beam_width', bb_range)]), + dim_range=OrderedDict([('batch_size_beam_width', + ranges['bb_range'])]), ) else: host_context_lengths = None diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py index 591dba9d5..b7273b962 100644 --- a/tensorrt_llm/quantization/layers.py +++ b/tensorrt_llm/quantization/layers.py @@ -883,7 +883,6 @@ def forward(self, x, lora_runtime_params=None): assert lora_runtime_params is None or default_net( 
).plugin_config.lora_plugin == self.dtype - lora_hidden_state = x if lora_runtime_params is not None else None if default_net().strongly_typed: assert is_same_dtype( x.dtype, @@ -894,8 +893,13 @@ def forward(self, x, lora_runtime_params=None): activation_scaling_factor = cast(activation_scaling_factor, self.dtype) if x.dtype != trt.fp8: quantized_out = quantize(x, activation_scaling_factor, 'fp8') + lora_hidden_state = x if lora_runtime_params is not None else None else: quantized_out = x + # TODO: add fp8 LoRA support + lora_hidden_state = dequantize( + x, activation_scaling_factor, -1, + self.dtype) if lora_runtime_params is not None else None weights_scaling_factor = constant( self.weights_scaling_factor.raw_value.copy()) @@ -956,14 +960,18 @@ def forward(self, x, lora_runtime_params=None, reduce_fusion_params=None): assert lora_runtime_params is None or default_net( ).plugin_config.lora_plugin == self.dtype - lora_hidden_state = x if lora_runtime_params is not None else None activation_scaling_factor = constant( self.activation_scaling_factor.raw_value.copy()) activation_scaling_factor = cast(activation_scaling_factor, self.dtype) if x.dtype != trt.fp8: quantized_out = quantize(x, activation_scaling_factor, 'fp8') + lora_hidden_state = x if lora_runtime_params is not None else None else: quantized_out = x + # TODO: add fp8 LoRA support + lora_hidden_state = dequantize( + x, activation_scaling_factor, -1, + self.dtype) if lora_runtime_params is not None else None weights_scaling_factor = constant( self.weights_scaling_factor.raw_value.copy()) diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py index 5a33799dd..d4e271749 100644 --- a/tensorrt_llm/quantization/quantize.py +++ b/tensorrt_llm/quantization/quantize.py @@ -57,7 +57,10 @@ def quantize_layers( if preprocess_init_params is not None: preprocess_init_params(init_params, name, module) quant_layer = quant_cls(**init_params) - setattr(parent, module_name, quant_layer) + if parent is not None: + setattr(parent, module_name, quant_layer) + else: + model = quant_layer setattr(model, 'quant_mode', quant_config.quant_mode) return model @@ -78,7 +81,7 @@ def preprocess_init_params(init_params, name, module): module_name = name.rsplit('.', 1)[-1] init_params["transb"] = module_name == "lm_head" - quantize_layers( + model = quantize_layers( model, quant_config, quant_map, @@ -102,7 +105,7 @@ def preprocess_init_params(init_params, name, module): init_params[ "use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ - quantize_layers( + model = quantize_layers( model, quant_config, quant_map, @@ -120,7 +123,7 @@ def smooth_quantize_ootb( RowLinear: Int8SmoothQuantRowLinear, } - quantize_layers( + model = quantize_layers( model, quant_config, quant_map, @@ -138,7 +141,7 @@ def smooth_quantize_plugin(model, quant_mode): } for name, layer, parent in model.named_modules_with_parent(): layer_name = name.rsplit('.', 1)[-1] - if layer_name in ['ln_f']: + if layer_name in ['ln_f', 'ln_embed']: continue quant_cls = None @@ -156,7 +159,10 @@ def smooth_quantize_plugin(model, quant_mode): init_params[ "num_attention_heads"] = layer.num_attention_heads * layer.tp_size quant_layer = quant_cls(**init_params) - setattr(parent, layer_name, quant_layer) + if parent is not None: + setattr(parent, layer_name, quant_layer) + else: + model = quant_layer setattr(model, 'quant_mode', quant_mode) return model @@ -178,7 +184,7 @@ def fp8_quantize(model, quant_config: QuantConfig): RowLinear: FP8RowLinear, } - 
quantize_layers( + model = quantize_layers( model, quant_config, quant_map, diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 86c2527f4..b71d7590c 100644 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -31,10 +31,12 @@ import safetensors import torch from datasets import load_dataset +from safetensors.torch import load_file, save_file from torch.utils.data import DataLoader from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from ..logger import logger +from ..mapping import Mapping from .mode import QuantAlgo EMPTY_CFG = { @@ -122,7 +124,8 @@ def quant_cfg_choices(): "Gemma": "gemma", "MixtralForCausalLM": "llama", "ArcticForCausalLM": "llama", - "Phi3SmallForCausalLM": "phi", + "Phi3SmallForCausalLM": "phi3small", + "Phi3ForCausalLM": "phi3", } @@ -263,10 +266,95 @@ def calibrate_loop(): return model -def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat, - kv_cache_dtype, calib_size, batch_size, - calib_max_seq_length, awq_block_size, output_dir, - tp_size, pp_size, seed, tokenizer_max_seq_length): +def combine_medusa_weight(tp_size, pp_size, base_model_output_dir, + num_medusa_heads, num_medusa_layers, max_draft_len, + medusa_hidden_act, medusa_model_dir, + quant_medusa_head): + + with open(f"{medusa_model_dir}/config.json", "r") as fp: + medusa_config = json.load(fp) + + num_medusa_heads_from_config = medusa_config.get('medusa_num_heads', + num_medusa_heads) + num_medusa_layers = medusa_config.get('medusa_num_layers', + num_medusa_layers) + if num_medusa_heads is None: + num_medusa_heads = num_medusa_heads_from_config + + assert max_draft_len > 0, "should have max_draft_len > 0" + + world_size = tp_size * pp_size + # Process for each rank + for rank in range(world_size): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=tp_size, + pp_size=pp_size) + # 1. Load medusa weight for each rank + from tensorrt_llm.models.medusa.weight import load_medusa_hf + medusa_weights = load_medusa_hf(medusa_path=medusa_model_dir, + num_medusa_heads=num_medusa_heads, + num_medusa_layers=num_medusa_layers, + mapping=mapping, + dtype="float16") + # 2. Load base model safetensors (after quant) + base_model_weights = load_file( + f"{base_model_output_dir}/rank{rank}.safetensors") + + # 3. Combine and save weight + base_model_weights.update(medusa_weights) + save_file(base_model_weights, + f"{base_model_output_dir}/rank{rank}.safetensors") + + # 4. 
Add medusa config into config.json + with open(f"{base_model_output_dir}/config.json", 'r') as f: + base_model_config = json.load(f) + f.close() + + with open(f"{base_model_output_dir}/config.json", 'w') as f: + base_model_config['architecture'] = "MedusaForCausalLM" + base_model_config['quantization']['exclude_modules'] = [ + 'lm_head', + '*router', + '*vocab_embedding', + '*position_embedding', + '*block_embedding', + ] + if not quant_medusa_head: + base_model_config['quantization']['exclude_modules'].append( + '*medusa_heads*') + + base_model_config['max_draft_len'] = max_draft_len + base_model_config['num_medusa_heads'] = num_medusa_heads + base_model_config['num_medusa_layers'] = num_medusa_layers + json.dump(base_model_config, f, indent=4) + + torch.cuda.empty_cache() + print("Combine medusa heads' weight, done.") + + +def quantize_and_export(*, + model_dir, + device, + calib_dataset, + dtype, + qformat, + kv_cache_dtype, + calib_size, + batch_size, + calib_max_seq_length, + awq_block_size, + output_dir, + tp_size, + pp_size, + seed, + tokenizer_max_seq_length, + num_medusa_heads=None, + num_medusa_layers=None, + max_draft_len=None, + medusa_hidden_act=None, + medusa_model_dir=None, + quant_medusa_head=None): ''' Load model from the model_dir, call Modelopt to quantize the model, and then export the quantized model as TRT-LLM checkpoint @@ -419,24 +507,16 @@ def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat, with open(f"{export_path}/config.json", "w") as f: json.dump(tensorrt_llm_config, f, indent=4) - if model_type == 'phi': - with open(f"{export_path}/config.json", "r") as f: - tensorrt_llm_config = json.load(f) - phi_config = AutoConfig.from_pretrained(model_dir, - trust_remote_code=True) - - from ..models.phi3.phi3small.convert import \ - convert_hf_config as phi_config_converter - phi_config = phi_config_converter(phi_config, dtype, None) - - for key, value in phi_config.items(): - tensorrt_llm_config[key] = value - - with open(f"{export_path}/config.json", "w") as f: - json.dump(tensorrt_llm_config, f, indent=4) - torch.cuda.empty_cache( ) # otherwise torch is keeping using GPU, other routine like build engine has less free GPU to use + + # Workaround for combining medusa head + # TODO: move these integration into modelopt to avoid redundant reading and writing + if medusa_model_dir is not None: + combine_medusa_weight(tp_size, pp_size, export_path, + num_medusa_heads, num_medusa_layers, + max_draft_len, medusa_hidden_act, + medusa_model_dir, quant_medusa_head) end_time = time.time() print( "Quantized model exported to {} \nTotal time used {:.2f} s.".format( diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 12206c450..cb051e10d 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
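
The combine_medusa_weight helper above operates directly on the exported checkpoint directory: for each rank it merges the Medusa head tensors into the quantized base weights, then rewrites config.json so the checkpoint is treated as a Medusa model. Below is a minimal sketch of that per-rank merge, assuming the Medusa tensors have already been loaded into a plain dict; the function names and paths are illustrative, not part of the library.

import json
import os

from safetensors.torch import load_file, save_file


def merge_medusa_into_rank(checkpoint_dir: str, rank: int, medusa_weights: dict) -> None:
    # Load the quantized base-model tensors for this rank, add the Medusa
    # head tensors on top, and write the combined file back in place.
    rank_file = os.path.join(checkpoint_dir, f"rank{rank}.safetensors")
    base_weights = load_file(rank_file)
    base_weights.update(medusa_weights)
    save_file(base_weights, rank_file)


def patch_medusa_config(checkpoint_dir: str, max_draft_len: int,
                        num_medusa_heads: int, num_medusa_layers: int) -> None:
    # Mark the checkpoint as a Medusa model so the engine builder picks up
    # the extra heads and the draft length.
    config_path = os.path.join(checkpoint_dir, "config.json")
    with open(config_path) as f:
        config = json.load(f)
    config["architecture"] = "MedusaForCausalLM"
    config["max_draft_len"] = max_draft_len
    config["num_medusa_heads"] = num_medusa_heads
    config["num_medusa_layers"] = num_medusa_layers
    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)
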
-__version__ = "0.11.0.dev2024061800" +__version__ = "0.11.0.dev2024062500" diff --git a/tests/model/test_mamba.py b/tests/model/test_mamba.py index ce5498361..91c1cdfe7 100644 --- a/tests/model/test_mamba.py +++ b/tests/model/test_mamba.py @@ -84,10 +84,12 @@ def _gen_tensorrt_llm_network(self, network, hf_config, hf_path, hf_mamba, hf_config, hf_path, hf_mamba, load_mode, dtype) with net_guard(network): network.set_named_parameters(tensorrt_llm_mamba.named_parameters()) - inputs = tensorrt_llm_mamba.prepare_inputs(batch_size, - input_len, - input_len + output_len, - use_cache=False) + inputs = tensorrt_llm_mamba.prepare_inputs( + batch_size, + input_len, + input_len + output_len, + max_num_tokens=batch_size * input_len, + use_cache=False) # Prepare tensorrt_llm_mamba(**inputs) return network diff --git a/tests/test_llama_conversion.sh b/tests/test_llama_conversion.sh index 8b55cd7a8..063fc8d14 100755 --- a/tests/test_llama_conversion.sh +++ b/tests/test_llama_conversion.sh @@ -80,7 +80,7 @@ test_gptq() { python convert_checkpoint.py --model_dir ${MODEL} \ --output_dir ./tllm_checkpoint/2gpu_gptq \ --dtype float16 \ - --modelopt_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \ + --quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \ --use_weight_only \ --weight_only_precision int4_gptq \ --per_group \
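
The mamba test update above mirrors the prepare_inputs signature change earlier in this diff: max_num_tokens is now supplied by the caller instead of being derived inside the model. As a small sketch, the helper below reproduces the default that the removed code used to compute; the function itself is hypothetical and not part of the codebase.

def default_max_num_tokens(max_batch_size: int,
                           max_input_len: int,
                           max_beam_width: int = 1,
                           max_draft_len: int = 0) -> int:
    # Mirrors the default removed from prepare_inputs: enough tokens for a
    # full context batch, or for a full generation step with draft tokens.
    context_tokens = max_input_len * max_batch_size
    generation_tokens = max_beam_width * (max_draft_len + 1) * max_batch_size
    return max(context_tokens, generation_tokens)


# The test above passes batch_size * input_len, which matches this default
# when beam width is 1 and no draft tokens are used.
print(default_max_num_tokens(4, 128))  # 512
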