From 94522b62d4679eca2b96923ac2b47bae277f92b7 Mon Sep 17 00:00:00 2001 From: Georg Kolling Date: Tue, 21 May 2024 12:54:14 +0100 Subject: [PATCH 1/5] TxModel: Enable use of koi masked attention kernel. Make sure chunk size is a multiple of (conv_stride * 16) --- dorado/basecall/BasecallerParams.cpp | 10 ++++------ dorado/basecall/BasecallerParams.h | 2 +- dorado/basecall/CRFModelConfig.h | 4 +++- dorado/basecall/nn/TxModel.cpp | 20 ++++++++++++++++++-- tests/CRFModelConfigTest.cpp | 4 ++-- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/dorado/basecall/BasecallerParams.cpp b/dorado/basecall/BasecallerParams.cpp index 1991df2c3..ad7e33b85 100644 --- a/dorado/basecall/BasecallerParams.cpp +++ b/dorado/basecall/BasecallerParams.cpp @@ -90,11 +90,9 @@ void BasecallerParams::update(const BasecallerParams &other) { merge(m_batch_size, other.m_batch_size, "batchsize"); } -void BasecallerParams::normalise(size_t divisor) { - const int div = static_cast<int>(divisor); - +void BasecallerParams::normalise(size_t chunk_size_divisor, size_t overlap_divisor) { // Apply normalised value with FORCE - auto normalise_param = [&, div](Value &self, const std::string &name) { + auto normalise_param = [&](Value &self, const std::string &name, int div) { const int before_val = self.val; const int new_val = (self.val / div) * div; if (set_value(self, Value{new_val, Priority::FORCE})) { @@ -102,8 +100,8 @@ void BasecallerParams::normalise(size_t divisor) { } }; - normalise_param(m_chunk_size, "chunksize"); - normalise_param(m_overlap, "overlap"); + normalise_param(m_chunk_size, "chunksize", static_cast<int>(chunk_size_divisor)); + normalise_param(m_overlap, "overlap", static_cast<int>(overlap_divisor)); } } // namespace dorado::basecall \ No newline at end of file diff --git a/dorado/basecall/BasecallerParams.h b/dorado/basecall/BasecallerParams.h index e67e80e2a..7af2d5c60 100644 --- a/dorado/basecall/BasecallerParams.h +++ b/dorado/basecall/BasecallerParams.h @@ -56,7 +56,7 @@ class
BasecallerParams { void update(const BasecallerParams& other); // Normalise the `chunk_size` and `overlap` to the nearest evenly divisible integer of divisor (stride) - void normalise(size_t divisor); + void normalise(size_t chunk_size_divisor, size_t overlap_divisor); std::string to_string() const { std::string str = "BasecallerParams {"; diff --git a/dorado/basecall/CRFModelConfig.h b/dorado/basecall/CRFModelConfig.h index 29e434d2d..7c1806fb3 100644 --- a/dorado/basecall/CRFModelConfig.h +++ b/dorado/basecall/CRFModelConfig.h @@ -184,7 +184,9 @@ struct CRFModelConfig { int stride_inner() const { return stride * scale_factor(); }; // Normalise the basecaller parameters `chunk_size` and `overlap` to the `strde_inner` - void normalise_basecaller_params() { basecaller.normalise(stride_inner()); } + void normalise_basecaller_params() { + basecaller.normalise(stride_inner() * (is_tx_model() ? 16 : 1), stride_inner()); + } // True if `chunk_size` and `overlap` is evenly divisible by the `strde_inner` bool has_normalised_basecaller_params() const; diff --git a/dorado/basecall/nn/TxModel.cpp b/dorado/basecall/nn/TxModel.cpp index ec3efef32..9a8a8a068 100644 --- a/dorado/basecall/nn/TxModel.cpp +++ b/dorado/basecall/nn/TxModel.cpp @@ -299,7 +299,7 @@ at::Tensor MultiHeadAttentionImpl::forward(at::Tensor x) { stream, static_cast(N), static_cast(T), nhead, head_dim, rotary_emb->theta, qkv.data_ptr(), out.data_ptr()); if (res != KOI_SUCCESS) { - throw std::runtime_error("Koi windowed attention failed."); + throw std::runtime_error("Koi rotary embedding failed."); } qkv = out; } else @@ -308,10 +308,26 @@ at::Tensor MultiHeadAttentionImpl::forward(at::Tensor x) { qkv = rotary_emb(qkv); } } + attn_output_ntc = at::empty({N, T, C}, x.options()); +#if DORADO_CUDA_BUILD + int res = KOI_NOT_SUPPORTED; + if (utils::get_dev_opt("use_koi_attention", true) && koi_can_use_cutlass()) { + utils::ScopedProfileRange spr("KOI_MEA", 3); + auto stream = 
at::cuda::getCurrentCUDAStream().stream(); + const auto [win_upper, win_lower] = attn_window; + res = host_masked_attention_f16(stream, static_cast(N), static_cast(T), nhead, + head_dim, win_upper, win_lower, qkv[0].data_ptr(), + qkv[1].data_ptr(), qkv[2].data_ptr(), + attn_output_ntc.data_ptr()); + if (res != KOI_SUCCESS && res != KOI_NOT_SUPPORTED) { + throw std::runtime_error("Koi windowed attention failed."); + } + } + if (res == KOI_NOT_SUPPORTED) +#endif { utils::ScopedProfileRange spr("MEA", 3); auto attn_window_mask = get_attn_window_mask(T); - attn_output_ntc = at::empty({N, T, C}, x.options()); auto attn_output = attn_output_ntc.view({N, T, nhead, head_dim}).transpose(1, 2); const auto [win_upper, win_lower] = attn_window; for (int i = 0; i < num_splits; ++i) { diff --git a/tests/CRFModelConfigTest.cpp b/tests/CRFModelConfigTest.cpp index 8357798c7..dc8814a2f 100644 --- a/tests/CRFModelConfigTest.cpp +++ b/tests/CRFModelConfigTest.cpp @@ -38,8 +38,8 @@ TEST_CASE(CUT_TAG ": test normalise BasecallerParams", CUT_TAG) { config.normalise_basecaller_params(); CHECK(config.has_normalised_basecaller_params()); CHECK(config.basecaller.chunk_size() % config.stride_inner() == 0); - // Expected (121 / 12) * 12 - CHECK(config.basecaller.chunk_size() == 120); + // Expected ((121 + 191) / 192) * 192 + CHECK(config.basecaller.chunk_size() == 192); } } From ad4f43619f86a0bc800a793c5fbc9dff4b5532e3 Mon Sep 17 00:00:00 2001 From: Chris Seymour Date: Tue, 21 May 2024 13:04:17 +0100 Subject: [PATCH 2/5] koi v0.4.8 --- cmake/Koi.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Koi.cmake b/cmake/Koi.cmake index b3cbe3d8b..324e9cfee 100644 --- a/cmake/Koi.cmake +++ b/cmake/Koi.cmake @@ -20,7 +20,7 @@ endfunction() if(CMAKE_SYSTEM_NAME STREQUAL "Linux" OR WIN32) - set(KOI_VERSION 0.4.7) + set(KOI_VERSION 0.4.8) if(BUILD_KOI_FROM_SOURCE) set(KOI_DIR "${DORADO_3RD_PARTY_SOURCE}/koi") if(NOT EXISTS ${KOI_DIR}) From 
c3c2d2a70d40a1412873625cb5cb0e9be8e31bf3 Mon Sep 17 00:00:00 2001 From: Georg Kolling Date: Tue, 21 May 2024 13:21:44 +0100 Subject: [PATCH 3/5] Fix test for non-normalised basecaller params --- tests/CRFModelConfigTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/CRFModelConfigTest.cpp b/tests/CRFModelConfigTest.cpp index dc8814a2f..3522a29e4 100644 --- a/tests/CRFModelConfigTest.cpp +++ b/tests/CRFModelConfigTest.cpp @@ -31,15 +31,15 @@ TEST_CASE(CUT_TAG ": test normalise BasecallerParams", CUT_TAG) { fs::path(get_data_dir("model_configs/dna_r10.4.1_e8.2_400bps_sup@v5.0.0")); CRFModelConfig config = load_crf_model_config(path); - // Set chunksize to (12 * 10) + 1 to ensure it's not mod12 - config.basecaller.set_chunk_size(121); + // Set chunksize to (12 * 16 * 10) + 1 to ensure it's not mod192 + config.basecaller.set_chunk_size(1921); CHECK_FALSE(config.has_normalised_basecaller_params()); config.normalise_basecaller_params(); CHECK(config.has_normalised_basecaller_params()); CHECK(config.basecaller.chunk_size() % config.stride_inner() == 0); - // Expected ((121 + 191) / 192) * 192 - CHECK(config.basecaller.chunk_size() == 192); + // Expected (1921 / 192) * 192 + CHECK(config.basecaller.chunk_size() == 1920); } } From 46abe1b8d012e24e11baff02ec173df4217ecba7 Mon Sep 17 00:00:00 2001 From: Georg Kolling Date: Tue, 21 May 2024 13:24:22 +0100 Subject: [PATCH 4/5] TxModel: don't try to use koi attention kernel when DORADO_TX2 is set --- dorado/basecall/nn/TxModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/basecall/nn/TxModel.cpp b/dorado/basecall/nn/TxModel.cpp index 9a8a8a068..dd9c5fe51 100644 --- a/dorado/basecall/nn/TxModel.cpp +++ b/dorado/basecall/nn/TxModel.cpp @@ -309,7 +309,7 @@ at::Tensor MultiHeadAttentionImpl::forward(at::Tensor x) { } } attn_output_ntc = at::empty({N, T, C}, x.options()); -#if DORADO_CUDA_BUILD +#if DORADO_CUDA_BUILD && !defined(DORADO_TX2) int res = 
KOI_NOT_SUPPORTED; if (utils::get_dev_opt("use_koi_attention", true) && koi_can_use_cutlass()) { utils::ScopedProfileRange spr("KOI_MEA", 3); From 461d4badea7827f23284d3820be980add5b798bd Mon Sep 17 00:00:00 2001 From: Georg Kolling Date: Tue, 21 May 2024 14:46:00 +0100 Subject: [PATCH 5/5] TxModel/CudaCaller: limit transformer batch size to 512 for now, as larger batch sizes seem to cause problems at the moment --- dorado/basecall/CudaCaller.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dorado/basecall/CudaCaller.cpp b/dorado/basecall/CudaCaller.cpp index 07c4ca990..a6c00e32c 100644 --- a/dorado/basecall/CudaCaller.cpp +++ b/dorado/basecall/CudaCaller.cpp @@ -317,7 +317,8 @@ void CudaCaller::determine_batch_dims(float memory_limit_fraction, // trade-off between getting more accurate measurements and avoiding excessive startup time. const int chunk_size = std::min(m_batch_dims.back().T_in, m_config.stride * 300); // We limit the maximum when doing benchmarking to avoid excessive startup time. - const int max_batch_size_limit = 10240; + // The limit for transformer models should be increased at a later time. + const int max_batch_size_limit = m_config.is_tx_model() ? 512 : 10240; int max_batch_size = *std::max_element(max_batch_sizes.begin(), max_batch_sizes.end()); max_batch_size = std::min(max_batch_size, max_batch_size_limit); spdlog::debug("Auto batchsize {}: testing up to {} in steps of {}", m_device, max_batch_size,