From 94522b62d4679eca2b96923ac2b47bae277f92b7 Mon Sep 17 00:00:00 2001
From: Georg Kolling <georg.kolling@nanoporetech.com>
Date: Tue, 21 May 2024 12:54:14 +0100
Subject: [PATCH 1/5] TxModel: Enable use of koi masked attention kernel. Make
 sure chunk size is a multiple of (conv_stride * 16)

---
 dorado/basecall/BasecallerParams.cpp | 10 ++++------
 dorado/basecall/BasecallerParams.h   |  2 +-
 dorado/basecall/CRFModelConfig.h     |  4 +++-
 dorado/basecall/nn/TxModel.cpp       | 20 ++++++++++++++++++--
 tests/CRFModelConfigTest.cpp         |  4 ++--
 5 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/dorado/basecall/BasecallerParams.cpp b/dorado/basecall/BasecallerParams.cpp
index 1991df2c3..ad7e33b85 100644
--- a/dorado/basecall/BasecallerParams.cpp
+++ b/dorado/basecall/BasecallerParams.cpp
@@ -90,11 +90,9 @@ void BasecallerParams::update(const BasecallerParams &other) {
     merge(m_batch_size, other.m_batch_size, "batchsize");
 }
 
-void BasecallerParams::normalise(size_t divisor) {
-    const int div = static_cast<int>(divisor);
-
+void BasecallerParams::normalise(size_t chunk_size_divisor, size_t overlap_divisor) {
     // Apply normalised value with FORCE
-    auto normalise_param = [&, div](Value &self, const std::string &name) {
+    auto normalise_param = [&](Value &self, const std::string &name, int div) {
         const int before_val = self.val;
         const int new_val = (self.val / div) * div;
         if (set_value(self, Value{new_val, Priority::FORCE})) {
@@ -102,8 +100,8 @@ void BasecallerParams::normalise(size_t divisor) {
         }
     };
 
-    normalise_param(m_chunk_size, "chunksize");
-    normalise_param(m_overlap, "overlap");
+    normalise_param(m_chunk_size, "chunksize", static_cast<int>(chunk_size_divisor));
+    normalise_param(m_overlap, "overlap", static_cast<int>(overlap_divisor));
 }
 
 }  // namespace dorado::basecall
\ No newline at end of file
diff --git a/dorado/basecall/BasecallerParams.h b/dorado/basecall/BasecallerParams.h
index e67e80e2a..7af2d5c60 100644
--- a/dorado/basecall/BasecallerParams.h
+++ b/dorado/basecall/BasecallerParams.h
@@ -56,7 +56,7 @@ class BasecallerParams {
     void update(const BasecallerParams& other);
 
     // Normalise the `chunk_size` and `overlap` to the nearest evenly divisible integer of divisor (stride)
-    void normalise(size_t divisor);
+    void normalise(size_t chunk_size_divisor, size_t overlap_divisor);
 
     std::string to_string() const {
         std::string str = "BasecallerParams {";
diff --git a/dorado/basecall/CRFModelConfig.h b/dorado/basecall/CRFModelConfig.h
index 29e434d2d..7c1806fb3 100644
--- a/dorado/basecall/CRFModelConfig.h
+++ b/dorado/basecall/CRFModelConfig.h
@@ -184,7 +184,9 @@ struct CRFModelConfig {
     int stride_inner() const { return stride * scale_factor(); };
 
     // Normalise the basecaller parameters `chunk_size` and `overlap` to the `strde_inner`
-    void normalise_basecaller_params() { basecaller.normalise(stride_inner()); }
+    void normalise_basecaller_params() {
+        basecaller.normalise(stride_inner() * (is_tx_model() ? 16 : 1), stride_inner());
+    }
     // True if `chunk_size` and `overlap` is evenly divisible by the `strde_inner`
     bool has_normalised_basecaller_params() const;
 
diff --git a/dorado/basecall/nn/TxModel.cpp b/dorado/basecall/nn/TxModel.cpp
index ec3efef32..9a8a8a068 100644
--- a/dorado/basecall/nn/TxModel.cpp
+++ b/dorado/basecall/nn/TxModel.cpp
@@ -299,7 +299,7 @@ at::Tensor MultiHeadAttentionImpl::forward(at::Tensor x) {
                     stream, static_cast<int>(N), static_cast<int>(T), nhead, head_dim,
                     rotary_emb->theta, qkv.data_ptr(), out.data_ptr());
             if (res != KOI_SUCCESS) {
-                throw std::runtime_error("Koi windowed attention failed.");
+                throw std::runtime_error("Koi rotary embedding failed.");
             }
             qkv = out;
         } else
@@ -308,10 +308,26 @@ at::Tensor MultiHeadAttentionImpl::forward(at::Tensor x) {
             qkv = rotary_emb(qkv);
         }
     }
+    attn_output_ntc = at::empty({N, T, C}, x.options());
+#if DORADO_CUDA_BUILD
+    int res = KOI_NOT_SUPPORTED;
+    if (utils::get_dev_opt<bool>("use_koi_attention", true) && koi_can_use_cutlass()) {
+        utils::ScopedProfileRange spr("KOI_MEA", 3);
+        auto stream = at::cuda::getCurrentCUDAStream().stream();
+        const auto [win_upper, win_lower] = attn_window;
+        res = host_masked_attention_f16(stream, static_cast<int>(N), static_cast<int>(T), nhead,
+                                        head_dim, win_upper, win_lower, qkv[0].data_ptr(),
+                                        qkv[1].data_ptr(), qkv[2].data_ptr(),
+                                        attn_output_ntc.data_ptr());
+        if (res != KOI_SUCCESS && res != KOI_NOT_SUPPORTED) {
+            throw std::runtime_error("Koi windowed attention failed.");
+        }
+    }
+    if (res == KOI_NOT_SUPPORTED)
+#endif
     {
         utils::ScopedProfileRange spr("MEA", 3);
         auto attn_window_mask = get_attn_window_mask(T);
-        attn_output_ntc = at::empty({N, T, C}, x.options());
         auto attn_output = attn_output_ntc.view({N, T, nhead, head_dim}).transpose(1, 2);
         const auto [win_upper, win_lower] = attn_window;
         for (int i = 0; i < num_splits; ++i) {
diff --git a/tests/CRFModelConfigTest.cpp b/tests/CRFModelConfigTest.cpp
index 8357798c7..dc8814a2f 100644
--- a/tests/CRFModelConfigTest.cpp
+++ b/tests/CRFModelConfigTest.cpp
@@ -38,8 +38,8 @@ TEST_CASE(CUT_TAG ": test normalise BasecallerParams", CUT_TAG) {
         config.normalise_basecaller_params();
         CHECK(config.has_normalised_basecaller_params());
         CHECK(config.basecaller.chunk_size() % config.stride_inner() == 0);
-        // Expected (121 / 12) * 12
-        CHECK(config.basecaller.chunk_size() == 120);
+        // Expected ((121 + 191) / 192) * 192
+        CHECK(config.basecaller.chunk_size() == 192);
     }
 }
 

From ad4f43619f86a0bc800a793c5fbc9dff4b5532e3 Mon Sep 17 00:00:00 2001
From: Chris Seymour <christopher.seymour@nanoporetech.com>
Date: Tue, 21 May 2024 13:04:17 +0100
Subject: [PATCH 2/5] koi v0.4.8

---
 cmake/Koi.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Koi.cmake b/cmake/Koi.cmake
index b3cbe3d8b..324e9cfee 100644
--- a/cmake/Koi.cmake
+++ b/cmake/Koi.cmake
@@ -20,7 +20,7 @@ endfunction()
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Linux" OR WIN32)
 
-    set(KOI_VERSION 0.4.7)
+    set(KOI_VERSION 0.4.8)
     if(BUILD_KOI_FROM_SOURCE)
         set(KOI_DIR "${DORADO_3RD_PARTY_SOURCE}/koi")
         if(NOT EXISTS ${KOI_DIR})

From c3c2d2a70d40a1412873625cb5cb0e9be8e31bf3 Mon Sep 17 00:00:00 2001
From: Georg Kolling <georg.kolling@nanoporetech.com>
Date: Tue, 21 May 2024 13:21:44 +0100
Subject: [PATCH 3/5] Fix test for non-normalised basecaller params

---
 tests/CRFModelConfigTest.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/CRFModelConfigTest.cpp b/tests/CRFModelConfigTest.cpp
index dc8814a2f..3522a29e4 100644
--- a/tests/CRFModelConfigTest.cpp
+++ b/tests/CRFModelConfigTest.cpp
@@ -31,15 +31,15 @@ TEST_CASE(CUT_TAG ": test normalise BasecallerParams", CUT_TAG) {
                 fs::path(get_data_dir("model_configs/dna_r10.4.1_e8.2_400bps_sup@v5.0.0"));
         CRFModelConfig config = load_crf_model_config(path);
 
-        // Set chunksize to (12 * 10) + 1 to ensure it's not mod12
-        config.basecaller.set_chunk_size(121);
+        // Set chunksize to (12 * 16 * 10) + 1 so it is not a multiple of 192 (stride_inner * 16)
+        config.basecaller.set_chunk_size(1921);
         CHECK_FALSE(config.has_normalised_basecaller_params());
 
         config.normalise_basecaller_params();
         CHECK(config.has_normalised_basecaller_params());
         CHECK(config.basecaller.chunk_size() % config.stride_inner() == 0);
-        // Expected ((121 + 191) / 192) * 192
-        CHECK(config.basecaller.chunk_size() == 192);
+        // Expected (1921 / 192) * 192 with integer division, i.e. rounded down to 1920
+        CHECK(config.basecaller.chunk_size() == 1920);
     }
 }
 

From 46abe1b8d012e24e11baff02ec173df4217ecba7 Mon Sep 17 00:00:00 2001
From: Georg Kolling <georg.kolling@nanoporetech.com>
Date: Tue, 21 May 2024 13:24:22 +0100
Subject: [PATCH 4/5] TxModel: don't try to use koi attention kernel when
 DORADO_TX2 is set

---
 dorado/basecall/nn/TxModel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dorado/basecall/nn/TxModel.cpp b/dorado/basecall/nn/TxModel.cpp
index 9a8a8a068..dd9c5fe51 100644
--- a/dorado/basecall/nn/TxModel.cpp
+++ b/dorado/basecall/nn/TxModel.cpp
@@ -309,7 +309,7 @@ at::Tensor MultiHeadAttentionImpl::forward(at::Tensor x) {
         }
     }
     attn_output_ntc = at::empty({N, T, C}, x.options());
-#if DORADO_CUDA_BUILD
+#if DORADO_CUDA_BUILD && !defined(DORADO_TX2)
     int res = KOI_NOT_SUPPORTED;
     if (utils::get_dev_opt<bool>("use_koi_attention", true) && koi_can_use_cutlass()) {
         utils::ScopedProfileRange spr("KOI_MEA", 3);

From 461d4badea7827f23284d3820be980add5b798bd Mon Sep 17 00:00:00 2001
From: Georg Kolling <georg.kolling@nanoporetech.com>
Date: Tue, 21 May 2024 14:46:00 +0100
Subject: [PATCH 5/5] TxModel/CudaCaller: limit transformer batch size to 512
 for now, as larger batch sizes seem to cause problems at the moment

---
 dorado/basecall/CudaCaller.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dorado/basecall/CudaCaller.cpp b/dorado/basecall/CudaCaller.cpp
index 07c4ca990..a6c00e32c 100644
--- a/dorado/basecall/CudaCaller.cpp
+++ b/dorado/basecall/CudaCaller.cpp
@@ -317,7 +317,8 @@ void CudaCaller::determine_batch_dims(float memory_limit_fraction,
     // trade-off between getting more accurate measurements and avoiding excessive startup time.
     const int chunk_size = std::min(m_batch_dims.back().T_in, m_config.stride * 300);
     // We limit the maximum when doing benchmarking to avoid excessive startup time.
-    const int max_batch_size_limit = 10240;
+    // Transformer models use a lower limit for now, as larger batch sizes seem to cause problems.
+    const int max_batch_size_limit = m_config.is_tx_model() ? 512 : 10240;
     int max_batch_size = *std::max_element(max_batch_sizes.begin(), max_batch_sizes.end());
     max_batch_size = std::min(max_batch_size, max_batch_size_limit);
     spdlog::debug("Auto batchsize {}: testing up to {} in steps of {}", m_device, max_batch_size,