diff --git a/cpp/bench/prims/linalg/masked_matmul.cu b/cpp/bench/prims/linalg/masked_matmul.cu index eda9cb1710..b96e14a25d 100644 --- a/cpp/bench/prims/linalg/masked_matmul.cu +++ b/cpp/bench/prims/linalg/masked_matmul.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -49,11 +49,14 @@ inline auto operator<<(std::ostream& os, const MaskedMatmulBenchParams& { os << " m*k*n=" << params.m << "*" << params.k << "*" << params.n << "\tsparsity=" << params.sparsity; - if (params.sparsity == 1.0) { os << "<-inner product for comparison"; } + if (params.sparsity == 0.0) { os << "<-inner product for comparison"; } return os; } -template +template struct MaskedMatmulBench : public fixture { MaskedMatmulBench(const MaskedMatmulBenchParams& p) : fixture(true), @@ -64,15 +67,15 @@ struct MaskedMatmulBench : public fixture { c_indptr_d(0, stream), c_indices_d(0, stream), c_data_d(0, stream), - bitmap_d(0, stream), + bits_d(0, stream), c_dense_data_d(0, stream) { - index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bitmap_t) * 8)); - std::vector bitmap_h(element); + index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bits_t) * 8)); + std::vector bits_h(element); a_data_d.resize(params.m * params.k, stream); b_data_d.resize(params.k * params.n, stream); - bitmap_d.resize(element, stream); + bits_d.resize(element, stream); raft::random::RngState rng(2024ULL); raft::random::uniform( @@ -82,7 +85,13 @@ struct MaskedMatmulBench : public fixture { std::vector c_dense_data_h(params.m * params.n); - c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h); + if constexpr (bitmap_or_bitset) { + c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h); + } else { + c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h); + repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1); + c_true_nnz *= params.m; + } std::vector values(c_true_nnz); std::vector indices(c_true_nnz); @@ -93,24 +102,49 @@ struct MaskedMatmulBench : public fixture { c_indices_d.resize(c_true_nnz, stream); c_dense_data_d.resize(params.m * params.n, stream); - cpu_convert_to_csr(bitmap_h, params.m, params.n, indices, indptr); + cpu_convert_to_csr(bits_h, params.m, params.n, indices, indptr); RAFT_EXPECTS(c_true_nnz == c_indices_d.size(), "Something wrong. 
The c_true_nnz != c_indices_d.size()!"); update_device(c_data_d.data(), values.data(), c_true_nnz, stream); update_device(c_indices_d.data(), indices.data(), c_true_nnz, stream); update_device(c_indptr_d.data(), indptr.data(), params.m + 1, stream); - update_device(bitmap_d.data(), bitmap_h.data(), element, stream); + update_device(bits_d.data(), bits_h.data(), element, stream); + } + + void repeat_cpu_bitset_inplace(std::vector& inout, size_t input_bits, size_t repeat) + { + size_t output_bit_index = input_bits; + + for (size_t r = 0; r < repeat; ++r) { + for (size_t i = 0; i < input_bits; ++i) { + size_t input_unit_index = i / (sizeof(bits_t) * 8); + size_t input_bit_offset = i % (sizeof(bits_t) * 8); + bool bit = (inout[input_unit_index] >> input_bit_offset) & 1; + + size_t output_unit_index = output_bit_index / (sizeof(bits_t) * 8); + size_t output_bit_offset = output_bit_index % (sizeof(bits_t) * 8); + + inout[output_unit_index] |= (static_cast(bit) << output_bit_offset); + + ++output_bit_index; + } + } } - index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitmap) + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bits) { index_t total = static_cast(m * n); - index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t num_ones = static_cast((total * 1.0f) * (1.0f - sparsity)); index_t res = num_ones; - for (auto& item : bitmap) { - item = static_cast(0); + if (sparsity == 0.0f) { + std::fill(bits.begin(), bits.end(), 0xffffffff); + return num_ones; + } + + for (auto& item : bits) { + item = static_cast(0); } std::random_device rd; @@ -120,8 +154,8 @@ struct MaskedMatmulBench : public fixture { while (num_ones > 0) { index_t index = dis(gen); - bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))]; - index_t bit_position = index % (8 * sizeof(bitmap_t)); + bits_t& element = bits[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); if (((element >> bit_position) & 1) == 0) { element |= (static_cast(1) << bit_position); @@ -131,7 +165,7 @@ struct MaskedMatmulBench : public fixture { return res; } - void cpu_convert_to_csr(std::vector& bitmap, + void cpu_convert_to_csr(std::vector& bits, index_t rows, index_t cols, std::vector& indices, @@ -142,14 +176,14 @@ struct MaskedMatmulBench : public fixture { indptr[offset_indptr++] = 0; index_t index = 0; - bitmap_t element = 0; + bits_t element = 0; index_t bit_position = 0; for (index_t i = 0; i < rows; ++i) { for (index_t j = 0; j < cols; ++j) { index = i * cols + j; - element = bitmap[index / (8 * sizeof(bitmap_t))]; - bit_position = index % (8 * sizeof(bitmap_t)); + element = bits[index / (8 * sizeof(bits_t))]; + bit_position = index % (8 * sizeof(bits_t)); if (((element >> bit_position) & 1)) { indices[offset_values] = static_cast(j); @@ -181,13 +215,17 @@ struct MaskedMatmulBench : public fixture { params.n, static_cast(c_indices_d.size())); - auto mask = - raft::core::bitmap_view(bitmap_d.data(), params.m, params.n); - auto c = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); - if (params.sparsity < 1.0) { - raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + if (params.sparsity > 0.0) { + if constexpr (bitmap_or_bitset) { + auto mask = + raft::core::bitmap_view(bits_d.data(), params.m, params.n); + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } else { + auto mask = raft::core::bitset_view(bits_d.data(), params.n); + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } } else { 
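// sparsity == 0.0 means every mask bit is set (fully dense), so this branch
// benchmarks a plain dense inner product via pairwise_distance for comparison.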
raft::distance::pairwise_distance(handle, a_data_d.data(), @@ -201,12 +239,16 @@ struct MaskedMatmulBench : public fixture { } resource::sync_stream(handle); - raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); - resource::sync_stream(handle); - - loop_on_state(state, [this, &a, &b, &mask, &c]() { - if (params.sparsity < 1.0) { - raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + loop_on_state(state, [this, &a, &b, &c]() { + if (params.sparsity > 0.0) { + if constexpr (bitmap_or_bitset) { + auto mask = + raft::core::bitmap_view(bits_d.data(), params.m, params.n); + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } else { + auto mask = raft::core::bitset_view(bits_d.data(), params.n); + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } } else { raft::distance::pairwise_distance(handle, a_data_d.data(), @@ -228,7 +270,7 @@ struct MaskedMatmulBench : public fixture { rmm::device_uvector a_data_d; rmm::device_uvector b_data_d; - rmm::device_uvector bitmap_d; + rmm::device_uvector bits_d; rmm::device_uvector c_dense_data_d; @@ -253,7 +295,7 @@ static std::vector> getInputs() raft::util::itertools::product({size_t(10), size_t(1024)}, {size_t(128), size_t(1024)}, {size_t(1024 * 1024)}, - {0.01f, 0.1f, 0.2f, 0.5f, 1.0f}); + {0.99f, 0.9f, 0.8f, 0.5f, 0.0f}); param_vec.reserve(params_group.size()); for (TestParams params : params_group) { @@ -263,6 +305,7 @@ static std::vector> getInputs() return param_vec; } -RAFT_BENCH_REGISTER((MaskedMatmulBench), "", getInputs()); +RAFT_BENCH_REGISTER((MaskedMatmulBench), "", getInputs()); +RAFT_BENCH_REGISTER((MaskedMatmulBench), "", getInputs()); } // namespace raft::bench::linalg diff --git a/cpp/bench/prims/sparse/bitset_to_csr.cu b/cpp/bench/prims/sparse/bitset_to_csr.cu new file mode 100644 index 0000000000..fef2d44d3e --- /dev/null +++ b/cpp/bench/prims/sparse/bitset_to_csr.cu @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace raft::bench::sparse { + +template +struct bench_param { + index_t n_repeat; + index_t n_cols; + float sparsity; +}; + +template +inline auto operator<<(std::ostream& os, const bench_param& params) -> std::ostream& +{ + os << " rows*cols=" << params.n_repeat << "*" << params.n_cols + << "\tsparsity=" << params.sparsity; + return os; +} + +template +struct BitsetToCsrBench : public fixture { + BitsetToCsrBench(const bench_param& p) + : fixture(true), + params(p), + handle(stream), + bitset_d(0, stream), + nnz(0), + indptr_d(0, stream), + indices_d(0, stream), + values_d(0, stream) + { + index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8)); + std::vector bitset_h(element); + nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h); + + bitset_d.resize(bitset_h.size(), stream); + indptr_d.resize(params.n_repeat + 1, stream); + indices_d.resize(nnz, stream); + values_d.resize(nnz, stream); + + update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream); + + resource::sync_stream(handle); + } + + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitset) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * (1.0f - sparsity)); + index_t res = num_ones; + + for (auto& item : bitset) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bitset_t& element = bitset[index / (8 * sizeof(bitset_t))]; + index_t bit_position = index % (8 * sizeof(bitset_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + auto bitset = raft::core::bitset_view(bitset_d.data(), 1 * params.n_cols); + + auto csr_view = raft::make_device_compressed_structure_view( + indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz); + auto csr = raft::make_device_csr_matrix(handle, csr_view); + + raft::sparse::convert::bitset_to_csr(handle, bitset, csr); + + resource::sync_stream(handle); + loop_on_state(state, [this, &bitset, &csr]() { + raft::sparse::convert::bitset_to_csr(handle, bitset, csr); + }); + } + + protected: + const raft::device_resources handle; + + bench_param params; + + rmm::device_uvector bitset_d; + rmm::device_uvector indptr_d; + rmm::device_uvector indices_d; + rmm::device_uvector values_d; + + index_t nnz; +}; // struct BitsetToCsrBench + +template +const std::vector> getInputs() +{ + std::vector> param_vec; + struct TestParams { + index_t m; + index_t n; + float sparsity; + }; + + const std::vector params_group = raft::util::itertools::product( + {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f}); + + param_vec.reserve(params_group.size()); + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, params.n, params.sparsity})); + } + return param_vec; +} + +template +const std::vector> getLargeInputs() +{ + std::vector> param_vec; + struct TestParams { + index_t m; + index_t n; + float sparsity; + }; + + const std::vector params_group = raft::util::itertools::product( + {index_t(1), index_t(100)}, {index_t(100 * 
1000000)}, {0.95f, 0.99f});
+
+  param_vec.reserve(params_group.size());
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.sparsity}));
+  }
+  return param_vec;
+}
+
+RAFT_BENCH_REGISTER((BitsetToCsrBench), "", getInputs());
+RAFT_BENCH_REGISTER((BitsetToCsrBench), "", getInputs());
+
+RAFT_BENCH_REGISTER((BitsetToCsrBench), "", getLargeInputs());
+
+}  // namespace raft::bench::sparse

diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 081192ed44..5237edd383 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <raft/sparse/convert/detail/bitset_to_csr.cuh>
 #include
 #include
 
@@ -129,6 +130,80 @@ void bitmap_to_csr(raft::resources const& handle,
   detail::bitmap_to_csr(handle, bitmap, csr);
 }
 
+/**
+ * @brief Converts a bitset matrix to a Compressed Sparse Row (CSR) format matrix.
+ *
+ * The bitset format inherently supports only a single-row matrix (rows=1). If the CSR matrix
+ * requires multiple rows, the data from the bitset will be repeated for each row in the output.
+ *
+ * Example usage:
+ *
+ * @code{.cpp}
+ * #include <raft/core/bitset.cuh>
+ * #include <raft/core/resources.hpp>
+ * #include <raft/sparse/convert/csr.cuh>
+ *
+ * #include <rmm/device_uvector.hpp>
+ *
+ * using bitset_t = uint32_t;
+ * using index_t  = int;
+ * using value_t  = float;
+ * using nnz_t    = index_t;
+ *
+ * raft::resources handle;
+ * auto stream    = resource::get_cuda_stream(handle);
+ * index_t n_rows = 3;
+ * index_t n_cols = 30;
+ *
+ * nnz_t nnz_for_bitset = 4;
+ * nnz_t nnz_for_csr    = nnz_for_bitset * n_rows;
+ *
+ * index_t bitset_size = (n_cols + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);  // = 1
+ *
+ * rmm::device_uvector<bitset_t> bitset_d(bitset_size, stream);
+ * std::vector<bitset_t> bitset_h = {
+ *   bitset_t(0b11001010),
+ * };  // nnz_for_bitset = 4;
+ *
+ * raft::copy(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+ *
+ * auto bitset_view = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), n_cols);
+ * auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, n_rows, n_cols, nnz_for_csr);
+ *
+ * raft::sparse::convert::bitset_to_csr(handle, bitset_view, csr);
+ * resource::sync_stream(handle);
+ *
+ * // Results:
+ * // csr.indptr  = [0, 4, 8, 12];
+ * // csr.indices = [1, 3, 6, 7,
+ * //                1, 3, 6, 7,
+ * //                1, 3, 6, 7];
+ * // csr.values  = [1, 1, 1, 1,
+ * //                1, 1, 1, 1,
+ * //                1, 1, 1, 1];
+ * @endcode
+ *
+ * @tparam bitset_t The data type of the elements in the bitset.
+ * @tparam index_t The data type used for indexing the elements in the matrices.
+ * @tparam csr_matrix_t Specifies the CSR matrix type, constrained to
+ * raft::device_csr_matrix.
+ *
+ * @param[in]  handle The RAFT handle containing the CUDA stream for operations.
+ * @param[in]  bitset The bitset view to be converted to CSR format.
+ * @param[out] csr    Output parameter where the resulting CSR matrix is stored. In the
+ *                    bitset, each '1' bit corresponds to a non-zero element in the CSR matrix.
+ */
+template <typename bitset_t,
+          typename index_t,
+          typename csr_matrix_t,
+          typename = std::enable_if_t<raft::is_device_csr_matrix_v<csr_matrix_t>>>
+void bitset_to_csr(raft::resources const& handle,
+                   raft::core::bitset_view<bitset_t, index_t> bitset,
+                   csr_matrix_t& csr)
+{
+  detail::bitset_to_csr(handle, bitset, csr);
+}
+
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft

diff --git a/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
new file mode 100644
index 0000000000..72abd02f7e
--- /dev/null
+++ b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include  // detail::popc
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace raft {
+namespace sparse {
+namespace convert {
+namespace detail {
+
+template <typename index_t, typename nnz_t>
+__global__ void repeat_csr_kernel(const index_t* indptr,
+                                  const index_t* indices,
+                                  index_t* repeated_indptr,
+                                  index_t* repeated_indices,
+                                  nnz_t nnz,
+                                  index_t repeat_count)
+{
+  int global_id                  = blockIdx.x * blockDim.x + threadIdx.x;
+  bool guard                     = global_id < nnz;
+  index_t* repeated_indices_addr = repeated_indices + global_id;
+
+  // Write the indptr entries of the repeated rows. The caller passes
+  // `repeated_indptr = indptr + 2`, and indptr[0..1] were already written when
+  // row 0 was materialized, so indptr[j] == j * nnz for every row boundary.
+  for (index_t i = global_id; i < repeat_count; i += gridDim.x * blockDim.x) {
+    repeated_indptr[i] = (i + 2) * nnz;
+  }
+
+  __syncthreads();
+
+  int block_offset = blockIdx.x * blockDim.x;
+
+  // Each thread loads one column index of row 0 (or -1 past the end) ...
+  index_t item;
+  int idx = block_offset + threadIdx.x;
+  item    = (idx < nnz) ? indices[idx] : -1;
+
+  __syncthreads();
+
+  // ... and copies it into the same position of every repeated row;
+  // `repeated_indices` points one row (nnz entries) past the originals.
+  for (index_t row = 0; row < repeat_count; ++row) {
+    index_t start_offset = row * nnz;
+    if (guard) { repeated_indices_addr[start_offset] = item; }
+  }
+}
+
+template <typename index_t, typename nnz_t>
+void gpu_repeat_csr(raft::resources const& handle,
+                    const index_t* d_indptr,
+                    const index_t* d_indices,
+                    nnz_t nnz,
+                    index_t repeat_count,
+                    index_t* d_repeated_indptr,
+                    index_t* d_repeated_indices)
+{
+  auto stream            = resource::get_cuda_stream(handle);
+  index_t repeat_csr_tpb = 256;
+  index_t grid           = (nnz + repeat_csr_tpb - 1) / (repeat_csr_tpb);
+
+  repeat_csr_kernel<<<grid, repeat_csr_tpb, 0, stream>>>(
+    d_indptr, d_indices, d_repeated_indptr, d_repeated_indices, nnz, repeat_count);
+}
+
+template <typename bitset_t,
+          typename index_t,
+          typename csr_matrix_t,
+          typename = std::enable_if_t<raft::is_device_csr_matrix_v<csr_matrix_t>>>
+void bitset_to_csr(raft::resources const& handle,
+                   raft::core::bitset_view<bitset_t, index_t> bitset,
+                   csr_matrix_t& csr)
+{
+  using row_t = typename csr_matrix_t::row_type;
+  using nnz_t = typename csr_matrix_t::nnz_type;
+
+  auto csr_view = csr.structure_view();
+
+  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0 || csr_view.get_nnz() == 0) {
+    return;
+  }
+
+  RAFT_EXPECTS(bitset.size() == csr_view.get_n_cols(),
+               "Size of the bitset must be equal to the number of columns in csr");
+
+  auto thrust_policy = resource::get_thrust_policy(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+
+  index_t* indptr  = csr_view.get_indptr().data();
+  index_t* indices = csr_view.get_indices().data();
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+
+  size_t sub_nnz_size      = 0;
+  index_t bits_per_sub_col = 0;
+
+  // Get the buffer size and the number of bits per sub-column
+  calc_nnz_by_rows(handle,
+                   bitset.data(),
+                   row_t(1),
+                   csr_view.get_n_cols(),
+                   static_cast<index_t*>(nullptr),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+
+  rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle);
+  rmm::device_uvector<index_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
+
+  calc_nnz_by_rows(handle,
+                   bitset.data(),
+                   row_t(1),
+                   csr_view.get_n_cols(),
+                   sub_nnz.data(),
+                   sub_nnz_size,
+                   bits_per_sub_col);
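+
+  // After this second pass, sub_nnz holds per-chunk non-zero counts for the
+  // single logical bitset row; the exclusive scan below turns them into
+  // offsets, leaving the row's total nnz in the last element.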
+  thrust::exclusive_scan(
+    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
+
+  index_t bitset_nnz = 0;
+  if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(
+      &bitset_nnz, sub_nnz.data() + sub_nnz_size, sizeof(index_t), cudaMemcpyDeviceToHost, stream));
+    resource::sync_stream(handle);
+    csr.initialize_sparsity(bitset_nnz * csr_view.get_n_rows());
+  } else {
+    bitset_nnz = csr_view.get_nnz() / csr_view.get_n_rows();
+  }
+  constexpr bool check_nnz = is_device_csr_sparsity_preserving_v<csr_matrix_t>;
+  fill_indices_by_rows(handle,
+                       bitset.data(),
+                       indptr,
+                       1,
+                       csr_view.get_n_cols(),
+                       csr_view.get_nnz(),
+                       indices,
+                       sub_nnz.data(),
+                       bits_per_sub_col,
+                       sub_nnz_size);
+  if (csr_view.get_n_rows() > 1) {
+    gpu_repeat_csr(handle,
+                   indptr,
+                   indices,
+                   bitset_nnz,
+                   csr_view.get_n_rows() - 1,
+                   indptr + 2,
+                   indices + bitset_nnz);
+  }
+
+  thrust::fill_n(thrust_policy,
+                 csr.get_elements().data(),
+                 csr_view.get_nnz(),
+                 typename csr_matrix_t::element_type(1));
+}
+
+};  // end NAMESPACE detail
+};  // end NAMESPACE convert
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft

diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
index 276960628d..0364daff83 100644
--- a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include <raft/sparse/convert/csr.cuh>
 #include
 #include
 #include
@@ -100,6 +101,69 @@ void masked_matmul(raft::resources const& handle,
   }
 }
 
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitset_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major>& A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major>& B,
+                   raft::core::bitset_view<bitset_t, index_t>& mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t>& C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha,
+                   std::optional<raft::host_scalar_view<output_t>> beta)
+{
+  index_t m   = A.extent(0);
+  index_t n   = B.extent(0);
+  index_t dim = A.extent(1);
+
+  auto compressed_C_view = C.structure_view();
+
+  RAFT_EXPECTS(A.extent(1) == B.extent(1), "The dim of A must be equal to the dim of B.");
+  RAFT_EXPECTS(A.extent(0) == compressed_C_view.get_n_rows(),
+               "Number of rows in C must match the number of rows in A.");
+  RAFT_EXPECTS(B.extent(0) == compressed_C_view.get_n_cols(),
+               "Number of columns in C must match the number of columns in B.");
+
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  auto C_matrix = raft::make_device_csr_matrix<output_t, index_t>(handle, compressed_C_view);
+
+  // Fill C: materialize the single-row mask into C's sparsity pattern
+  // (indptr/indices, shared through compressed_C_view), repeating it for every row.
+  raft::sparse::convert::bitset_to_csr(handle, mask, C_matrix);
+
+  // Path selection heuristic: use cuSPARSE SDDMM when there are many rows or
+  // when alpha/beta scaling is requested; otherwise use the specialized CSR dot kernel.
+  if (m > 10 || alpha.has_value() || beta.has_value()) {
+    auto C_view = raft::make_device_csr_matrix_view<output_t, index_t>(
+      C.get_elements().data(), compressed_C_view);
+
+    // create B col_major view
+    auto B_col_major = raft::make_device_matrix_view<const value_t, index_t, raft::col_major>(
+      B.data_handle(), dim, n);
+
+    output_t default_alpha = static_cast<output_t>(1.0f);
+    output_t default_beta  = static_cast<output_t>(0.0f);
+
+    if (!alpha.has_value()) { alpha = raft::make_host_scalar_view(&default_alpha); }
+    if (!beta.has_value()) { beta = raft::make_host_scalar_view(&default_beta); }
+
+    raft::sparse::linalg::sddmm(handle,
+                                A,
+                                B_col_major,
+                                C_view,
+                                raft::linalg::Operation::NON_TRANSPOSE,
+                                raft::linalg::Operation::NON_TRANSPOSE,
+                                *alpha,
+                                *beta);
+  } else {
+    raft::sparse::distance::detail::faster_dot_on_csr(handle,
+                                                      C.get_elements().data(),
+                                                      compressed_C_view.get_nnz(),
+                                                      compressed_C_view.get_indptr().data(),
+                                                      compressed_C_view.get_indices().data(),
+                                                      A.data_handle(),
+                                                      B.data_handle(),
+                                                      compressed_C_view.get_n_rows(),
+                                                      dim);
+  }
+}
+
 } // namespace detail
 } // namespace linalg
 } // namespace sparse

diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/masked_matmul.cuh
new file mode 100644
index 0000000000..288068dae2
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/masked_matmul.cuh
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/sparse/linalg/detail/masked_matmul.cuh>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @defgroup masked_matmul Masked Matrix Multiplication
+ * @{
+ */
+
+/**
+ * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise
+ * multiplication with the sparsity pattern defined by the mask, resulting in the computation
+ * C = alpha * ((A * B) ∘ spy(mask)) + beta * C.
+ *
+ * This function multiplies two dense matrices A and B, and then applies an element-wise
+ * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha
+ * and added to beta times the original matrix C.
+ *
+ * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double)
+ * @tparam output_t Data type of elements in the output matrices (e.g., float, double)
+ * @tparam index_t Type used for matrix indices
+ * @tparam nnz_t Type used for the number of non-zero entries in CSR format
+ * @tparam bitmap_t Type of the bitmap used for the mask
+ *
+ * @param[in] handle RAFT handle for resource management
+ * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k]
+ * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k]
+ * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape
+ * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is
+ * included (1) or masked out (0).
+ * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m,
+ * n]
+ * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if
+ * std::nullopt)
+ * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt)
+ */
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitmap_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
+                   raft::core::bitmap_view<bitmap_t, index_t> mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
+                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
+{
+  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
+}
+
+/**
+ * @brief Computes a sparse matrix product with a masked sparsity pattern and scaling.
+ *
+ * This function computes the result of:
+ * C = alpha * ((A * B) ∘ spy(mask)) + beta * C
+ * where:
+ * - A and B are dense input matrices.
+ * - "mask" defines the sparsity pattern for element-wise multiplication.
+ * - The result is scaled by alpha and added to beta times the original C.
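+ *
+ * Example usage (an illustrative sketch; the sizes, mask bits, and buffer fill
+ * are arbitrary, and the single-row mask semantics are described under
+ * "Special behavior of the mask" below):
+ *
+ * @code{.cpp}
+ * #include <raft/core/bitset.cuh>
+ * #include <raft/core/device_mdspan.hpp>
+ * #include <raft/sparse/linalg/masked_matmul.cuh>
+ *
+ * #include <rmm/device_uvector.hpp>
+ *
+ * using value_t  = float;
+ * using output_t = float;
+ * using index_t  = int;
+ * using nnz_t    = index_t;
+ * using bitset_t = uint32_t;
+ *
+ * raft::resources handle;
+ * auto stream = raft::resource::get_cuda_stream(handle);
+ *
+ * index_t m = 3, k = 8, n = 30;
+ * nnz_t nnz = nnz_t(4) * m;  // 4 bits set in the mask, repeated for each of m rows
+ *
+ * // Dense inputs: A is [m, k], B is [n, k], both row-major (fill before use).
+ * rmm::device_uvector<value_t> a_data(m * k, stream);
+ * rmm::device_uvector<value_t> b_data(n * k, stream);
+ * auto A = raft::make_device_matrix_view<const value_t, index_t>(a_data.data(), m, k);
+ * auto B = raft::make_device_matrix_view<const value_t, index_t>(b_data.data(), n, k);
+ *
+ * // Single-row mask with bits 1, 3, 6, 7 set.
+ * rmm::device_uvector<bitset_t> bitset_d(1, stream);
+ * std::vector<bitset_t> bitset_h = {bitset_t(0b11001010)};
+ * raft::copy(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+ * auto mask = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), n);
+ *
+ * // C only needs its arrays allocated; masked_matmul fills the sparsity
+ * // pattern from the mask before computing the products.
+ * rmm::device_uvector<index_t>  c_indptr(m + 1, stream);
+ * rmm::device_uvector<index_t>  c_indices(nnz, stream);
+ * rmm::device_uvector<output_t> c_values(nnz, stream);
+ * auto c_structure = raft::make_device_compressed_structure_view(
+ *   c_indptr.data(), c_indices.data(), m, n, nnz);
+ * auto C = raft::make_device_csr_matrix_view<output_t, index_t>(c_values.data(), c_structure);
+ *
+ * raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+ * @endcode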
+ *
+ * **Special behavior of the mask**:
+ * - The `bitset` mask represents a single row of data, with its bits indicating whether
+ *   each corresponding element in (A * B) is included (1) or masked out (0).
+ * - If the output CSR matrix `C` has multiple rows, the `bitset` is logically repeated
+ *   across all rows of `C`. For example, if `C` has `n_rows` rows, the same `bitset`
+ *   pattern is applied to all rows.
+ *
+ * @tparam value_t Data type of input matrix elements (e.g., half, float, double).
+ * @tparam output_t Data type of output matrix elements (e.g., float, double).
+ * @tparam index_t Type for matrix indices.
+ * @tparam nnz_t Type for non-zero entries in CSR format.
+ * @tparam bitset_t Type for the bitset mask.
+ *
+ * @param[in] handle RAFT handle for managing resources.
+ * @param[in] A Dense input matrix [m, k] (row-major).
+ * @param[in] B Dense input matrix [n, k] (row-major).
+ * @param[in] mask Bitset view representing a single row [1, n], where each bit
+ *                 indicates if the corresponding element in (A * B) is included (1)
+ *                 or masked out (0). The pattern is repeated for all rows of `C`.
+ * @param[inout] C Output sparse matrix in CSR format [m, n].
+ * @param[in] alpha Scalar multiplier for (A * B) (default: 1.0 if std::nullopt).
+ * @param[in] beta Scalar multiplier for the initial C (default: 0 if std::nullopt).
+ */
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitset_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
+                   raft::core::bitset_view<bitset_t, index_t> mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
+                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
+{
+  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
+}
+
+/** @} */  // end of masked_matmul
+
+} // end namespace linalg
+} // end namespace sparse
+} // end namespace raft

diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.hpp b/cpp/include/raft/sparse/linalg/masked_matmul.hpp
index 6cf6e834b9..32322b90f6 100644
--- a/cpp/include/raft/sparse/linalg/masked_matmul.hpp
+++ b/cpp/include/raft/sparse/linalg/masked_matmul.hpp
@@ -13,60 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
-
-#include <raft/sparse/linalg/detail/masked_matmul.cuh>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
 /**
- * @defgroup masked_matmul Masked Matrix Multiplication
- * @{
+ * This file is deprecated and will be removed in a future release.
+ * Please use the cuh version instead.
  */
 
 /**
- * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise
- * multiplication with the sparsity pattern defined by the mask, resulting in the computation
- * C = alpha * ((A * B) ∘ spy(mask)) + beta * C.
- *
- * This function multiplies two dense matrices A and B, and then applies an element-wise
- * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha
- * and added to beta times the original matrix C.
- * - * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double) - * @tparam output_t Data type of elements in the output matrices (e.g., float, double) - * @tparam index_t Type used for matrix indices - * @tparam nnz_t Type used for the number of non-zero entries in CSR format - * @tparam bitmap_t Type of the bitmap used for the mask - * - * @param[in] handle RAFT handle for resource management - * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k] - * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k] - * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape - * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is - * included (1) or masked out (0). - * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m, - * n] - * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if - * std::nullopt) - * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt) + * DISCLAIMER: this file is deprecated: use masked_matmul.cuh instead */ -template -void masked_matmul(raft::resources const& handle, - raft::device_matrix_view A, - raft::device_matrix_view B, - raft::core::bitmap_view mask, - raft::device_csr_matrix_view C, - std::optional> alpha = std::nullopt, - std::optional> beta = std::nullopt) -{ - detail::masked_matmul(handle, A, B, mask, C, alpha, beta); -} -/** @} */ // end of masked_matmul +#pragma once + +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") +#endif -} // end namespace linalg -} // end namespace sparse -} // end namespace raft +#include diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index c1a495ea3d..eed9262a17 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -17,6 +17,7 @@ #include "../test_utils.cuh" #include +#include #include #include #include @@ -477,5 +478,289 @@ INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, BitmapToCSRTestLOnLargeSize, ::testing::ValuesIn(bitmaptocsr_large_inputs)); +/******************************** bitset to csr ********************************/ + +template +struct BitsetToCSRInputs { + index_t n_repeat; + index_t n_cols; + float sparsity; + bool owning; +}; + +template +class BitsetToCSRTest : public ::testing::TestWithParam> { + public: + BitsetToCSRTest() + : stream(resource::get_cuda_stream(handle)), + params(::testing::TestWithParam>::GetParam()), + bitset_d(0, stream), + indices_d(0, stream), + indptr_d(0, stream), + values_d(0, stream), + indptr_expected_d(0, stream), + indices_expected_d(0, stream), + values_expected_d(0, stream) + { + } + + protected: + void repeat_cpu_bitset(std::vector& input, + size_t input_bits, + size_t repeat, + std::vector& output) + { + const size_t output_bits = input_bits * repeat; + const size_t output_units = (output_bits + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8); + + std::memset(output.data(), 0, output_units * sizeof(bitset_t)); + + size_t output_bit_index = 0; + + for (size_t r = 0; r < repeat; ++r) { + for (size_t i = 0; i < input_bits; ++i) { + size_t input_unit_index = i / (sizeof(bitset_t) * 8); + size_t input_bit_offset = i % (sizeof(bitset_t) * 8); + bool bit = (input[input_unit_index] >> input_bit_offset) & 1; + + size_t output_unit_index = 
output_bit_index / (sizeof(bitset_t) * 8); + size_t output_bit_offset = output_bit_index % (sizeof(bitset_t) * 8); + + output[output_unit_index] |= (static_cast(bit) << output_bit_offset); + + ++output_bit_index; + } + } + } + + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitset) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitset) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bitset_t& element = bitset[index / (8 * sizeof(bitset_t))]; + index_t bit_position = index % (8 * sizeof(bitset_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void cpu_convert_to_csr(std::vector& bitset, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + index_t index = 0; + bitset_t element = 0; + index_t bit_position = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + index = i * cols + j; + element = bitset[index / (8 * sizeof(bitset_t))]; + bit_position = index % (8 * sizeof(bitset_t)); + + if (((element >> bit_position) & 1)) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + bool csr_compare(const std::vector& row_ptrs1, + const std::vector& col_indices1, + const std::vector& row_ptrs2, + const std::vector& col_indices2) + { + if (row_ptrs1.size() != row_ptrs2.size()) { return false; } + + if (col_indices1.size() != col_indices2.size()) { return false; } + + if (!std::equal(row_ptrs1.begin(), row_ptrs1.end(), row_ptrs2.begin())) { return false; } + + for (size_t i = 0; i < row_ptrs1.size() - 1; ++i) { + size_t start_idx = row_ptrs1[i]; + size_t end_idx = row_ptrs1[i + 1]; + + std::vector cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx); + std::vector cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx); + + std::sort(cols1.begin(), cols1.end()); + std::sort(cols2.begin(), cols2.end()); + + if (cols1 != cols2) { return false; } + } + + return true; + } + + void SetUp() override + { + index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8)); + std::vector bitset_h(element); + std::vector bitset_repeat_h(element * params.n_repeat); + + nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h); + + repeat_cpu_bitset(bitset_h, size_t(params.n_cols), size_t(params.n_repeat), bitset_repeat_h); + nnz *= params.n_repeat; + + std::vector indices_h(nnz); + std::vector indptr_h(params.n_repeat + 1); + + cpu_convert_to_csr(bitset_repeat_h, params.n_repeat, params.n_cols, indices_h, indptr_h); + + bitset_d.resize(bitset_h.size(), stream); + indptr_d.resize(params.n_repeat + 1, stream); + indices_d.resize(nnz, stream); + + indptr_expected_d.resize(params.n_repeat + 1, stream); + indices_expected_d.resize(nnz, stream); + values_expected_d.resize(nnz, stream); + + thrust::fill_n(resource::get_thrust_policy(handle), values_expected_d.data(), nnz, value_t{1}); + + values_d.resize(nnz, stream); + + update_device(indices_expected_d.data(), indices_h.data(), indices_h.size(), stream); + update_device(indptr_expected_d.data(), 
indptr_h.data(), indptr_h.size(), stream); + update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream); + + resource::sync_stream(handle); + } + + void Run() + { + auto bitset = raft::core::bitset_view(bitset_d.data(), params.n_cols); + + if (params.owning) { + auto csr = + raft::make_device_csr_matrix(handle, params.n_repeat, params.n_cols, nnz); + auto csr_view = csr.structure_view(); + + convert::bitset_to_csr(handle, bitset, csr); + raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream); + raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream); + raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream); + } else { + auto csr_view = raft::make_device_compressed_structure_view( + indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz); + auto csr = raft::make_device_csr_matrix(handle, csr_view); + + convert::bitset_to_csr(handle, bitset, csr); + raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream); + } + resource::sync_stream(handle); + + std::vector indices_h(indices_expected_d.size(), 0); + std::vector indices_expected_h(indices_expected_d.size(), 0); + update_host(indices_h.data(), indices_d.data(), indices_h.size(), stream); + update_host(indices_expected_h.data(), indices_expected_d.data(), indices_h.size(), stream); + + std::vector indptr_h(indptr_expected_d.size(), 0); + std::vector indptr_expected_h(indptr_expected_d.size(), 0); + update_host(indptr_h.data(), indptr_d.data(), indptr_h.size(), stream); + update_host(indptr_expected_h.data(), indptr_expected_d.data(), indptr_h.size(), stream); + + resource::sync_stream(handle); + + ASSERT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h)); + ASSERT_TRUE(raft::devArrMatch( + values_expected_d.data(), values_d.data(), nnz, raft::Compare(), stream)); + } + + protected: + raft::resources handle; + cudaStream_t stream; + + BitsetToCSRInputs params; + + rmm::device_uvector bitset_d; + + index_t nnz; + + rmm::device_uvector indptr_d; + rmm::device_uvector indices_d; + rmm::device_uvector values_d; + + rmm::device_uvector indptr_expected_d; + rmm::device_uvector indices_expected_d; + rmm::device_uvector values_expected_d; +}; + +using BitsetToCSRTestI = BitsetToCSRTest; +TEST_P(BitsetToCSRTestI, Result) { Run(); } + +using BitsetToCSRTestL = BitsetToCSRTest; +TEST_P(BitsetToCSRTestL, Result) { Run(); } + +using BitsetToCSRTestLOnLargeSize = BitsetToCSRTest; +TEST_P(BitsetToCSRTestLOnLargeSize, Result) { Run(); } + +template +const std::vector> bitsettocsr_inputs = { + {0, 0, 0.2, false}, + {10, 32, 0.4, false}, + {10, 3, 0.2, false}, + {32, 1024, 0.4, false}, + {1024, 1048576, 0.01, false}, + {1024, 1024, 0.4, false}, + {64 * 1024 + 10, 2, 0.3, false}, // 64K + 10 is slightly over maximum of blockDim.y + {16, 16, 0.3, false}, // No peeling-remainder + {17, 16, 0.3, false}, // Check peeling-remainder + {18, 16, 0.3, false}, // Check peeling-remainder + {32 + 9, 33, 0.2, false}, // Check peeling-remainder + {2, 33, 0.2, false}, // Check peeling-remainder + {0, 0, 0.2, true}, + {10, 32, 0.4, true}, + {10, 3, 0.2, true}, + {32, 1024, 0.4, true}, + {1024, 1048576, 0.01, true}, + {1024, 1024, 0.4, true}, + {64 * 1024 + 10, 2, 0.3, true}, // 64K + 10 is slightly over maximum of blockDim.y + {16, 16, 0.3, true}, // No peeling-remainder + {17, 16, 0.3, true}, // Check peeling-remainder + {18, 16, 0.3, true}, // Check peeling-remainder + {32 + 9, 33, 0.2, true}, // Check peeling-remainder + {2, 33, 0.2, 
true}, // Check peeling-remainder +}; + +template +const std::vector> bitsettocsr_large_inputs = { + {100, 100000000, 0.01, true}, {100, 100000000, 0.05, false}, {100, 100000000 + 17, 0.05, false}}; + +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + BitsetToCSRTestI, + ::testing::ValuesIn(bitsettocsr_inputs)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + BitsetToCSRTestL, + ::testing::ValuesIn(bitsettocsr_inputs)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + BitsetToCSRTestLOnLargeSize, + ::testing::ValuesIn(bitsettocsr_large_inputs)); + } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/masked_matmul.cu b/cpp/test/sparse/masked_matmul.cu index f883beae32..5ee1677015 100644 --- a/cpp/test/sparse/masked_matmul.cu +++ b/cpp/test/sparse/masked_matmul.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -46,6 +46,8 @@ struct MaskedMatmulInputs { unsigned long long int seed; }; +enum class BitsLayout { Bitset, Bitmap }; + template struct sum_abs_op { __host__ __device__ value_t operator()(const value_t& x, const value_t& y) const @@ -87,7 +89,8 @@ bool isCuSparseVersionGreaterThan_12_0_1() template class MaskedMatmulTest @@ -98,7 +101,7 @@ class MaskedMatmulTest stream(resource::get_cuda_stream(handle)), a_data_d(0, resource::get_cuda_stream(handle)), b_data_d(0, resource::get_cuda_stream(handle)), - bitmap_d(0, resource::get_cuda_stream(handle)), + bits_d(0, resource::get_cuda_stream(handle)), c_indptr_d(0, resource::get_cuda_stream(handle)), c_indices_d(0, resource::get_cuda_stream(handle)), c_data_d(0, resource::get_cuda_stream(handle)), @@ -107,14 +110,14 @@ class MaskedMatmulTest } protected: - index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitmap) + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bits) { index_t total = static_cast(m * n); index_t num_ones = static_cast((total * 1.0f) * sparsity); index_t res = num_ones; - for (auto& item : bitmap) { - item = static_cast(0); + for (auto& item : bits) { + item = static_cast(0); } std::random_device rd; @@ -124,8 +127,8 @@ class MaskedMatmulTest while (num_ones > 0) { index_t index = dis(gen); - bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))]; - index_t bit_position = index % (8 * sizeof(bitmap_t)); + bits_t& element = bits[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); if (((element >> bit_position) & 1) == 0) { element |= (static_cast(1) << bit_position); @@ -135,7 +138,27 @@ class MaskedMatmulTest return res; } - void cpu_convert_to_csr(std::vector& bitmap, + void repeat_cpu_bitset_inplace(std::vector& inout, size_t input_bits, size_t repeat) + { + size_t output_bit_index = input_bits; + + for (size_t r = 0; r < repeat; ++r) { + for (size_t i = 0; i < input_bits; ++i) { + size_t input_unit_index = i / (sizeof(bits_t) * 8); + size_t input_bit_offset = i % (sizeof(bits_t) * 8); + bool bit = (inout[input_unit_index] >> input_bit_offset) & 1; + + size_t output_unit_index = output_bit_index / (sizeof(bits_t) * 8); + size_t output_bit_offset = output_bit_index % (sizeof(bits_t) * 8); + + inout[output_unit_index] |= (static_cast(bit) << output_bit_offset); + + ++output_bit_index; + } + } + } + + void cpu_convert_to_csr(std::vector& bits, index_t rows, index_t cols, std::vector& indices, @@ -146,14 +169,14 @@ class MaskedMatmulTest indptr[offset_indptr++] = 0; index_t index = 0; - bitmap_t element = 0; + bits_t element = 0; index_t bit_position = 0; for (index_t i = 0; 
i < rows; ++i) { for (index_t j = 0; j < cols; ++j) { index = i * cols + j; - element = bitmap[index / (8 * sizeof(bitmap_t))]; - bit_position = index % (8 * sizeof(bitmap_t)); + element = bits[index / (8 * sizeof(bits_t))]; + bit_position = index % (8 * sizeof(bits_t)); if (((element >> bit_position) & 1)) { indices[offset_values] = static_cast(j); @@ -201,15 +224,17 @@ class MaskedMatmulTest index_t b_size = params.k * params.n; index_t c_size = params.m * params.n; - index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bitmap_t) * 8)); - std::vector bitmap_h(element); + index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bits_t) * 8)); + std::vector bits_h(element); + + std::memset(bits_h.data(), 0, bits_h.size() * sizeof(bits_t)); std::vector a_data_h(a_size); std::vector b_data_h(b_size); a_data_d.resize(a_size, stream); b_data_d.resize(b_size, stream); - bitmap_d.resize(bitmap_h.size(), stream); + bits_d.resize(bits_h.size(), stream); auto blobs_a_b = raft::make_device_matrix(handle, 1, a_size + b_size); auto labels = raft::make_device_vector(handle, 1); @@ -262,18 +287,27 @@ class MaskedMatmulTest resource::sync_stream(handle); - index_t c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h); + index_t c_true_nnz = 0; + if constexpr (bits_layout == BitsLayout::Bitmap) { + c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h); + } else if constexpr (bits_layout == BitsLayout::Bitset) { + c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h); + repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1); + c_true_nnz *= params.m; + } else { + GTEST_SKIP() << "Unsupported BitsLayout!"; + } std::vector c_indptr_h(params.m + 1); std::vector c_indices_h(c_true_nnz); std::vector c_data_h(c_true_nnz); - cpu_convert_to_csr(bitmap_h, params.m, params.n, c_indices_h, c_indptr_h); + cpu_convert_to_csr(bits_h, params.m, params.n, c_indices_h, c_indptr_h); c_data_d.resize(c_data_h.size(), stream); update_device(c_data_d.data(), c_data_h.data(), c_data_h.size(), stream); - update_device(bitmap_d.data(), bitmap_h.data(), bitmap_h.size(), stream); + update_device(bits_d.data(), bits_h.data(), bits_h.size(), stream); resource::sync_stream(handle); cpu_sddmm(a_data_h, b_data_h, c_data_h, c_indices_h, c_indptr_h, true, true); @@ -304,9 +338,6 @@ class MaskedMatmulTest auto B = raft::make_device_matrix_view(b_data_d.data(), params.n, params.k); - auto mask = - raft::core::bitmap_view(bitmap_d.data(), params.m, params.n); - auto c_structure = raft::make_device_compressed_structure_view( c_indptr_d.data(), c_indices_d.data(), @@ -316,7 +347,15 @@ class MaskedMatmulTest auto C = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); - raft::sparse::linalg::masked_matmul(handle, A, B, mask, C); + if constexpr (bits_layout == BitsLayout::Bitmap) { + auto mask = raft::core::bitmap_view(bits_d.data(), params.m, params.n); + raft::sparse::linalg::masked_matmul(handle, A, B, mask, C); + } else if constexpr (bits_layout == BitsLayout::Bitset) { + auto mask = raft::core::bitset_view(bits_d.data(), params.n); + raft::sparse::linalg::masked_matmul(handle, A, B, mask, C); + } else { + GTEST_SKIP() << "Unsupported BitsLayout!"; + } resource::sync_stream(handle); @@ -344,7 +383,7 @@ class MaskedMatmulTest rmm::device_uvector a_data_d; rmm::device_uvector b_data_d; - rmm::device_uvector bitmap_d; + rmm::device_uvector bits_d; rmm::device_uvector c_indptr_d; rmm::device_uvector c_indices_d; @@ -353,14 +392,23 @@ 
class MaskedMatmulTest rmm::device_uvector c_expected_data_d; }; -using MaskedMatmulTestF = MaskedMatmulTest; -TEST_P(MaskedMatmulTestF, Result) { Run(); } +using MaskedMatmulOnBitmapTestF = MaskedMatmulTest; +TEST_P(MaskedMatmulOnBitmapTestF, Result) { Run(); } + +using MaskedMatmulOnBitmapTestD = MaskedMatmulTest; +TEST_P(MaskedMatmulOnBitmapTestD, Result) { Run(); } -using MaskedMatmulTestD = MaskedMatmulTest; -TEST_P(MaskedMatmulTestD, Result) { Run(); } +using MaskedMatmulOnBitmapTestH = MaskedMatmulTest; +TEST_P(MaskedMatmulOnBitmapTestH, Result) { Run(); } -using MaskedMatmulTestH = MaskedMatmulTest; -TEST_P(MaskedMatmulTestH, Result) { Run(); } +using MaskedMatmulOnBitsetTestF = MaskedMatmulTest; +TEST_P(MaskedMatmulOnBitsetTestF, Result) { Run(); } + +using MaskedMatmulOnBitsetTestD = MaskedMatmulTest; +TEST_P(MaskedMatmulOnBitsetTestD, Result) { Run(); } + +using MaskedMatmulOnBitsetTestH = MaskedMatmulTest; +TEST_P(MaskedMatmulOnBitsetTestH, Result) { Run(); } const std::vector> sddmm_inputs_f = { {0.001f, 2, 255, 1023, 0.19, 1234ULL}, @@ -419,11 +467,29 @@ const std::vector> sddmm_inputs_h = { {0.0003f, 31, 1025, 1025, 0.19, 1234ULL}, {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}}; -INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestF, ::testing::ValuesIn(sddmm_inputs_f)); +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, + MaskedMatmulOnBitmapTestF, + ::testing::ValuesIn(sddmm_inputs_f)); + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, + MaskedMatmulOnBitmapTestD, + ::testing::ValuesIn(sddmm_inputs_d)); + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, + MaskedMatmulOnBitmapTestH, + ::testing::ValuesIn(sddmm_inputs_h)); + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, + MaskedMatmulOnBitsetTestF, + ::testing::ValuesIn(sddmm_inputs_f)); -INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestD, ::testing::ValuesIn(sddmm_inputs_d)); +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, + MaskedMatmulOnBitsetTestD, + ::testing::ValuesIn(sddmm_inputs_d)); -INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestH, ::testing::ValuesIn(sddmm_inputs_h)); +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, + MaskedMatmulOnBitsetTestH, + ::testing::ValuesIn(sddmm_inputs_h)); } // namespace sparse } // namespace raft