Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor/reuse cmma matrices #107

Closed
wants to merge 80 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
80 commits
Select commit Hold shift + click to select a range
b7064e1
minor refactor
louisfd Aug 20, 2024
9527d3f
change accumulators for sequence
louisfd Aug 20, 2024
eb22cda
add failing test
Aug 21, 2024
e9d473d
Merge branch 'main' into refactor/cmma_generalize
Aug 21, 2024
72b3f89
wip
louisfd Aug 27, 2024
700c0cf
:wq Merge branch 'main' of github.com:tracel-ai/cubecl
louisfd Aug 27, 2024
55b5fd3
Merge branch 'main' into refactor/cmma_generalize
louisfd Aug 27, 2024
ec83d3c
wip
louisfd Aug 27, 2024
7a4f3e4
wip
louisfd Aug 27, 2024
5ed5caa
wip
Aug 27, 2024
f8aa418
wip
louisfd Aug 27, 2024
335e4c2
wip
louisfd Aug 27, 2024
6d20a18
wip
louisfd Aug 27, 2024
da02986
wip
Aug 27, 2024
9e917d5
coop and lane independent from unit pos
louisfd Aug 27, 2024
561f71c
custom block size
louisfd Aug 27, 2024
9a6fc84
num accumulators
louisfd Aug 28, 2024
6dbf866
fix k loop test
Aug 28, 2024
3aacdf6
allowing any config wip
louisfd Aug 28, 2024
c55dd64
merge
louisfd Aug 28, 2024
b6d778d
generalize fragment to sm
louisfd Aug 28, 2024
e37d9cd
Merge branch 'main' of github.com:tracel-ai/cubecl into refactor/cmma…
Aug 28, 2024
c7abc89
Merge branch 'refactor/cmma_generalize' of github.com:tracel-ai/cubec…
Aug 28, 2024
5831bd1
sm max in bytes
louisfd Aug 28, 2024
730e190
wip
louisfd Aug 29, 2024
644b4ea
Merge branch 'refactor/cmma_generalize' of github.com:tracel-ai/cubec…
Aug 29, 2024
0349242
add index of error
Aug 29, 2024
def320f
refactor load and write tests
louisfd Aug 30, 2024
99dc7dc
refactor compute loop test
louisfd Aug 30, 2024
f71a959
Merge branch 'refactor/cmma_generalize' of github.com:tracel-ai/cubec…
louisfd Aug 30, 2024
bfca4fa
Merge branch 'main' of github.com:tracel-ai/cubecl
louisfd Aug 30, 2024
daa39f5
Merge branch 'main' into refactor/cmma_generalize
louisfd Aug 30, 2024
829d50f
add vec1
louisfd Aug 30, 2024
0f6b146
vec tests
louisfd Aug 30, 2024
9772a3a
unhardcode
louisfd Aug 30, 2024
a73158d
wip refactor only two degrees of liberty
louisfd Aug 30, 2024
e9cbeca
block config
louisfd Sep 4, 2024
3a531d8
add tests
louisfd Sep 4, 2024
e1ea5e0
testing alternate block sizes
Sep 4, 2024
348953f
fix write
louisfd Sep 4, 2024
1f2b62f
played with tests
Sep 4, 2024
668ba03
ignore failing test
Sep 4, 2024
ef0e746
Merge branch 'main' of github.com:tracel-ai/cubecl
louisfd Sep 4, 2024
c343f66
Merge branch 'main' into refactor/cmma_generalize
louisfd Sep 4, 2024
49683cb
fmt
louisfd Sep 4, 2024
acb9285
fix
louisfd Sep 5, 2024
18a115a
back to using unit pos directly
louisfd Sep 5, 2024
a0db0e6
refactor vec
louisfd Sep 5, 2024
2f64b5d
fix equation
Sep 5, 2024
c97651d
reused smem
louisfd Sep 6, 2024
ff9cb68
Merge branch 'refactor/cmma_generalize' into feat/reuse_out_smem
Sep 6, 2024
e1cb240
works
Sep 6, 2024
6abeaaf
refactor wip
louisfd Sep 6, 2024
68e921a
wip refactor
louisfd Sep 6, 2024
a87e0f1
wip refactor runtime info
louisfd Sep 6, 2024
68ae416
runtime info wip
louisfd Sep 6, 2024
9e6bbb2
fix mixed args
Sep 6, 2024
a7e2ed3
complete runtime info refactor
louisfd Sep 6, 2024
ecfc8c4
still a bug when b_k>16
Sep 6, 2024
71414af
clippy
louisfd Sep 6, 2024
0b6aeb7
merge main
louisfd Sep 6, 2024
69d5e24
rename confusing lane_dim
louisfd Sep 6, 2024
e2e4283
fix mistake
louisfd Sep 9, 2024
b99c945
little refactor
Sep 9, 2024
5a9eb11
little refactor
Sep 9, 2024
87c3d7c
fix 32x32 test
Sep 9, 2024
6275052
fmt
louisfd Sep 9, 2024
0f180fc
merge main
louisfd Sep 9, 2024
903a84d
fix merge
louisfd Sep 9, 2024
07a31af
new only at beginning
louisfd Sep 9, 2024
e76a211
minor
Sep 9, 2024
6e83f33
ignore failing
Sep 9, 2024
9de899a
remove messy things from topology flaky test
louisfd Sep 9, 2024
b583675
Merge pull request #106 from tracel-ai/fix/topology_flaky_test
louisfd Sep 9, 2024
e0b59c9
Merge branch 'main' into feat/reuse_out_smem
louisfd Sep 9, 2024
e5865b5
Merge branch 'feat/reuse_out_smem' of github.com:tracel-ai/cubecl int…
louisfd Sep 9, 2024
d90d529
Merge pull request #101 from tracel-ai/feat/reuse_out_smem
louisfd Sep 9, 2024
46e44f2
Merge branch 'main' into refactor/reuse_cmma_matrices
louisfd Sep 9, 2024
d4d3d55
Merge branch 'main' into refactor/reuse_cmma_matrices
louisfd Sep 9, 2024
31596c5
Merge branch 'refactor/reuse_cmma_matrices' of github.com:tracel-ai/c…
louisfd Sep 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/cubecl-core/src/codegen/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ pub trait Compiler: Sync + Send + 'static + Clone + Default + core::fmt::Debug {
fn compile(kernel: KernelDefinition, mode: ExecutionMode) -> Self::Representation;
/// The size of the given element in bytes.
fn elem_size(elem: Elem) -> usize;
/// The maximal size of a shared memory
/// The maximal size of a shared memory, in bytes
fn max_shared_memory_size() -> usize;
}
1 change: 1 addition & 0 deletions crates/cubecl-core/src/frontend/sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use std::{cell::RefCell, rc::Rc};
/// All methods [push](Sequence::push), [index](Sequence::index) and
/// [into_iter](Sequence::into_iter) are executed _during_ compilation and don't add any overhead
/// on the generated kernel.
#[derive(Debug, Clone)]
pub struct Sequence<T: CubeType> {
values: Vec<T>,
}
Expand Down
2 changes: 1 addition & 1 deletion crates/cubecl-core/src/ir/kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ pub struct CubeDim {
}

impl CubeDim {
pub(crate) fn num_elems(&self) -> u32 {
pub fn num_elems(&self) -> u32 {
self.x * self.y * self.z
}
}
Expand Down
14 changes: 3 additions & 11 deletions crates/cubecl-core/src/runtime_tests/topology.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,33 @@ use crate as cubecl;
use cubecl::prelude::*;

#[cube(launch)]
pub fn kernel_absolute_pos(output1: &mut Array<UInt>, output2: &mut Array<UInt>) {
pub fn kernel_absolute_pos(output1: &mut Array<UInt>) {
if ABSOLUTE_POS >= output1.len() {
return;
}

output1[ABSOLUTE_POS] = ABSOLUTE_POS;
output2[ABSOLUTE_POS] = ABSOLUTE_POS;
}

pub fn test_kernel_topology_absolute_pos<R: Runtime>(client: ComputeClient<R::Server, R::Channel>) {
let cube_count = (3, 5, 7);
let cube_dim = (16, 16, 1);
let extra: u32 = 3u32;

let length =
(cube_count.0 * cube_count.1 * cube_count.2 * cube_dim.0 * cube_dim.1 * cube_dim.2) + extra;
let length = cube_count.0 * cube_count.1 * cube_count.2 * cube_dim.0 * cube_dim.1 * cube_dim.2;
let handle1 = client.empty(length as usize * core::mem::size_of::<u32>());
let handle2 = client.empty(length as usize * core::mem::size_of::<u32>());

unsafe {
kernel_absolute_pos::launch::<R>(
&client,
CubeCount::Static(cube_count.0, cube_count.1, cube_count.2),
CubeDim::new(cube_dim.0, cube_dim.1, cube_dim.2),
ArrayArg::from_raw_parts(&handle1, length as usize, 1),
ArrayArg::from_raw_parts(&handle2, length as usize, 1),
)
};

let actual = client.read(handle1.binding());
let actual = u32::from_bytes(&actual);
let mut expect: Vec<u32> = (0..length - extra).collect();
expect.push(0);
expect.push(0);
expect.push(0);
let expect: Vec<u32> = (0..length).collect();

assert_eq!(actual, &expect);
}
Expand Down
3 changes: 1 addition & 2 deletions crates/cubecl-cuda/src/compiler/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ impl Compiler for CudaCompiler {
}

fn max_shared_memory_size() -> usize {
// TODO: Find out this value.
usize::MAX
49152
}
}

Expand Down
135 changes: 87 additions & 48 deletions crates/cubecl-linalg/src/matmul/cmma/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,7 @@ use cubecl_core as cubecl;
use cubecl_core::prelude::*;

use super::block_loop::block_loop;
use super::config::CmmaConfig;

#[cube(launch_unchecked)]
#[allow(unused_mut)]
pub fn cmma_kernel<F: Float, FC: Float>(
lhs: &Tensor<F>,
rhs: &Tensor<F>,
out: &mut Tensor<F>,
config: Comptime<CmmaConfig>,
) {
let dims = get_dims::<F>(lhs, rhs);
let offsets = calculate_offsets::<F>(lhs, rhs, out, config);
let shared_memories = make_shared_memories::<FC>(config);
let accumulate = make_accumulators::<F>();
block_loop::<F, FC>(
lhs,
rhs,
out,
offsets,
shared_memories,
accumulate,
config,
dims,
);
}
use super::config::ComptimeCmmaInfo;

#[derive(CubeType, Copy, Clone)]
pub(crate) struct Dimensions {
Expand All @@ -36,15 +12,22 @@ pub(crate) struct Dimensions {
}

#[derive(CubeType, Copy, Clone)]
pub(crate) struct SharedMemories<FC: Float> {
pub lhs: SharedMemory<FC>,
pub rhs: SharedMemory<FC>,
pub(crate) struct Ids {
pub coop: UInt,
pub lane: UInt,
}

#[derive(CubeType, Copy, Clone)]
pub(crate) struct Accumulators<F: Float> {
pub first: cmma::Matrix<F>,
pub second: cmma::Matrix<F>,
pub(crate) struct RuntimeCmmaInfo {
pub ids: Ids,
pub dims: Dimensions,
pub offsets: Offsets,
}

#[derive(CubeType, Copy, Clone)]
pub(crate) struct SharedMemories<FC: Float> {
pub lhs: SharedMemory<FC>,
pub rhs: SharedMemory<FC>,
}

#[derive(CubeType, Copy, Clone)]
Expand All @@ -57,7 +40,39 @@ pub(crate) struct Offsets {
pub batch_out: UInt,
pub cube_row: UInt,
pub cube_col: UInt,
pub k: UInt,
}

#[derive(CubeType)]
pub(crate) struct CmmaMatrices<F: Float, FC: Float> {
pub accumulators: Sequence<cmma::Matrix<F>>,
pub lhs: cmma::Matrix<FC>,
pub rhs: cmma::Matrix<FC>,
}

#[cube(launch_unchecked)]
#[allow(unused_mut)]
pub fn cmma_kernel<F: Float, FC: Float>(
lhs: &Tensor<F>,
rhs: &Tensor<F>,
out: &mut Tensor<F>,
comptime_info: Comptime<ComptimeCmmaInfo>,
) {
let ids = get_ids();
let dims = get_dims::<F>(lhs, rhs);
let offsets = calculate_offsets::<F>(lhs, rhs, out, comptime_info);
let runtime_info = RuntimeCmmaInfo { ids, dims, offsets };

let shared_memories = make_shared_memories::<FC>(comptime_info);
let cmma_matrices = make_cmma_matrices::<F, FC>(comptime_info);
block_loop::<F, FC>(
lhs,
rhs,
out,
shared_memories,
cmma_matrices,
runtime_info,
comptime_info,
);
}

#[cube]
Expand All @@ -77,7 +92,7 @@ fn calculate_offsets<F: Float>(
lhs: &Tensor<F>,
rhs: &Tensor<F>,
out: &Tensor<F>,
config: Comptime<CmmaConfig>,
config: Comptime<ComptimeCmmaInfo>,
) -> Offsets {
let block_size_m = Comptime::map(config, |c| c.block_size_m);
let block_size_n = Comptime::map(config, |c| c.block_size_n);
Expand Down Expand Up @@ -109,12 +124,11 @@ fn calculate_offsets<F: Float>(
batch_out,
cube_row,
cube_col,
k: UInt::new(0), // Changes during kernel
}
}

#[cube]
fn make_shared_memories<FC: Float>(config: Comptime<CmmaConfig>) -> SharedMemories<FC> {
fn make_shared_memories<FC: Float>(config: Comptime<ComptimeCmmaInfo>) -> SharedMemories<FC> {
let block_size_m = Comptime::map(config, |c| c.block_size_m);
let block_size_k = Comptime::map(config, |c| c.block_size_k);
let block_size_n = Comptime::map(config, |c| c.block_size_n);
Expand All @@ -126,28 +140,53 @@ fn make_shared_memories<FC: Float>(config: Comptime<CmmaConfig>) -> SharedMemori
}

#[cube]
pub(crate) fn make_accumulators<F: Float>() -> Accumulators<F> {
// Assumes two per warp. TODO generalize
let acc0 = cmma::Matrix::<F>::new(
cmma::MatrixIdent::Accumulator,
pub(crate) fn make_cmma_matrices<F: Float, FC: Float>(
config: Comptime<ComptimeCmmaInfo>,
) -> CmmaMatrices<F, FC> {
let num_accumulators = Comptime::map(config, |c| c.num_accumulators);
let mut accumulators = Sequence::<cmma::Matrix<F>>::new();

for _ in range(0u32, Comptime::get(num_accumulators), Comptime::new(true)) {
let acc = cmma::Matrix::<F>::new(
cmma::MatrixIdent::Accumulator,
16,
16,
16,
cmma::MatrixLayout::Undefined,
);

cmma::fill::<F>(&acc, F::new(0.0));

accumulators.push(acc);
}

let lhs = cmma::Matrix::<FC>::new(
cmma::MatrixIdent::A,
16,
16,
16,
cmma::MatrixLayout::Undefined,
cmma::MatrixLayout::RowMajor,
);
let acc1 = cmma::Matrix::<F>::new(
cmma::MatrixIdent::Accumulator,

let rhs = cmma::Matrix::<FC>::new(
cmma::MatrixIdent::B,
16,
16,
16,
cmma::MatrixLayout::Undefined,
cmma::MatrixLayout::RowMajor,
);

cmma::fill::<F>(&acc0, F::new(0.0));
cmma::fill::<F>(&acc1, F::new(0.0));
CmmaMatrices {
accumulators,
lhs,
rhs,
}
}

Accumulators {
first: acc0,
second: acc1,
#[cube]
fn get_ids() -> Ids {
Ids {
coop: UNIT_POS_Y,
lane: UNIT_POS_X,
}
}
3 changes: 0 additions & 3 deletions crates/cubecl-linalg/src/matmul/cmma/block_io/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use cubecl_core as cubecl;
use cubecl_core::prelude::*;

use crate::matmul::cmma::base::Dimensions;
use crate::matmul::cmma::config::CmmaConfig;

#[cube]
pub(crate) trait BlockLoader<F: Float, FC: Float>: Send + Sync + 'static {
Expand All @@ -24,12 +23,10 @@ pub(crate) trait BlockWriter<F: Float>: Send + Sync + 'static {
fn write_output(
out: &mut Tensor<F>,
accumulator_sm: SharedMemory<F>,
n_iter: UInt,
batch_offset: UInt,
read_position: UInt,
write_row: UInt,
write_col: UInt,
dims: Dimensions,
config: Comptime<CmmaConfig>,
);
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use cubecl_core as cubecl;
use cubecl_core::prelude::*;

use crate::matmul::cmma::{base::Dimensions, config::CmmaConfig};
use crate::matmul::cmma::base::Dimensions;

use super::base::{BlockLoader, BlockWriter};

Expand All @@ -21,13 +21,18 @@ impl<F: Float, FC: Float> BlockLoader<F, FC> for HorizontalCheckBlockIO {
) {
let tensor_vec = Comptime::vectorization(tensor);
let tensor_vec_r = Comptime::runtime(tensor_vec);
let is_scalar = Comptime::map(tensor_vec, |v| v.val == 1);

if read_col < dim_horizontal {
let read_pos = (batch_offset + read_row * dim_horizontal + read_col) / tensor_vec_r;
let value = tensor[read_pos];

for i in range(0u32, Comptime::get(tensor_vec), Comptime::new(true)) {
shared_memory[write_pos + i] = FC::cast_from(value[i]);
if Comptime::get(is_scalar) {
shared_memory[write_pos] = FC::cast_from(value);
} else {
for i in range(0u32, Comptime::get(tensor_vec), Comptime::new(true)) {
shared_memory[write_pos + i] = FC::cast_from(value[i]);
}
}
} else {
for i in range(0u32, Comptime::get(tensor_vec), Comptime::new(true)) {
Expand All @@ -42,33 +47,31 @@ impl<F: Float> BlockWriter<F> for HorizontalCheckBlockIO {
fn write_output(
out: &mut Tensor<F>,
accumulator_sm: SharedMemory<F>,
n_iter: UInt,
batch_offset: UInt,
read_position: UInt,
write_row: UInt,
write_col: UInt,
dims: Dimensions,
config: Comptime<CmmaConfig>,
) {
let tile_size = Comptime::map(config, |c| c.tile_size);
let out_vec = Comptime::vectorization(out);
let out_vec_r = Comptime::runtime(out_vec);
let is_scalar = Comptime::map(out_vec, |v| v.val == 1);

let col_with_n_iter = write_col + n_iter * Comptime::runtime(tile_size);

if col_with_n_iter < dims.n {
let n_iter_read_offset = n_iter * Comptime::runtime(tile_size * tile_size);
let read_position = read_position + n_iter_read_offset;
if write_col < dims.n {
let write_position = batch_offset + write_row * dims.n + write_col;

let write_position = batch_offset + write_row * dims.n + col_with_n_iter;
if Comptime::get(is_scalar) {
let val = accumulator_sm[read_position];
out[write_position / out_vec_r] = val;
} else {
let mut value = F::vectorized_empty(Comptime::get(out_vec));

let mut value = F::vectorized_empty(Comptime::get(out_vec));
for i in range(0u32, Comptime::get(out_vec), Comptime::new(true)) {
value[i] = accumulator_sm[read_position + i];
}

for i in range(0u32, 4u32, Comptime::new(true)) {
value[i] = accumulator_sm[read_position + i];
out[write_position / out_vec_r] = value;
}

out[write_position / out_vec_r] = value;
}
}
}
Loading
Loading