diff --git a/crates/cubecl-cuda/src/lib.rs b/crates/cubecl-cuda/src/lib.rs index 075b6ec8..15850756 100644 --- a/crates/cubecl-cuda/src/lib.rs +++ b/crates/cubecl-cuda/src/lib.rs @@ -20,5 +20,5 @@ mod tests { cubecl_linalg::testgen_plane_mma!([f16, bf16, f32], f32); cubecl_linalg::testgen_tiling2d!([f16, bf16, f32]); cubecl_linalg::testgen_cmma_old!([f16, bf16, f32 /*, f64*/]); - cubecl_reduce::testgen_reduce!(); + cubecl_reduce::testgen_reduce!([f16, bf16, f32, f64]); } diff --git a/crates/cubecl-hip/Cargo.toml b/crates/cubecl-hip/Cargo.toml index 67c6cfac..c3b27844 100644 --- a/crates/cubecl-hip/Cargo.toml +++ b/crates/cubecl-hip/Cargo.toml @@ -35,6 +35,7 @@ bytemuck = { workspace = true } derive-new = { workspace = true } half = { workspace = true } log = { workspace = true } +paste = { workspace = true } [dev-dependencies] cubecl-core = { path = "../cubecl-core", version = "0.4.0", features = [ diff --git a/crates/cubecl-hip/src/lib.rs b/crates/cubecl-hip/src/lib.rs index dc76d237..7ebada92 100644 --- a/crates/cubecl-hip/src/lib.rs +++ b/crates/cubecl-hip/src/lib.rs @@ -23,9 +23,10 @@ pub(crate) type HipWmmaCompiler = cubecl_cpp::hip::wmma::WmmaIntrinsicCompiler; #[cfg(target_os = "linux")] #[cfg(test)] mod tests { + use half::{bf16, f16}; pub type TestRuntime = crate::HipRuntime; cubecl_core::testgen_all!(); cubecl_linalg::testgen_cmma_matmul!(); - cubecl_reduce::testgen_reduce!(); + cubecl_reduce::testgen_reduce!([f16, bf16, f32, f64]); } diff --git a/crates/cubecl-reduce/src/test.rs b/crates/cubecl-reduce/src/test.rs index 8cb5d69f..345c9d7b 100644 --- a/crates/cubecl-reduce/src/test.rs +++ b/crates/cubecl-reduce/src/test.rs @@ -5,266 +5,238 @@ use cubecl_core::{prelude::*, Feature}; use crate::sum::{reduce_sum, reduce_sum_lined, ReduceConfig}; #[macro_export] -macro_rules! testgen_reduce { - () => { - use super::*; - use cubecl_core::CubeCount; - use cubecl_reduce::test::{impl_reduce_sum_test, TestCase, TestTensorParts}; - - #[test] - pub fn reduce_sum_vector_single_plane() { - let test = TestCase::new( - // input - TestTensorParts::new_vector((0..32).collect()), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![496], - ); - impl_reduce_sum_test::(&Default::default(), test); +macro_rules! impl_test_reduce_sum_vector { + ($float:ident, [$(($num_values:expr, $cube_size:expr, $line_size:expr)),*]) => { + ::paste::paste! { + $( + #[test] + pub fn []() { + TestCase::<$float>::sum_vector(32, 32, 1).run::(&Default::default()); + } + )* } + }; +} - #[test] - pub fn reduce_sum_vector_single_plane_line_size_four() { - let test = TestCase::new( - // input - TestTensorParts::new_vector((0..32).collect()).with_line_size(4), - // output - TestTensorParts::new_vector(vec![0, 0, 0, 0]).with_line_size(4), - // expected - vec![112, 120, 128, 136], - ); - impl_reduce_sum_test::(&Default::default(), test); - } +#[macro_export] +macro_rules! testgen_reduce { + ([$($float:ident),*]) => { + mod test_reduce { + use super::*; + ::paste::paste! { + $(mod [<$float _ty>] { + use super::*; - #[test] - pub fn reduce_sum_lined_vector_single_plane_line_size_four() { - let mut test = TestCase::new( - // input - TestTensorParts::new_vector((0..32).collect()).with_line_size(4), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![496], - ); - test.reduce_lines = true; - impl_reduce_sum_test::(&Default::default(), test); + $crate::testgen_reduce!($float); + })* + } } + }; - #[test] - pub fn reduce_sum_vector_long_single_plane() { - let test = TestCase::new( - // input - TestTensorParts::new_vector((0..128).collect()), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![8128], - ); - impl_reduce_sum_test::(&Default::default(), test); - } + ($float:ident) => { + use super::*; + use cubecl_core::as_type; + use cubecl_core::prelude::Float; + use cubecl_core::CubeCount; + use cubecl_reduce::test::TestCase; - #[test] - pub fn reduce_sum_long_vector_four_planes() { - let mut test = TestCase::new( - // input - TestTensorParts::new_vector((0..128).collect()), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![8128], - ); - test.cube_dim = CubeDim::new(128, 1, 1); - impl_reduce_sum_test::(&Default::default(), test); - } + $crate::impl_test_reduce_sum_vector!( + $float, + [ + (32, 32, 1), + (64, 32, 1), + (100, 32, 1), + (1000, 32, 1), + (2048, 32, 1), + (32, 64, 1), + (64, 64, 1), + (100, 64, 1), + (1000, 64, 1), + (2048, 64, 1), + (32, 1024, 1), + (64, 1024, 1), + (100, 1024, 1), + (1000, 1024, 1), + (2048, 1024, 1), + (32, 32, 2), + (64, 32, 2), + (100, 32, 2), + (1000, 32, 2), + (2048, 32, 2), + (32, 64, 2), + (64, 64, 2), + (100, 64, 2), + (1000, 64, 2), + (2048, 64, 2), + (32, 1024, 2), + (64, 1024, 2), + (100, 1024, 2), + (1000, 1024, 2), + (2048, 1024, 2), + (32, 32, 4), + (64, 32, 4), + (100, 32, 4), + (1000, 32, 4), + (2048, 32, 4), + (32, 64, 4), + (64, 64, 4), + (100, 64, 4), + (1000, 64, 4), + (2048, 64, 4), + (32, 1024, 4), + (64, 1024, 4), + (100, 1024, 4), + (1000, 1024, 4), + (2048, 1024, 4) + ] + ); + }; +} - #[test] - pub fn reduce_sum_vector_with_remainder_single_plane() { - let test = TestCase::new( - // input - TestTensorParts::new_vector((0..128).collect()), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![8128], - ); - impl_reduce_sum_test::(&Default::default(), test); - } +#[derive(Debug)] +pub struct TestTensorParts { + pub values: Vec, + pub stride: Vec, + pub shape: Vec, + pub line_size: u8, +} - #[test] - pub fn reduce_sum_vector_with_remainder_four_planes() { - let mut test = TestCase::new( - // input - TestTensorParts::new_vector((0..100).collect()), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![4950], - ); - test.cube_dim = CubeDim::new(128, 1, 1); - impl_reduce_sum_test::(&Default::default(), test); +impl TestTensorParts { + pub fn new_vector(values: Vec) -> Self { + let shape = vec![values.len()]; + Self { + values, + stride: vec![1], + shape, + line_size: 1, } + } - #[test] - pub fn reduce_sum_lined_vector_with_remainder_four_planes() { - let mut test = TestCase::new( - // input - TestTensorParts::new_vector((0..100).collect()).with_line_size(4), - // output - TestTensorParts::new_vector(vec![0]), - // expected - vec![4950], - ); - test.cube_dim = CubeDim::new(128, 1, 1); - test.reduce_lines = true; - impl_reduce_sum_test::(&Default::default(), test); - } + pub fn range_vector(stop: usize) -> Self { + let values = (0..stop).map(|x| N::new(x as f32)).collect(); + Self::new_vector(values) + } - #[test] - pub fn reduce_sum_vector_f32_eight_planes() { - let mut test = TestCase::new( - // input - TestTensorParts::new_vector((0..1024).map(|n| n as f32).collect()), - // output - TestTensorParts::new_vector(vec![0.0]), - // expected - vec![523776.0], - ); - test.tolerance = Some(1e-9); - test.cube_dim = CubeDim::new(256, 1, 1); - impl_reduce_sum_test::(&Default::default(), test); - } + pub fn zero_vector(size: usize) -> Self { + let values = vec![N::new(0.0); size]; + Self::new_vector(values) + } - #[test] - pub fn reduce_sum_vector_f32_too_many_planes() { - let mut test = TestCase::new( - // input - TestTensorParts::new_vector((0..128).map(|n| n as f32).collect()), - // output - TestTensorParts::new_vector(vec![0.0]), - // expected - vec![8128.0], - ); - test.tolerance = Some(1e-9); - test.cube_dim = CubeDim::new(256, 1, 1); - impl_reduce_sum_test::(&Default::default(), test); - } - }; + pub fn with_line_size(mut self, line_size: u8) -> Self { + self.line_size = line_size; + self + } } #[derive(Debug)] -pub struct TestCase { - pub input: TestTensorParts, - pub output: TestTensorParts, - pub expected: Vec, - pub tolerance: Option, +pub struct TestCase { + pub input: TestTensorParts, + pub output: TestTensorParts, + pub expected: Vec, pub cube_count: CubeCount, pub cube_dim: CubeDim, pub sum_dim: u32, pub reduce_lines: bool, } -impl TestCase { - pub fn new(input: TestTensorParts, output: TestTensorParts, expected: Vec) -> Self { +impl TestCase { + pub fn new(input: TestTensorParts, output: TestTensorParts, expected: Vec) -> Self { Self { input, output, expected, - tolerance: None, cube_count: CubeCount::Static(1, 1, 1), cube_dim: CubeDim::new(32, 1, 1), sum_dim: 0, reduce_lines: false, } } -} - -#[derive(Debug)] -pub struct TestTensorParts { - pub values: Vec, - pub stride: Vec, - pub shape: Vec, - pub line_size: u8, -} -impl TestTensorParts { - pub fn new_vector(values: Vec) -> Self { - let shape = vec![values.len()]; - Self { - values, - stride: vec![1], - shape, - line_size: 1, + /// ASSUMPTION: line_size divide num_values exactly + pub fn sum_vector(num_values: usize, cube_size: u32, line_size: usize) -> Self + where + F: Float, + { + // Compute the sums on the cpu. + let values_per_sum = num_values / line_size; + let partial_sum = values_per_sum * (values_per_sum - 1) / 2; + let mut sums = vec![0; line_size]; + for k in 0..line_size { + sums[k] = partial_sum + values_per_sum * k; } - } - - pub fn with_line_size(mut self, line_size: u8) -> Self { - self.line_size = line_size; - self - } -} + let sums = sums.into_iter().map(|s| F::new(s as f32)).collect(); -pub fn impl_reduce_sum_test( - device: &R::Device, - test: TestCase, -) { - let client = R::client(device); - if !client.properties().feature_enabled(Feature::Plane) { - // Can't execute the test. - return; + let mut test = TestCase::new( + // input + TestTensorParts::range_vector(num_values), + // output + TestTensorParts::zero_vector(line_size), + // expected + sums, + ); + test.cube_dim = CubeDim::new(cube_size, 1, 1); + test } - let input_handle = client.create(N::as_bytes(&test.input.values)); - let output_handle = client.create(N::as_bytes(&test.output.values)); + pub fn run(self, device: &R::Device) + where + F: Float + CubeElement + std::fmt::Display, + { + let client = R::client(device); + if !client.properties().feature_enabled(Feature::Plane) { + // Can't execute the test. + return; + } - let config = ReduceConfig { - line_size: test.input.line_size as u32, - max_num_planes: test.cube_dim.num_elems() - / client.properties().hardware_properties().plane_size_min, - }; + let input_handle = client.create(F::as_bytes(&self.input.values)); + let output_handle = client.create(F::as_bytes(&self.output.values)); - unsafe { - let input_tensor = TensorArg::from_raw_parts::( - &input_handle, - &test.input.stride, - &test.input.shape, - test.input.line_size, - ); - let output_tensor = TensorArg::from_raw_parts::( - &output_handle, - &test.output.stride, - &test.output.shape, - test.output.line_size, - ); + let config = ReduceConfig { + line_size: self.input.line_size as u32, + max_num_planes: self.cube_dim.num_elems() + / client.properties().hardware_properties().plane_size_min, + }; - if test.reduce_lines { - reduce_sum_lined::launch_unchecked::( - &client, - test.cube_count, - test.cube_dim, - input_tensor, - output_tensor, - config, + unsafe { + let input_tensor = TensorArg::from_raw_parts::( + &input_handle, + &self.input.stride, + &self.input.shape, + self.input.line_size, ); - } else { - reduce_sum::launch_unchecked::( - &client, - test.cube_count, - test.cube_dim, - input_tensor, - output_tensor, - config, + let output_tensor = TensorArg::from_raw_parts::( + &output_handle, + &self.output.stride, + &self.output.shape, + self.output.line_size, ); + + if self.reduce_lines { + reduce_sum_lined::launch_unchecked::( + &client, + self.cube_count, + self.cube_dim, + input_tensor, + output_tensor, + config, + ); + } else { + reduce_sum::launch_unchecked::( + &client, + self.cube_count, + self.cube_dim, + input_tensor, + output_tensor, + config, + ); + } } - } - let binding = output_handle.binding(); - let bytes = client.read_one(binding); - let output_values = N::from_bytes(&bytes); + let binding = output_handle.binding(); + let bytes = client.read_one(binding); + let output_values = F::from_bytes(&bytes); - match test.tolerance { - Some(tolerance) => assert_approx_equal_abs(output_values, &test.expected, tolerance), - None => assert_eq!(output_values, test.expected), + assert_approx_equal_abs(output_values, &self.expected, 1e-9); } } diff --git a/crates/cubecl-wgpu/src/lib.rs b/crates/cubecl-wgpu/src/lib.rs index e2c9c72e..5a0a47ca 100644 --- a/crates/cubecl-wgpu/src/lib.rs +++ b/crates/cubecl-wgpu/src/lib.rs @@ -27,7 +27,7 @@ mod tests { cubecl_core::testgen_all!(); cubecl_linalg::testgen_plane_mma!([flex32, f32], f32); cubecl_linalg::testgen_tiling2d!([flex32, f32]); - cubecl_reduce::testgen_reduce!(); + cubecl_reduce::testgen_reduce!(f32); } #[cfg(all(test, feature = "spirv"))]