diff --git a/crates/cubecl-cuda/src/lib.rs b/crates/cubecl-cuda/src/lib.rs
index 075b6ec8..15850756 100644
--- a/crates/cubecl-cuda/src/lib.rs
+++ b/crates/cubecl-cuda/src/lib.rs
@@ -20,5 +20,5 @@ mod tests {
     cubecl_linalg::testgen_plane_mma!([f16, bf16, f32], f32);
     cubecl_linalg::testgen_tiling2d!([f16, bf16, f32]);
     cubecl_linalg::testgen_cmma_old!([f16, bf16, f32 /*, f64*/]);
-    cubecl_reduce::testgen_reduce!();
+    cubecl_reduce::testgen_reduce!([f16, bf16, f32, f64]);
 }
diff --git a/crates/cubecl-hip/Cargo.toml b/crates/cubecl-hip/Cargo.toml
index 67c6cfac..c3b27844 100644
--- a/crates/cubecl-hip/Cargo.toml
+++ b/crates/cubecl-hip/Cargo.toml
@@ -35,6 +35,7 @@ bytemuck = { workspace = true }
 derive-new = { workspace = true }
 half = { workspace = true }
 log = { workspace = true }
+paste = { workspace = true }
 
 [dev-dependencies]
 cubecl-core = { path = "../cubecl-core", version = "0.4.0", features = [
diff --git a/crates/cubecl-hip/src/lib.rs b/crates/cubecl-hip/src/lib.rs
index dc76d237..7ebada92 100644
--- a/crates/cubecl-hip/src/lib.rs
+++ b/crates/cubecl-hip/src/lib.rs
@@ -23,9 +23,10 @@ pub(crate) type HipWmmaCompiler = cubecl_cpp::hip::wmma::WmmaIntrinsicCompiler;
 #[cfg(target_os = "linux")]
 #[cfg(test)]
 mod tests {
+    use half::{bf16, f16};
     pub type TestRuntime = crate::HipRuntime;
 
     cubecl_core::testgen_all!();
     cubecl_linalg::testgen_cmma_matmul!();
-    cubecl_reduce::testgen_reduce!();
+    cubecl_reduce::testgen_reduce!([f16, bf16, f32, f64]);
 }
diff --git a/crates/cubecl-reduce/src/test.rs b/crates/cubecl-reduce/src/test.rs
index 8cb5d69f..345c9d7b 100644
--- a/crates/cubecl-reduce/src/test.rs
+++ b/crates/cubecl-reduce/src/test.rs
@@ -5,266 +5,238 @@ use cubecl_core::{prelude::*, Feature};
 use crate::sum::{reduce_sum, reduce_sum_lined, ReduceConfig};
 
 #[macro_export]
-macro_rules! testgen_reduce {
-    () => {
-        use super::*;
-        use cubecl_core::CubeCount;
-        use cubecl_reduce::test::{impl_reduce_sum_test, TestCase, TestTensorParts};
-
-        #[test]
-        pub fn reduce_sum_vector_single_plane() {
-            let test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..32).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![496],
-            );
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
+macro_rules! impl_test_reduce_sum_vector {
+    ($float:ident, [$(($num_values:expr, $cube_size:expr, $line_size:expr)),*]) => {
+        ::paste::paste! {
+            $(
+                #[test]
+                pub fn [<reduce_sum_vector_ $num_values _ $cube_size _ $line_size >]() {
+                    TestCase::<$float>::sum_vector(32, 32, 1).run::<TestRuntime>(&Default::default());
+                }
+            )*
         }
+    };
+}
 
-        #[test]
-        pub fn reduce_sum_vector_single_plane_line_size_four() {
-            let test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..32).collect()).with_line_size(4),
-                // output
-                TestTensorParts::new_vector(vec![0, 0, 0, 0]).with_line_size(4),
-                // expected
-                vec![112, 120, 128, 136],
-            );
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
-        }
+#[macro_export]
+macro_rules! testgen_reduce {
+    ([$($float:ident),*]) => {
+        mod test_reduce {
+            use super::*;
+            ::paste::paste! {
+                $(mod [<$float _ty>] {
+                    use super::*;
 
-        #[test]
-        pub fn reduce_sum_lined_vector_single_plane_line_size_four() {
-            let mut test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..32).collect()).with_line_size(4),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![496],
-            );
-            test.reduce_lines = true;
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
+                    $crate::testgen_reduce!($float);
+                })*
+            }
         }
+    };
 
-        #[test]
-        pub fn reduce_sum_vector_long_single_plane() {
-            let test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..128).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![8128],
-            );
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
-        }
+    ($float:ident) => {
+        use super::*;
+        use cubecl_core::as_type;
+        use cubecl_core::prelude::Float;
+        use cubecl_core::CubeCount;
+        use cubecl_reduce::test::TestCase;
 
-        #[test]
-        pub fn reduce_sum_long_vector_four_planes() {
-            let mut test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..128).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![8128],
-            );
-            test.cube_dim = CubeDim::new(128, 1, 1);
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
-        }
+        $crate::impl_test_reduce_sum_vector!(
+            $float,
+            [
+                (32, 32, 1),
+                (64, 32, 1),
+                (100, 32, 1),
+                (1000, 32, 1),
+                (2048, 32, 1),
+                (32, 64, 1),
+                (64, 64, 1),
+                (100, 64, 1),
+                (1000, 64, 1),
+                (2048, 64, 1),
+                (32, 1024, 1),
+                (64, 1024, 1),
+                (100, 1024, 1),
+                (1000, 1024, 1),
+                (2048, 1024, 1),
+                (32, 32, 2),
+                (64, 32, 2),
+                (100, 32, 2),
+                (1000, 32, 2),
+                (2048, 32, 2),
+                (32, 64, 2),
+                (64, 64, 2),
+                (100, 64, 2),
+                (1000, 64, 2),
+                (2048, 64, 2),
+                (32, 1024, 2),
+                (64, 1024, 2),
+                (100, 1024, 2),
+                (1000, 1024, 2),
+                (2048, 1024, 2),
+                (32, 32, 4),
+                (64, 32, 4),
+                (100, 32, 4),
+                (1000, 32, 4),
+                (2048, 32, 4),
+                (32, 64, 4),
+                (64, 64, 4),
+                (100, 64, 4),
+                (1000, 64, 4),
+                (2048, 64, 4),
+                (32, 1024, 4),
+                (64, 1024, 4),
+                (100, 1024, 4),
+                (1000, 1024, 4),
+                (2048, 1024, 4)
+            ]
+        );
+    };
+}
 
-        #[test]
-        pub fn reduce_sum_vector_with_remainder_single_plane() {
-            let test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..128).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![8128],
-            );
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
-        }
+#[derive(Debug)]
+pub struct TestTensorParts<N> {
+    pub values: Vec<N>,
+    pub stride: Vec<usize>,
+    pub shape: Vec<usize>,
+    pub line_size: u8,
+}
 
-        #[test]
-        pub fn reduce_sum_vector_with_remainder_four_planes() {
-            let mut test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..100).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![4950],
-            );
-            test.cube_dim = CubeDim::new(128, 1, 1);
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
+impl<N: Float> TestTensorParts<N> {
+    pub fn new_vector(values: Vec<N>) -> Self {
+        let shape = vec![values.len()];
+        Self {
+            values,
+            stride: vec![1],
+            shape,
+            line_size: 1,
         }
+    }
 
-        #[test]
-        pub fn reduce_sum_lined_vector_with_remainder_four_planes() {
-            let mut test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..100).collect()).with_line_size(4),
-                // output
-                TestTensorParts::new_vector(vec![0]),
-                // expected
-                vec![4950],
-            );
-            test.cube_dim = CubeDim::new(128, 1, 1);
-            test.reduce_lines = true;
-            impl_reduce_sum_test::<TestRuntime, u32>(&Default::default(), test);
-        }
+    pub fn range_vector(stop: usize) -> Self {
+        let values = (0..stop).map(|x| N::new(x as f32)).collect();
+        Self::new_vector(values)
+    }
 
-        #[test]
-        pub fn reduce_sum_vector_f32_eight_planes() {
-            let mut test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..1024).map(|n| n as f32).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0.0]),
-                // expected
-                vec![523776.0],
-            );
-            test.tolerance = Some(1e-9);
-            test.cube_dim = CubeDim::new(256, 1, 1);
-            impl_reduce_sum_test::<TestRuntime, f32>(&Default::default(), test);
-        }
+    pub fn zero_vector(size: usize) -> Self {
+        let values = vec![N::new(0.0); size];
+        Self::new_vector(values)
+    }
 
-        #[test]
-        pub fn reduce_sum_vector_f32_too_many_planes() {
-            let mut test = TestCase::new(
-                // input
-                TestTensorParts::new_vector((0..128).map(|n| n as f32).collect()),
-                // output
-                TestTensorParts::new_vector(vec![0.0]),
-                // expected
-                vec![8128.0],
-            );
-            test.tolerance = Some(1e-9);
-            test.cube_dim = CubeDim::new(256, 1, 1);
-            impl_reduce_sum_test::<TestRuntime, f32>(&Default::default(), test);
-        }
-    };
+    pub fn with_line_size(mut self, line_size: u8) -> Self {
+        self.line_size = line_size;
+        self
+    }
 }
 
 #[derive(Debug)]
-pub struct TestCase<N> {
-    pub input: TestTensorParts<N>,
-    pub output: TestTensorParts<N>,
-    pub expected: Vec<N>,
-    pub tolerance: Option<f32>,
+pub struct TestCase<F> {
+    pub input: TestTensorParts<F>,
+    pub output: TestTensorParts<F>,
+    pub expected: Vec<F>,
     pub cube_count: CubeCount,
     pub cube_dim: CubeDim,
     pub sum_dim: u32,
     pub reduce_lines: bool,
 }
 
-impl<N> TestCase<N> {
-    pub fn new(input: TestTensorParts<N>, output: TestTensorParts<N>, expected: Vec<N>) -> Self {
+impl<F> TestCase<F> {
+    pub fn new(input: TestTensorParts<F>, output: TestTensorParts<F>, expected: Vec<F>) -> Self {
         Self {
             input,
             output,
             expected,
-            tolerance: None,
             cube_count: CubeCount::Static(1, 1, 1),
             cube_dim: CubeDim::new(32, 1, 1),
             sum_dim: 0,
             reduce_lines: false,
         }
     }
-}
-
-#[derive(Debug)]
-pub struct TestTensorParts<N> {
-    pub values: Vec<N>,
-    pub stride: Vec<usize>,
-    pub shape: Vec<usize>,
-    pub line_size: u8,
-}
 
-impl<N> TestTensorParts<N> {
-    pub fn new_vector(values: Vec<N>) -> Self {
-        let shape = vec![values.len()];
-        Self {
-            values,
-            stride: vec![1],
-            shape,
-            line_size: 1,
+    /// ASSUMPTION: line_size divide num_values exactly
+    pub fn sum_vector(num_values: usize, cube_size: u32, line_size: usize) -> Self
+    where
+        F: Float,
+    {
+        // Compute the sums on the cpu.
+        let values_per_sum = num_values / line_size;
+        let partial_sum = values_per_sum * (values_per_sum - 1) / 2;
+        let mut sums = vec![0; line_size];
+        for k in 0..line_size {
+            sums[k] = partial_sum + values_per_sum * k;
         }
-    }
-
-    pub fn with_line_size(mut self, line_size: u8) -> Self {
-        self.line_size = line_size;
-        self
-    }
-}
+        let sums = sums.into_iter().map(|s| F::new(s as f32)).collect();
 
-pub fn impl_reduce_sum_test<R: Runtime, N: Numeric + CubeElement + std::fmt::Display>(
-    device: &R::Device,
-    test: TestCase<N>,
-) {
-    let client = R::client(device);
-    if !client.properties().feature_enabled(Feature::Plane) {
-        // Can't execute the test.
-        return;
+        let mut test = TestCase::new(
+            // input
+            TestTensorParts::range_vector(num_values),
+            // output
+            TestTensorParts::zero_vector(line_size),
+            // expected
+            sums,
+        );
+        test.cube_dim = CubeDim::new(cube_size, 1, 1);
+        test
     }
 
-    let input_handle = client.create(N::as_bytes(&test.input.values));
-    let output_handle = client.create(N::as_bytes(&test.output.values));
+    pub fn run<R: Runtime>(self, device: &R::Device)
+    where
+        F: Float + CubeElement + std::fmt::Display,
+    {
+        let client = R::client(device);
+        if !client.properties().feature_enabled(Feature::Plane) {
+            // Can't execute the test.
+            return;
+        }
 
-    let config = ReduceConfig {
-        line_size: test.input.line_size as u32,
-        max_num_planes: test.cube_dim.num_elems()
-            / client.properties().hardware_properties().plane_size_min,
-    };
+        let input_handle = client.create(F::as_bytes(&self.input.values));
+        let output_handle = client.create(F::as_bytes(&self.output.values));
 
-    unsafe {
-        let input_tensor = TensorArg::from_raw_parts::<N>(
-            &input_handle,
-            &test.input.stride,
-            &test.input.shape,
-            test.input.line_size,
-        );
-        let output_tensor = TensorArg::from_raw_parts::<N>(
-            &output_handle,
-            &test.output.stride,
-            &test.output.shape,
-            test.output.line_size,
-        );
+        let config = ReduceConfig {
+            line_size: self.input.line_size as u32,
+            max_num_planes: self.cube_dim.num_elems()
+                / client.properties().hardware_properties().plane_size_min,
+        };
 
-        if test.reduce_lines {
-            reduce_sum_lined::launch_unchecked::<N, R>(
-                &client,
-                test.cube_count,
-                test.cube_dim,
-                input_tensor,
-                output_tensor,
-                config,
+        unsafe {
+            let input_tensor = TensorArg::from_raw_parts::<F>(
+                &input_handle,
+                &self.input.stride,
+                &self.input.shape,
+                self.input.line_size,
             );
-        } else {
-            reduce_sum::launch_unchecked::<N, R>(
-                &client,
-                test.cube_count,
-                test.cube_dim,
-                input_tensor,
-                output_tensor,
-                config,
+            let output_tensor = TensorArg::from_raw_parts::<F>(
+                &output_handle,
+                &self.output.stride,
+                &self.output.shape,
+                self.output.line_size,
             );
+
+            if self.reduce_lines {
+                reduce_sum_lined::launch_unchecked::<F, R>(
+                    &client,
+                    self.cube_count,
+                    self.cube_dim,
+                    input_tensor,
+                    output_tensor,
+                    config,
+                );
+            } else {
+                reduce_sum::launch_unchecked::<F, R>(
+                    &client,
+                    self.cube_count,
+                    self.cube_dim,
+                    input_tensor,
+                    output_tensor,
+                    config,
+                );
+            }
         }
-    }
 
-    let binding = output_handle.binding();
-    let bytes = client.read_one(binding);
-    let output_values = N::from_bytes(&bytes);
+        let binding = output_handle.binding();
+        let bytes = client.read_one(binding);
+        let output_values = F::from_bytes(&bytes);
 
-    match test.tolerance {
-        Some(tolerance) => assert_approx_equal_abs(output_values, &test.expected, tolerance),
-        None => assert_eq!(output_values, test.expected),
+        assert_approx_equal_abs(output_values, &self.expected, 1e-9);
     }
 }
 
diff --git a/crates/cubecl-wgpu/src/lib.rs b/crates/cubecl-wgpu/src/lib.rs
index e2c9c72e..5a0a47ca 100644
--- a/crates/cubecl-wgpu/src/lib.rs
+++ b/crates/cubecl-wgpu/src/lib.rs
@@ -27,7 +27,7 @@ mod tests {
     cubecl_core::testgen_all!();
     cubecl_linalg::testgen_plane_mma!([flex32, f32], f32);
     cubecl_linalg::testgen_tiling2d!([flex32, f32]);
-    cubecl_reduce::testgen_reduce!();
+    cubecl_reduce::testgen_reduce!(f32);
 }
 
 #[cfg(all(test, feature = "spirv"))]