Feat/debug logger #38

Merged · 6 commits · Jul 26, 2024
Changes from all commits
74 changes: 73 additions & 1 deletion crates/cubecl-core/src/compute/kernel.rs
@@ -1,17 +1,87 @@
use std::{fmt::Debug, marker::PhantomData};
use std::{
    fmt::{Debug, Display},
    marker::PhantomData,
};

use crate::{codegen::CompilerRepresentation, ir::CubeDim, Compiler, Kernel};
use alloc::sync::Arc;
use cubecl_runtime::server::{Binding, ComputeServer};

/// A kernel, compiled in the target language
pub struct CompiledKernel {
    /// The name of the kernel, when available (typically the kernel type name).
    pub name: Option<&'static str>,
    /// Source code of the kernel
    pub source: String,
    /// Size of a cube for the compiled kernel
    pub cube_dim: CubeDim,
    /// The number of bytes used by the shared memory
    pub shared_mem_bytes: usize,
    /// Language tag of the source (e.g. "cpp"), used to annotate the code
    /// block printed by the `Display` implementation.
    pub lang_tag: Option<&'static str>,
}

impl Display for CompiledKernel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("\n======== Compiled Kernel ========")?;

        if let Some(name) = self.name {
            if name.len() <= 32 {
                f.write_fmt(format_args!("\nname: {name}"))?;
            } else {
                // Long type names are pretty-printed across multiple lines.
                let name = format_type_name(name);
                f.write_fmt(format_args!("\nname: {name}"))?;
            }
        }

        f.write_fmt(format_args!(
            "
cube_dim: ({}, {}, {})
shared_memory: {} bytes
source:
```{}
{}
```
=================================
",
            self.cube_dim.x,
            self.cube_dim.y,
            self.cube_dim.z,
            self.shared_mem_bytes,
            self.lang_tag.unwrap_or(""),
            self.source
        ))
    }
}
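For illustration, the logged output for a hypothetical kernel would look roughly like this (the name, dimensions, and source are made up):

======== Compiled Kernel ========
name: MyKernel
cube_dim: (16, 16, 1)
shared_memory: 0 bytes
source:
```cpp
extern "C" __global__ void kernel(float output_0[]) { /* ... */ }
```
=================================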

/// Pretty-print a type name by splitting generic parameters across indented lines.
fn format_type_name(type_name: &str) -> String {
    let mut result = String::new();
    let mut depth = 0;
    let indentation = 4;

    for c in type_name.chars() {
        if c == ' ' {
            continue;
        }

        if c == '<' {
            depth += 1;
            result.push_str("<\n");
            result.push_str(&" ".repeat(indentation * depth));
            continue;
        } else if c == '>' {
            depth -= 1;
            result.push_str(",\n>");
            continue;
        }

        if c == ',' && depth > 0 {
            result.push_str(",\n");
            result.push_str(&" ".repeat(indentation * depth));
        } else {
            result.push(c);
        }
    }

    result
}
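A minimal sketch of what this produces, assuming a hypothetical nested type name (note that closing brackets are emitted without indentation, which is good enough for debug output):

let formatted = format_type_name("KernelTask<CudaCompiler, MyKernel<f32>>");
assert_eq!(
    formatted,
    "KernelTask<\n    CudaCompiler,\n    MyKernel<\n        f32,\n>,\n>"
);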

/// Kernel trait with the ComputeShader that will be compiled and cached based on the
@@ -39,9 +109,11 @@ impl<C: Compiler, K: Kernel> CubeTask for KernelTask<C, K> {
        let source = lower_level_ir.to_string();

        CompiledKernel {
            name: Some(core::any::type_name::<K>()),
            source,
            cube_dim,
            shared_mem_bytes,
            lang_tag: None,
        }
    }

30 changes: 29 additions & 1 deletion crates/cubecl-cuda/src/compiler/binary.rs
@@ -151,7 +151,35 @@ impl Binary for IndexAssign {
Rhs: Component,
Out: Component,
{
f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n"))
        let item_out = out.item();
        let item_rhs = rhs.item();

        if item_out.vectorization != item_rhs.vectorization {
            // The rhs is a scalar while the output item is vectorized:
            // broadcast the scalar into a temporary vector before the store.
            let is_vec_native = item_out.is_vec_native();
            f.write_str("{\n")?;
            let var = "scalar_broadcasted";
            f.write_fmt(format_args!("{item_out} {var};\n"))?;
            for i in 0..item_out.vectorization {
                if is_vec_native {
                    // Native CUDA vector types use .x/.y/.z/.w component accessors.
                    let char = match i {
                        0 => 'x',
                        1 => 'y',
                        2 => 'z',
                        3 => 'w',
                        _ => panic!("Invalid vectorization: native vector types have at most 4 components"),
                    };
                    f.write_fmt(format_args!("{var}.{char} = {rhs};\n"))?;
                } else {
                    f.write_fmt(format_args!("{var}.i_{i} = {rhs};\n"))?;
                }
            }
            f.write_fmt(format_args!("{out}[{lhs}] = {var};\n"))?;
            f.write_str("}")?;

            Ok(())
        } else {
            f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n"))
        }
    }
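To make the broadcast concrete: for a hypothetical store out_0[i] = rhs_0, where out_0 has a native float4 item and rhs_0 is a scalar, the branch above emits roughly the following CUDA (before clang-format cleans it up):

{
float4 scalar_broadcasted;
scalar_broadcasted.x = rhs_0;
scalar_broadcasted.y = rhs_0;
scalar_broadcasted.z = rhs_0;
scalar_broadcasted.w = rhs_0;
out_0[i] = scalar_broadcasted;
}

For non-native vector items the components are written as scalar_broadcasted.i_0, scalar_broadcasted.i_1, and so on.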

fn unroll_vec(
32 changes: 26 additions & 6 deletions crates/cubecl-cuda/src/compiler/kernel.rs
@@ -1,6 +1,6 @@
use super::{Body, Item};
use cubecl_core::{ir::CubeDim, CompilerRepresentation};
use std::{collections::HashSet, fmt::Display};
use std::{collections::HashSet, fmt::Display, io::Write, process::Command};

#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Binding {
@@ -84,11 +84,7 @@ impl Display for ComputeKernel {
f.write_str("using namespace nvcuda;\n")?;
}

f.write_str(
"
typedef unsigned int uint;
",
)?;
f.write_str("typedef unsigned int uint;\n")?;

for item in self.items.iter() {
if item.is_vec_native() {
@@ -155,3 +151,27 @@ extern \"C\" __global__ void kernel(
Ok(())
}
}

/// Format C++ code, useful when debugging.
pub(crate) fn format_cpp_code(code: &str) -> Result<String, std::io::Error> {
    let mut child = Command::new("clang-format")
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .spawn()?;

    // Write the source to clang-format's stdin; wait_with_output closes the
    // pipe before collecting stdout.
    {
        let stdin = child.stdin.as_mut().expect("Failed to open stdin");
        stdin.write_all(code.as_bytes())?;
    }

    let output = child.wait_with_output()?;

    if output.status.success() {
        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
    } else {
        Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "clang-format failed",
        ))
    }
}
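A minimal usage sketch, assuming clang-format is available on PATH; the call site in server.rs below deliberately ignores the error, so a missing clang-format never breaks compilation:

let raw = "extern \"C\" __global__ void kernel(float *out) { out[0] = 1.0f; }";
match format_cpp_code(raw) {
    Ok(formatted) => println!("{formatted}"),
    Err(err) => eprintln!("falling back to unformatted source: {err}"),
}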
4 changes: 2 additions & 2 deletions crates/cubecl-cuda/src/compiler/unary.rs
@@ -63,7 +63,7 @@ macro_rules! function {
Elem::F162 => f.write_fmt(format_args!("{out} = h2{}({input});\n", $func)),
Elem::BF16 => f.write_fmt(format_args!("{out} = h{}({input});\n", $func)),
Elem::BF162 => f.write_fmt(format_args!("{out} = h2{}({input});\n", $func)),
Elem::F32 => f.write_fmt(format_args!("{out} = __{}f({input});\n", $func)),
Elem::F32 => f.write_fmt(format_args!("{out} = {}({input});\n", $func)),
_ => f.write_fmt(format_args!("{out} = {}({input});\n", $func)),
}
}
@@ -79,7 +79,7 @@ function!(Sin, "sin");
function!(Tanh, "tanh");
function!(Sqrt, "sqrt");
function!(Exp, "exp");
function!(Erf, "erff");
function!(Erf, "erf");
function!(Ceil, "ceil");
function!(Floor, "floor");
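A note on this fix: the old F32 arm combined the __{}f intrinsic prefix with function names that already carried an f suffix, so function!(Erf, "erff") would have emitted the nonexistent __erfff(...) for F32. With the plain arm, every element type that reaches it now gets the overloaded C++ call, e.g. out = erf(input); or out = exp(input);.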

96 changes: 75 additions & 21 deletions crates/cubecl-cuda/src/compute/server.rs
@@ -1,10 +1,13 @@
use super::storage::Binding;
use crate::compiler::format_cpp_code;

use super::storage::CudaStorage;
use super::CudaResource;
use cubecl_common::reader::{reader_from_concrete, Reader};
use cubecl_common::sync_type::SyncType;
use cubecl_core::ir::CubeDim;
use cubecl_core::prelude::*;
use cubecl_core::FeatureSet;
use cubecl_runtime::debug::DebugLogger;
use cubecl_runtime::{
memory_management::MemoryManagement,
server::{self, ComputeServer},
@@ -14,10 +17,12 @@ use cudarc::driver::sys::CUfunc_st;
use std::collections::HashMap;
use std::ffi::CStr;
use std::ffi::CString;
use std::path::PathBuf;

#[derive(Debug)]
pub struct CudaServer<MM: MemoryManagement<CudaStorage>> {
    state: CudaServerState<MM>,
    logger: DebugLogger,
    pub(crate) archs: Vec<i32>,
    pub(crate) minimum_arch_version: i32,
}
@@ -131,18 +136,18 @@ impl<MM: MemoryManagement<CudaStorage>> ComputeServer for CudaServer<MM> {
}
};

let ctx = self.get_context();
let (ctx, logger) = self.get_context_with_logger();

if !ctx.module_names.contains_key(&kernel_id) {
ctx.compile_kernel(&kernel_id, kernel, arch);
ctx.compile_kernel(&kernel_id, kernel, arch, logger);
}

let bindings = bindings
let resources = bindings
.into_iter()
.map(|binding| ctx.memory_management.get(binding.memory).as_binding())
.collect();
.map(|binding| ctx.memory_management.get(binding.memory))
.collect::<Vec<_>>();

ctx.execute_task(kernel_id, count, bindings);
ctx.execute_task(kernel_id, count, resources);
}

fn sync(&mut self, sync_type: SyncType) {
@@ -186,21 +191,29 @@ impl<MM: MemoryManagement<CudaStorage>> CudaContext<MM> {
};
}

fn compile_kernel(&mut self, kernel_id: &str, kernel: Box<dyn CubeTask>, arch: i32) {
let kernel_compiled = kernel.compile();
fn compile_kernel(
&mut self,
kernel_id: &str,
kernel: Box<dyn CubeTask>,
arch: i32,
logger: &mut DebugLogger,
) {
let mut kernel_compiled = kernel.compile();
kernel_compiled.lang_tag = Some("cpp");

if let Ok(formatted) = format_cpp_code(&kernel_compiled.source) {
kernel_compiled.source = formatted;
}

let shared_mem_bytes = kernel_compiled.shared_mem_bytes;
let cube_dim = kernel_compiled.cube_dim;
let arch = format!("--gpu-architecture=sm_{}", arch);

#[cfg(target_os = "linux")]
let options = &[
arch.as_str(),
"--include-path=/usr/include",
"--include-path=/usr/include/cuda",
"--include-path=/usr/local/include/cuda",
];
#[cfg(not(target_os = "linux"))] // TODO: add include-path for other OS.
let options = &[arch.as_str()];
let include_path = include_path();
let include_option = format!("--include-path={}", include_path.to_str().unwrap());
let options = &[arch.as_str(), include_option.as_str()];

let kernel_compiled = logger.debug(kernel_compiled);

let ptx = unsafe {
let program = cudarc::nvrtc::result::create_program(kernel_compiled.source).unwrap();
@@ -241,8 +254,13 @@ impl<MM: MemoryManagement<CudaStorage>> CudaContext<MM> {
&mut self,
kernel_id: String,
dispatch_count: (u32, u32, u32),
mut bindings: Vec<Binding>,
resources: Vec<CudaResource>,
) {
let mut bindings = resources
.iter()
.map(|memory| memory.as_binding())
.collect::<Vec<_>>();

let kernel = self.module_names.get(&kernel_id).unwrap();
let cube_dim = kernel.cube_dim;
unsafe {
@@ -257,7 +275,7 @@ impl<MM: MemoryManagement<CudaStorage>> CudaContext<MM> {
.unwrap();
};

self.memory_management.storage().flush()
self.memory_management.storage().flush(resources)
}
}
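A design note, as a hedged reading of this change: execute_task now receives the full CudaResource values rather than pre-extracted bindings, builds the raw bindings itself, and hands the resources to storage().flush(resources), presumably so the backing allocations stay alive until the flush point rather than being recycled mid-launch.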

@@ -281,12 +299,17 @@ impl<MM: MemoryManagement<CudaStorage>> CudaServer<MM> {
device_index: index,
init,
},
logger: DebugLogger::new(),
archs,
minimum_arch_version,
}
}

fn get_context(&mut self) -> &mut CudaContext<MM> {
self.get_context_with_logger().0
}

fn get_context_with_logger(&mut self) -> (&mut CudaContext<MM>, &mut DebugLogger) {
if let CudaServerState::Uninitialized { device_index, init } = &self.state {
let ctx = init(*device_index);
self.state = CudaServerState::Initialized { ctx };
@@ -295,9 +318,40 @@ impl<MM: MemoryManagement<CudaStorage>> CudaServer<MM> {
unsafe {
cudarc::driver::result::ctx::set_current(ctx.context).unwrap();
};
ctx
(ctx, &mut self.logger)
} else {
panic!("Context should be initialized");
}
}
}
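One design note: get_context_with_logger returns the logger together with the context because both are fields of the same CudaServer; splitting the borrows in a single &mut self call satisfies the borrow checker, where two separate calls (one holding &mut CudaContext, one taking &mut self.logger) would not. A condensed sketch of the resulting compile path, restating the calls above for orientation:

let (ctx, logger) = self.get_context_with_logger();
if !ctx.module_names.contains_key(&kernel_id) {
    // compile_kernel tags the source as "cpp", pretty-prints it via clang-format
    // when available, passes it through DebugLogger::debug (which hands the
    // kernel back), and only then JIT-compiles it with NVRTC.
    ctx.compile_kernel(&kernel_id, kernel, arch, logger);
}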

fn include_path() -> PathBuf {
    let mut path = cuda_path().expect(
        "
CUDA installation not found.
Please ensure that CUDA is installed and the CUDA_PATH environment variable is set correctly.
Note: Default paths are used for Linux (/usr/local/cuda) and Windows (C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/), which may not be correct.
",
    );
    path.push("include");
    path
}

fn cuda_path() -> Option<PathBuf> {
    // An explicit CUDA_PATH always takes precedence.
    if let Ok(path) = std::env::var("CUDA_PATH") {
        return Some(PathBuf::from(path));
    }

    // Fall back to the conventional per-OS install location. The path is not
    // verified to exist, hence the warning in `include_path`.
    #[cfg(target_os = "linux")]
    {
        return Some(PathBuf::from("/usr/local/cuda"));
    }

    #[cfg(target_os = "windows")]
    {
        return Some(PathBuf::from(
            "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/",
        ));
    }

    #[allow(unreachable_code)]
    None
}
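A quick sketch of the lookup order; the override value is hypothetical, and set_var affects the whole process, so this is illustration only:

std::env::set_var("CUDA_PATH", "/opt/cuda-12.3");
assert_eq!(cuda_path(), Some(PathBuf::from("/opt/cuda-12.3")));
// With CUDA_PATH unset, the per-OS default is returned (e.g. /usr/local/cuda on
// Linux), and include_path() appends "include" to whatever was found.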