Feat/debug logger #38

Merged · 6 commits · Jul 26, 2024
Changes from all commits
74 changes: 73 additions & 1 deletion crates/cubecl-core/src/compute/kernel.rs
@@ -1,17 +1,87 @@
use std::{fmt::Debug, marker::PhantomData};
use std::{
    fmt::{Debug, Display},
    marker::PhantomData,
};

use crate::{codegen::CompilerRepresentation, ir::CubeDim, Compiler, Kernel};
use alloc::sync::Arc;
use cubecl_runtime::server::{Binding, ComputeServer};

/// A kernel, compiled in the target language
pub struct CompiledKernel {
    /// The name of the kernel, when available (typically the kernel type name).
    pub name: Option<&'static str>,
    /// Source code of the kernel
    pub source: String,
    /// Size of a cube for the compiled kernel
    pub cube_dim: CubeDim,
    /// The number of bytes used by the shared memory
    pub shared_mem_bytes: usize,
    /// Language tag of the source (e.g. "cpp"), used to annotate the code
    /// block printed by the `Display` implementation.
    pub lang_tag: Option<&'static str>,
}

impl Display for CompiledKernel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("\n======== Compiled Kernel ========")?;

        if let Some(name) = self.name {
            if name.len() <= 32 {
                f.write_fmt(format_args!("\nname: {name}"))?;
            } else {
                // Long type names are pretty-printed across multiple lines.
                let name = format_type_name(name);
                f.write_fmt(format_args!("\nname: {name}"))?;
            }
        }

        f.write_fmt(format_args!(
            "
cube_dim: ({}, {}, {})
shared_memory: {} bytes
source:
```{}
{}
```
=================================
",
            self.cube_dim.x,
            self.cube_dim.y,
            self.cube_dim.z,
            self.shared_mem_bytes,
            self.lang_tag.unwrap_or(""),
            self.source
        ))
    }
}
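For illustration, the logged output for a hypothetical kernel would look roughly like this (the name, dimensions, and source are made up):

======== Compiled Kernel ========
name: MyKernel
cube_dim: (16, 16, 1)
shared_memory: 0 bytes
source:
```cpp
extern "C" __global__ void kernel(float output_0[]) { /* ... */ }
```
=================================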

/// Pretty-print a type name by splitting generic parameters across indented lines.
fn format_type_name(type_name: &str) -> String {
    let mut result = String::new();
    let mut depth = 0;
    let indentation = 4;

    for c in type_name.chars() {
        if c == ' ' {
            continue;
        }

        if c == '<' {
            depth += 1;
            result.push_str("<\n");
            result.push_str(&" ".repeat(indentation * depth));
            continue;
        } else if c == '>' {
            depth -= 1;
            result.push_str(",\n>");
            continue;
        }

        if c == ',' && depth > 0 {
            result.push_str(",\n");
            result.push_str(&" ".repeat(indentation * depth));
        } else {
            result.push(c);
        }
    }

    result
}
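A minimal sketch of what this produces, assuming a hypothetical nested type name (note that closing brackets are emitted without indentation, which is good enough for debug output):

let formatted = format_type_name("KernelTask<CudaCompiler, MyKernel<f32>>");
assert_eq!(
    formatted,
    "KernelTask<\n    CudaCompiler,\n    MyKernel<\n        f32,\n>,\n>"
);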

/// Kernel trait with the ComputeShader that will be compiled and cached based on the
@@ -39,9 +109,11 @@ impl<C: Compiler, K: Kernel> CubeTask for KernelTask<C, K> {
        let source = lower_level_ir.to_string();

        CompiledKernel {
            name: Some(core::any::type_name::<K>()),
            source,
            cube_dim,
            shared_mem_bytes,
            lang_tag: None,
        }
    }

30 changes: 29 additions & 1 deletion crates/cubecl-cuda/src/compiler/binary.rs
@@ -151,7 +151,35 @@ impl Binary for IndexAssign {
Rhs: Component,
Out: Component,
{
f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n"))
        let item_out = out.item();
        let item_rhs = rhs.item();

        if item_out.vectorization != item_rhs.vectorization {
            // The rhs is a scalar while the output item is vectorized:
            // broadcast the scalar into a temporary vector before the store.
            let is_vec_native = item_out.is_vec_native();
            f.write_str("{\n")?;
            let var = "scalar_broadcasted";
            f.write_fmt(format_args!("{item_out} {var};\n"))?;
            for i in 0..item_out.vectorization {
                if is_vec_native {
                    // Native CUDA vector types use .x/.y/.z/.w component accessors.
                    let char = match i {
                        0 => 'x',
                        1 => 'y',
                        2 => 'z',
                        3 => 'w',
                        _ => panic!("Invalid vectorization: native vector types have at most 4 components"),
                    };
                    f.write_fmt(format_args!("{var}.{char} = {rhs};\n"))?;
                } else {
                    f.write_fmt(format_args!("{var}.i_{i} = {rhs};\n"))?;
                }
            }
            f.write_fmt(format_args!("{out}[{lhs}] = {var};\n"))?;
            f.write_str("}")?;

            Ok(())
        } else {
            f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n"))
        }
    }
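To make the broadcast concrete: for a hypothetical store out_0[i] = rhs_0, where out_0 has a native float4 item and rhs_0 is a scalar, the branch above emits roughly the following CUDA (before clang-format cleans it up):

{
float4 scalar_broadcasted;
scalar_broadcasted.x = rhs_0;
scalar_broadcasted.y = rhs_0;
scalar_broadcasted.z = rhs_0;
scalar_broadcasted.w = rhs_0;
out_0[i] = scalar_broadcasted;
}

For non-native vector items the components are written as scalar_broadcasted.i_0, scalar_broadcasted.i_1, and so on.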

fn unroll_vec(
32 changes: 26 additions & 6 deletions crates/cubecl-cuda/src/compiler/kernel.rs
@@ -1,6 +1,6 @@
use super::{Body, Item};
use cubecl_core::{ir::CubeDim, CompilerRepresentation};
use std::{collections::HashSet, fmt::Display};
use std::{collections::HashSet, fmt::Display, io::Write, process::Command};

#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Binding {
@@ -84,11 +84,7 @@ impl Display for ComputeKernel {
f.write_str("using namespace nvcuda;\n")?;
}

f.write_str(
"
typedef unsigned int uint;
",
)?;
f.write_str("typedef unsigned int uint;\n")?;

for item in self.items.iter() {
if item.is_vec_native() {
@@ -155,3 +151,27 @@ extern \"C\" __global__ void kernel(
Ok(())
}
}

/// Format C++ code, useful when debugging.
pub(crate) fn format_cpp_code(code: &str) -> Result<String, std::io::Error> {
    let mut child = Command::new("clang-format")
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .spawn()?;

    // Write the source to clang-format's stdin; wait_with_output closes the
    // pipe before collecting stdout.
    {
        let stdin = child.stdin.as_mut().expect("Failed to open stdin");
        stdin.write_all(code.as_bytes())?;
    }

    let output = child.wait_with_output()?;

    if output.status.success() {
        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
    } else {
        Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "clang-format failed",
        ))
    }
}
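A minimal usage sketch, assuming clang-format is available on PATH; the call site in server.rs below deliberately ignores the error, so a missing clang-format never breaks compilation:

let raw = "extern \"C\" __global__ void kernel(float *out) { out[0] = 1.0f; }";
match format_cpp_code(raw) {
    Ok(formatted) => println!("{formatted}"),
    Err(err) => eprintln!("falling back to unformatted source: {err}"),
}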
4 changes: 2 additions & 2 deletions crates/cubecl-cuda/src/compiler/unary.rs
@@ -63,7 +63,7 @@ macro_rules! function {
Elem::F162 => f.write_fmt(format_args!("{out} = h2{}({input});\n", $func)),
Elem::BF16 => f.write_fmt(format_args!("{out} = h{}({input});\n", $func)),
Elem::BF162 => f.write_fmt(format_args!("{out} = h2{}({input});\n", $func)),
Elem::F32 => f.write_fmt(format_args!("{out} = __{}f({input});\n", $func)),
Elem::F32 => f.write_fmt(format_args!("{out} = {}({input});\n", $func)),
_ => f.write_fmt(format_args!("{out} = {}({input});\n", $func)),
}
}
@@ -79,7 +79,7 @@ function!(Sin, "sin");
function!(Tanh, "tanh");
function!(Sqrt, "sqrt");
function!(Exp, "exp");
function!(Erf, "erff");
function!(Erf, "erf");
function!(Ceil, "ceil");
function!(Floor, "floor");
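A note on this fix: the old F32 arm combined the __{}f intrinsic prefix with function names that already carried an f suffix, so function!(Erf, "erff") would have emitted the nonexistent __erfff(...) for F32. With the plain arm, every element type that reaches it now gets the overloaded C++ call, e.g. out = erf(input); or out = exp(input);.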

96 changes: 75 additions & 21 deletions crates/cubecl-cuda/src/compute/server.rs
@@ -1,10 +1,13 @@
use super::storage::Binding;
use crate::compiler::format_cpp_code;

use super::storage::CudaStorage;
use super::CudaResource;
use cubecl_common::reader::{reader_from_concrete, Reader};
use cubecl_common::sync_type::SyncType;
use cubecl_core::ir::CubeDim;
use cubecl_core::prelude::*;
use cubecl_core::FeatureSet;
use cubecl_runtime::debug::DebugLogger;
use cubecl_runtime::{
memory_management::MemoryManagement,
server::{self, ComputeServer},
@@ -14,10 +17,12 @@ use cudarc::driver::sys::CUfunc_st;
use std::collections::HashMap;
use std::ffi::CStr;
use std::ffi::CString;
use std::path::PathBuf;

#[derive(Debug)]
pub struct CudaServer<MM: MemoryManagement<CudaStorage>> {
    state: CudaServerState<MM>,
    logger: DebugLogger,
    pub(crate) archs: Vec<i32>,
    pub(crate) minimum_arch_version: i32,
}
@@ -131,18 +136,18 @@ impl<MM: MemoryManagement<CudaStorage>> ComputeServer for CudaServer<MM> {
}
};

let ctx = self.get_context();
let (ctx, logger) = self.get_context_with_logger();

if !ctx.module_names.contains_key(&kernel_id) {
ctx.compile_kernel(&kernel_id, kernel, arch);
ctx.compile_kernel(&kernel_id, kernel, arch, logger);
}

let bindings = bindings
let resources = bindings
.into_iter()
.map(|binding| ctx.memory_management.get(binding.memory).as_binding())
.collect();
.map(|binding| ctx.memory_management.get(binding.memory))
.collect::<Vec<_>>();

ctx.execute_task(kernel_id, count, bindings);
ctx.execute_task(kernel_id, count, resources);
}

fn sync(&mut self, sync_type: SyncType) {
@@ -186,21 +191,29 @@ impl<MM: MemoryManagement<CudaStorage>> CudaContext<MM> {
};
}

fn compile_kernel(&mut self, kernel_id: &str, kernel: Box<dyn CubeTask>, arch: i32) {
let kernel_compiled = kernel.compile();
fn compile_kernel(
&mut self,
kernel_id: &str,
kernel: Box<dyn CubeTask>,
arch: i32,
logger: &mut DebugLogger,
) {
let mut kernel_compiled = kernel.compile();
kernel_compiled.lang_tag = Some("cpp");

if let Ok(formatted) = format_cpp_code(&kernel_compiled.source) {
kernel_compiled.source = formatted;
}

let shared_mem_bytes = kernel_compiled.shared_mem_bytes;
let cube_dim = kernel_compiled.cube_dim;
let arch = format!("--gpu-architecture=sm_{}", arch);

#[cfg(target_os = "linux")]
let options = &[
arch.as_str(),
"--include-path=/usr/include",
"--include-path=/usr/include/cuda",
"--include-path=/usr/local/include/cuda",
];
#[cfg(not(target_os = "linux"))] // TODO: add include-path for other OS.
let options = &[arch.as_str()];
let include_path = include_path();
let include_option = format!("--include-path={}", include_path.to_str().unwrap());
let options = &[arch.as_str(), include_option.as_str()];

let kernel_compiled = logger.debug(kernel_compiled);

let ptx = unsafe {
let program = cudarc::nvrtc::result::create_program(kernel_compiled.source).unwrap();
@@ -241,8 +254,13 @@ impl<MM: MemoryManagement<CudaStorage>> CudaContext<MM> {
&mut self,
kernel_id: String,
dispatch_count: (u32, u32, u32),
mut bindings: Vec<Binding>,
resources: Vec<CudaResource>,
) {
let mut bindings = resources
.iter()
.map(|memory| memory.as_binding())
.collect::<Vec<_>>();

let kernel = self.module_names.get(&kernel_id).unwrap();
let cube_dim = kernel.cube_dim;
unsafe {
@@ -257,7 +275,7 @@ impl<MM: MemoryManagement<CudaStorage>> CudaContext<MM> {
.unwrap();
};

self.memory_management.storage().flush()
self.memory_management.storage().flush(resources)
}
}
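A design note, as a hedged reading of this change: execute_task now receives the full CudaResource values rather than pre-extracted bindings, builds the raw bindings itself, and hands the resources to storage().flush(resources), presumably so the backing allocations stay alive until the flush point rather than being recycled mid-launch.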

@@ -281,12 +299,17 @@ impl<MM: MemoryManagement<CudaStorage>> CudaServer<MM> {
device_index: index,
init,
},
logger: DebugLogger::new(),
archs,
minimum_arch_version,
}
}

fn get_context(&mut self) -> &mut CudaContext<MM> {
self.get_context_with_logger().0
}

fn get_context_with_logger(&mut self) -> (&mut CudaContext<MM>, &mut DebugLogger) {
if let CudaServerState::Uninitialized { device_index, init } = &self.state {
let ctx = init(*device_index);
self.state = CudaServerState::Initialized { ctx };
@@ -295,9 +318,40 @@ impl<MM: MemoryManagement<CudaStorage>> CudaServer<MM> {
unsafe {
cudarc::driver::result::ctx::set_current(ctx.context).unwrap();
};
ctx
(ctx, &mut self.logger)
} else {
panic!("Context should be initialized");
}
}
}
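One design note: get_context_with_logger returns the logger together with the context because both are fields of the same CudaServer; splitting the borrows in a single &mut self call satisfies the borrow checker, where two separate calls (one holding &mut CudaContext, one taking &mut self.logger) would not. A condensed sketch of the resulting compile path, restating the calls above for orientation:

let (ctx, logger) = self.get_context_with_logger();
if !ctx.module_names.contains_key(&kernel_id) {
    // compile_kernel tags the source as "cpp", pretty-prints it via clang-format
    // when available, passes it through DebugLogger::debug (which hands the
    // kernel back), and only then JIT-compiles it with NVRTC.
    ctx.compile_kernel(&kernel_id, kernel, arch, logger);
}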

fn include_path() -> PathBuf {
    let mut path = cuda_path().expect(
        "
CUDA installation not found.
Please ensure that CUDA is installed and the CUDA_PATH environment variable is set correctly.
Note: Default paths are used for Linux (/usr/local/cuda) and Windows (C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/), which may not be correct.
",
    );
    path.push("include");
    path
}

fn cuda_path() -> Option<PathBuf> {
    // An explicit CUDA_PATH always takes precedence.
    if let Ok(path) = std::env::var("CUDA_PATH") {
        return Some(PathBuf::from(path));
    }

    // Fall back to the conventional per-OS install location. The path is not
    // verified to exist, hence the warning in `include_path`.
    #[cfg(target_os = "linux")]
    {
        return Some(PathBuf::from("/usr/local/cuda"));
    }

    #[cfg(target_os = "windows")]
    {
        return Some(PathBuf::from(
            "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/",
        ));
    }

    #[allow(unreachable_code)]
    None
}
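A quick sketch of the lookup order; the override value is hypothetical, and set_var affects the whole process, so this is illustration only:

std::env::set_var("CUDA_PATH", "/opt/cuda-12.3");
assert_eq!(cuda_path(), Some(PathBuf::from("/opt/cuda-12.3")));
// With CUDA_PATH unset, the per-OS default is returned (e.g. /usr/local/cuda on
// Linux), and include_path() appends "include" to whatever was found.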