From d41dd0c00381fb935e4389927ee4bef0b8aba785 Mon Sep 17 00:00:00 2001
From: Nathaniel Simard <nathaniel.simard.42@gmail.com>
Date: Sun, 25 Aug 2024 14:16:19 -0400
Subject: [PATCH] Use simple memory management with wasm (#81)

---
 crates/cubecl-core/src/codegen/integrator.rs  |  5 ++
 .../tests/error/for_loop_range.stderr         |  2 +-
 .../src/codegen_function/branch.rs            |  4 +-
 crates/cubecl-wgpu/Cargo.toml                 |  4 ++
 crates/cubecl-wgpu/build.rs                   |  8 +++
 .../cubecl-wgpu/src/compiler/wgsl/shader.rs   |  5 ++
 crates/cubecl-wgpu/src/runtime.rs             | 60 ++++++++++++-------
 crates/cubecl/Cargo.toml                      |  1 +
 8 files changed, 64 insertions(+), 25 deletions(-)
 create mode 100644 crates/cubecl-wgpu/build.rs

diff --git a/crates/cubecl-core/src/codegen/integrator.rs b/crates/cubecl-core/src/codegen/integrator.rs
index c55ee2d3..7bf25c23 100644
--- a/crates/cubecl-core/src/codegen/integrator.rs
+++ b/crates/cubecl-core/src/codegen/integrator.rs
@@ -495,6 +495,11 @@ impl KernelIntegrator {
         let output = match self.expansion.outputs.get_mut(mapping.pos_output) {
             Some(output) => output,
             None => {
+                if let Some(binding) = self.input_bindings.get_mut(mapping.pos_input) {
+                    // Update input visibility.
+                    binding.visibility = Visibility::ReadWrite;
+                }
+
                 // The mapping is handled differently, normally by cube itself.
                 return;
             }
diff --git a/crates/cubecl-core/tests/error/for_loop_range.stderr b/crates/cubecl-core/tests/error/for_loop_range.stderr
index 947b5817..0a31e86c 100644
--- a/crates/cubecl-core/tests/error/for_loop_range.stderr
+++ b/crates/cubecl-core/tests/error/for_loop_range.stderr
@@ -1,4 +1,4 @@
-error: Invalid for loop: use [range](cubecl::prelude::range] instead.
+error: Invalid for loop: use [range](cubecl::prelude::range] or [range_stepped](cubecl::prelude::range_stepped) instead.
  --> tests/error/for_loop_range.rs:6:14
   |
 6 |     for _ in 0..10 {}
diff --git a/crates/cubecl-macros/src/codegen_function/branch.rs b/crates/cubecl-macros/src/codegen_function/branch.rs
index bdadef0b..0305aff5 100644
--- a/crates/cubecl-macros/src/codegen_function/branch.rs
+++ b/crates/cubecl-macros/src/codegen_function/branch.rs
@@ -15,11 +15,11 @@ use super::{
 
 /// Codegen of for loops
 /// Supports range:
-/// ```norun
+/// ```ignore
 /// for i in range(start, end, unroll) {...}
 /// ```
 /// and range_stepped:
-/// ```norun
+/// ```ignore
 /// for i in range_stepped(start, end, step, unroll) {...}
 /// ```
 pub(crate) fn codegen_for_loop(
diff --git a/crates/cubecl-wgpu/Cargo.toml b/crates/cubecl-wgpu/Cargo.toml
index 88bcbeb7..86d4e5c5 100644
--- a/crates/cubecl-wgpu/Cargo.toml
+++ b/crates/cubecl-wgpu/Cargo.toml
@@ -17,6 +17,7 @@ default = [
   "cubecl-core/default",
 ]
 std = ["cubecl-runtime/std", "cubecl-common/std", "cubecl-core/std"]
+simple-memory-management = []
 
 [dependencies]
 cubecl-runtime = { path = "../cubecl-runtime", version = "0.1.1", default-features = false, features = [
@@ -41,3 +42,6 @@ cubecl-core = { path = "../cubecl-core", version = "0.1.1", features = [
 cubecl-linalg = { path = "../cubecl-linalg", version = "0.1.1", features = [
   "export_tests",
 ] }
+
+[build-dependencies]
+cfg_aliases = "0.2.1"
diff --git a/crates/cubecl-wgpu/build.rs b/crates/cubecl-wgpu/build.rs
new file mode 100644
index 00000000..c28f0edd
--- /dev/null
+++ b/crates/cubecl-wgpu/build.rs
@@ -0,0 +1,8 @@
+use cfg_aliases::cfg_aliases;
+
+fn main() {
+    // Define the `simple_memory_management` cfg: set by the feature or when targeting wasm.
+    cfg_aliases! {
+        simple_memory_management: { any(feature = "simple-memory-management", target_family = "wasm") },
+    }
+}
diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/shader.rs b/crates/cubecl-wgpu/src/compiler/wgsl/shader.rs
index a43bd0cb..cf42386d 100644
--- a/crates/cubecl-wgpu/src/compiler/wgsl/shader.rs
+++ b/crates/cubecl-wgpu/src/compiler/wgsl/shader.rs
@@ -228,7 +228,12 @@ impl Display for Location {
 impl Display for Visibility {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
+            // The dynamic memory strategy requires every binding to be declared `read_write`.
+            #[cfg(not(simple_memory_management))]
             Visibility::Read => f.write_str("read_write"),
+            // The simple memory strategy can emit the binding's actual visibility.
+            #[cfg(simple_memory_management)]
+            Visibility::Read => f.write_str("read"),
             Visibility::ReadWrite => f.write_str("read_write"),
         }
     }
diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs
index df935910..450cde12 100644
--- a/crates/cubecl-wgpu/src/runtime.rs
+++ b/crates/cubecl-wgpu/src/runtime.rs
@@ -5,13 +5,9 @@ use crate::{
 };
 use alloc::sync::Arc;
 use cubecl_core::{Feature, FeatureSet, Runtime};
-use cubecl_runtime::{
-    channel::MutexComputeChannel,
-    client::ComputeClient,
-    memory_management::dynamic::{DynamicMemoryManagement, DynamicMemoryManagementOptions},
-    ComputeRuntime,
-};
-use wgpu::DeviceDescriptor;
+use cubecl_runtime::memory_management;
+use cubecl_runtime::{channel::MutexComputeChannel, client::ComputeClient, ComputeRuntime};
+use wgpu::{DeviceDescriptor, Limits};
 
 /// Runtime that uses the [wgpu] crate with the wgsl compiler. This is used in the Wgpu backend.
 /// For advanced configuration, use [`init_sync`] to pass in runtime options or to select a
@@ -23,13 +19,42 @@ pub struct WgpuRuntime;
 static RUNTIME: ComputeRuntime<WgpuDevice, Server, MutexComputeChannel<Server>> =
     ComputeRuntime::new();
 
-type Server = WgpuServer<DynamicMemoryManagement<WgpuStorage>>;
+type Server = WgpuServer<MemoryManagement>;
+
+#[cfg(not(simple_memory_management))]
+type MemoryManagement = memory_management::dynamic::DynamicMemoryManagement<WgpuStorage>;
+#[cfg(simple_memory_management)]
+type MemoryManagement = memory_management::simple::SimpleMemoryManagement<WgpuStorage>;
+
+/// Builds the dynamic memory management from the device's storage-buffer limits.
+#[cfg(not(simple_memory_management))]
+fn init_memory_management(device: Arc<wgpu::Device>, limits: &Limits) -> MemoryManagement {
+    // `device` is owned and used only once, so it is moved instead of cloned.
+    memory_management::dynamic::DynamicMemoryManagement::new(
+        WgpuStorage::new(device),
+        memory_management::dynamic::DynamicMemoryManagementOptions::preset(
+            limits.max_storage_buffer_binding_size as usize,
+            limits.min_storage_buffer_offset_alignment as usize,
+        ),
+    )
+}
+
+/// Builds the simple memory management; the device limits are not used by this strategy.
+#[cfg(simple_memory_management)]
+fn init_memory_management(device: Arc<wgpu::Device>, _limits: &Limits) -> MemoryManagement {
+    // `device` is owned and used only once, so it is moved instead of cloned.
+    memory_management::simple::SimpleMemoryManagement::new(
+        WgpuStorage::new(device),
+        memory_management::simple::DeallocStrategy::new_period_tick(32),
+        memory_management::simple::SliceStrategy::Ratio(0.8),
+    )
+}
 
 impl Runtime for WgpuRuntime {
     type Compiler = wgsl::WgslCompiler;
-    type Server = WgpuServer<DynamicMemoryManagement<WgpuStorage>>;
+    type Server = WgpuServer<MemoryManagement>;
 
-    type Channel = MutexComputeChannel<WgpuServer<DynamicMemoryManagement<WgpuStorage>>>;
+    type Channel = MutexComputeChannel<WgpuServer<MemoryManagement>>;
     type Device = WgpuDevice;
 
     fn client(device: &Self::Device) -> ComputeClient<Self::Server, Self::Channel> {
@@ -112,19 +137,10 @@ fn create_client(
     device_wgpu: Arc<wgpu::Device>,
     queue: Arc<wgpu::Queue>,
     options: RuntimeOptions,
-) -> ComputeClient<
-    WgpuServer<DynamicMemoryManagement<WgpuStorage>>,
-    MutexComputeChannel<WgpuServer<DynamicMemoryManagement<WgpuStorage>>>,
-> {
+) -> ComputeClient<WgpuServer<MemoryManagement>, MutexComputeChannel<WgpuServer<MemoryManagement>>>
+{
     let limits = device_wgpu.limits();
-    let storage = WgpuStorage::new(device_wgpu.clone());
-    let memory_management = DynamicMemoryManagement::new(
-        storage,
-        DynamicMemoryManagementOptions::preset(
-            limits.max_storage_buffer_binding_size as usize,
-            limits.min_storage_buffer_offset_alignment as usize,
-        ),
-    );
+    let memory_management = init_memory_management(device_wgpu.clone(), &limits);
     let server = WgpuServer::new(memory_management, device_wgpu, queue, options.tasks_max);
     let channel = MutexComputeChannel::new(server);
 
diff --git a/crates/cubecl/Cargo.toml b/crates/cubecl/Cargo.toml
index 3bed4202..45b6a0fa 100644
--- a/crates/cubecl/Cargo.toml
+++ b/crates/cubecl/Cargo.toml
@@ -22,6 +22,7 @@ default = ["std", "linalg", "cubecl-core/default", "cubecl-wgpu?/default", "cube
 std = ["cubecl-core/std", "cubecl-wgpu?/std", "cubecl-cuda?/std"]
 template = ["cubecl-core/template"]
 linalg = ["dep:cubecl-linalg"]
+simple-memory-management = ["cubecl-wgpu?/simple-memory-management"]
 
 # Runtimes
 wgpu = ["cubecl-wgpu"]