shadertui live reloading demo initial implementation

AnswerDotAI · Jul 11, 2024 · fb56cf0 · fb56cf0
1 parent f983bcf
commit fb56cf0
Show file tree

Hide file tree

Showing 8 changed files with 304 additions and 9 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -14,6 +14,7 @@ In order of beginning to advanced:
 | [gpu_puzzles](gpu_puzzles) | (WIP) Implementation of Sasha Rush's GPU puzzles
 | [render](render) | GPU rendering of a signed distance function for a 3D sphere. |
 | [physics](physics) | Parallel physics simulation of a double pendulum with each thread starting at a different initial condition. |
+| [matmul](matmul) | Tiled matrix multiplication. |
 | [webgpu_from_scratch](webgpu_from_scratch) | A minimal from-scratch example of how to use WebGPU directly without this library. This is useful to understand the code internals of gpu.cpp. Note this takes a while to build as it compiles the WebGPU C API implementation. |
 
 Future examples that may be added (collaborations welcome):

diff --git a/examples/matmul/run.cpp b/examples/matmul/run.cpp
@@ -354,10 +354,10 @@ void runTest(int version, size_t M, size_t K, size_t N,
 
  // Report performance
  auto duration =
- std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+ std::chrono::duration_cast<std::chrono::seconds>(end - start);
  float gflops = 2 * M * N *
  K / // factor of 2 for multiplication & accumulation
- (static_cast<float>(duration.count()) / 1000.0) /
+ (static_cast<float>(duration.count())) /
  1000000000.0 * static_cast<float>(nIter);
  LOG(kDefLog, kInfo,
  "Execution Time: (M = %d, K = %d, N = %d) x %d iterations : %.1f "

diff --git a/examples/shadertui/Makefile b/examples/shadertui/Makefile
@@ -0,0 +1,22 @@
+# Build/run/watch rules for the shadertui live-reloading demo.
+CXX=clang++
+GPUCPP ?= $(PWD)/../..
+LIBDIR ?= $(GPUCPP)/third_party/lib
+# Shell snippet that sources the environment script before running the binary.
+LIBSPEC ?= . $(GPUCPP)/source
+# nproc is not available everywhere (e.g. macOS); fall back to sysctl, then 1.
+NUM_JOBS ?= $(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
+TARGET=shadertui
+FLAGS=-stdlib=libc++ -std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -L$(LIBDIR) run.cpp -ldl -ldawn
+# Files whose modification triggers a rebuild under `make watch`.
+CODEPATH = find . ../../utils ../../ -maxdepth 1 -type f
+
+# These targets do not produce files of the same name.
+.PHONY: run watch clean
+
+run: ./build/$(TARGET)
+	$(LIBSPEC) && ./build/$(TARGET)
+
+# Use clang -v to see the include paths
+build/$(TARGET): run.cpp
+	mkdir -p build && $(CXX) $(FLAGS) -o ./build/$(TARGET)
+
+watch:
+	@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
+	mkdir -p build && $(CODEPATH) | entr -s "$(LIBSPEC) && rm -f ./build/$(TARGET) && $(MAKE) -j$(NUM_JOBS) run"
+
+# `read -p` is a bash extension and fails under a POSIX /bin/sh (make's default
+# shell), so print the prompt with printf and use plain `read -r`.
+clean:
+	@printf "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " && read -r response && rm -rf build/*
diff --git a/examples/shadertui/run.cpp b/examples/shadertui/run.cpp
@@ -0,0 +1,123 @@
+#include "gpu.h"
+#include <array>
+#include <cstdio>
+#include <fstream>
+#include <future>
+#include <string>
+#include <thread>
+
+#include "utils/array_utils.h"
+#include "utils/logging.h"
+
+using namespace gpu;
+
+/**
+ * @brief Converts a grid of intensities into ASCII art.
+ *
+ * Each value is mapped onto a character ramp running from ' ' (darkest) to
+ * '@' (brightest). Values are expected in [0, 1]; anything outside that range
+ * (including NaN, which maps to the darkest glyph) is clamped before the
+ * conversion to an index.
+ *
+ * @tparam rows Number of rows in the grid.
+ * @tparam cols Number of columns in the grid.
+ * @param values Row-major intensities, rows * cols entries.
+ * @param raster Output characters: each row holds cols glyphs followed by
+ *        '\n'. The buffer is NOT NUL-terminated.
+ */
+template <size_t rows, size_t cols>
+void rasterize(const std::array<float, rows * cols> &values,
+               std::array<char, rows *(cols + 1)> &raster) {
+  static const char intensity[] = " .`'^-+=*x17X$8#%@";
+  // Index of the brightest glyph; sizeof also counts the trailing '\0'.
+  constexpr size_t kMaxIndex = sizeof(intensity) - 2;
+  for (size_t i = 0; i < rows; ++i) {
+    for (size_t j = 0; j < cols; ++j) {
+      // Clamp in floating point first: converting a negative or NaN float to
+      // an unsigned integer is undefined behaviour. std::max(0.0f, NaN)
+      // yields 0.0f because the comparison with NaN is false.
+      const float clamped =
+          std::min(1.0f, std::max(0.0f, values[i * cols + j]));
+      const size_t index =
+          std::min(kMaxIndex, static_cast<size_t>(
+                                  clamped * static_cast<float>(kMaxIndex)));
+      raster[i * (cols + 1) + j] = intensity[index];
+    }
+    raster[i * (cols + 1) + cols] = '\n';
+  }
+}
+
+/**
+ * @brief Time elapsed since a reference point.
+ *
+ * NOTE: despite the function name, the result is expressed in SECONDS
+ * (std::chrono::duration<float> uses a 1/1 period by default).
+ *
+ * @param zeroTime Reference time point to measure from.
+ * @return Elapsed time since zeroTime, in seconds, as a float.
+ */
+float getCurrentTimeInMilliseconds(
+    std::chrono::time_point<std::chrono::high_resolution_clock> &zeroTime) {
+  using Clock = std::chrono::high_resolution_clock;
+  const auto sinceZero = Clock::now() - zeroTime;
+  return std::chrono::duration<float>(sinceZero).count();
+}
+
+/**
+ * @brief Reads an entire text file into a string.
+ *
+ * If the file cannot be opened, the call blocks and retries every 100 ms
+ * until it can (the file may be briefly missing while an editor is saving
+ * it).
+ *
+ * @param filename Path of the file to read.
+ * @param codeString Output; previous contents are discarded and replaced by
+ *        the file contents.
+ */
+void loadShaderCode(const std::string &filename, std::string &codeString) {
+  codeString.clear();
+  FILE *file = fopen(filename.c_str(), "r");
+  while (!file) {
+    // Nothing to close here: the handle is null, and fclose(nullptr) is
+    // undefined behaviour. Just wait and try again.
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    file = fopen(filename.c_str(), "r");
+  }
+  char buffer[4096];
+  while (fgets(buffer, sizeof(buffer), file)) {
+    codeString += buffer;
+  }
+  fclose(file);
+}
+
+/**
+ * @brief Entry point of the shadertui live-reloading demo.
+ *
+ * Compiles shader.wgsl into a compute kernel, then loops forever: updates the
+ * time parameter, dispatches the kernel, copies the result back to the CPU,
+ * rasterizes it to ASCII and prints it to the terminal. While running, the
+ * shader file is re-read and the kernel rebuilt whenever its text changes.
+ */
+int main() {
+
+  Context ctx = createContext();
+  static constexpr size_t kRows = 40;
+  static constexpr size_t kCols = 70;
+
+  LOG(kDefLog, kInfo, "Creating screen tensor");
+
+  std::array<float, kRows * kCols> screenArr;
+  screenArr.fill(0.0f);
+  Tensor screen = createTensor(ctx, {kRows, kCols}, kf32, screenArr.data());
+
+  std::string codeString;
+  // Field order mirrors the Params struct declared in shader.wgsl.
+  struct Params {
+    float time;
+    uint32_t screenWidth;
+    uint32_t screenHeight;
+  } params = {0.0, kCols, kRows};
+
+  LOG(kDefLog, kInfo, "Loading shader code from shader.wgsl");
+  loadShaderCode("shader.wgsl", codeString);
+
+  LOG(kDefLog, kInfo, "Creating shader and kernel");
+  ShaderCode shader = createShader(codeString.c_str(), Shape{16, 16, 1});
+  Kernel renderKernel =
+      createKernel(ctx, shader, Bindings{screen},
+                   cdiv({kCols, kRows, 1}, shader.workgroupSize), params);
+
+  LOG(kDefLog, kInfo, "Starting render loop");
+
+  // Filled by rasterize(); holds kRows lines of kCols glyphs + '\n' and is
+  // not NUL-terminated.
+  std::array<char, kRows *(kCols + 1)> raster;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  // Explicitly zeroed: a default-constructed duration<float> holds an
+  // indeterminate value, and it is read before the first frame assigns it.
+  std::chrono::duration<float> elapsed{0.0f};
+  size_t ticks = 0; // number of times the shader has been reloaded
+  while (true) {
+    // NOTE(review): ticks only advances on an actual reload, so once more
+    // than a second has passed this check runs (and re-reads the file) on
+    // every frame until the shader text changes -- confirm this is intended.
+    if (elapsed.count() - static_cast<float>(ticks) > 1.0) {
+      loadShaderCode("shader.wgsl", codeString);
+      if (codeString != shader.data) {
+        shader = createShader(codeString.c_str(), Shape{16, 16, 1});
+        renderKernel =
+            createKernel(ctx, shader, Bindings{screen},
+                         cdiv({kCols, kRows, 1}, shader.workgroupSize), params);
+        ticks++;
+      }
+    }
+    params.time = getCurrentTimeInMilliseconds(start);
+    // Presumably the last kernel buffer is the params uniform -- verify
+    // against createKernel().
+    wgpuQueueWriteBuffer(ctx.queue,
+                         renderKernel.buffers[renderKernel.numBindings - 1], 0,
+                         static_cast<void *>(&params), sizeof(params));
+    auto frameStart = std::chrono::high_resolution_clock::now();
+    std::promise<void> promise;
+    std::future<void> future = promise.get_future();
+    dispatchKernel(ctx, renderKernel, promise);
+    wait(ctx, future);
+    resetCommandBuffer(ctx.device, renderKernel);
+    toCPU(ctx, screen, screenArr);
+    rasterize<kRows, kCols>(screenArr, raster);
+    auto frameEnd = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<float> frameElapsed = frameEnd - frameStart;
+    elapsed = frameEnd - start;
+    // Target roughly 20 ms per frame; a negative remainder returns at once.
+    std::this_thread::sleep_for(std::chrono::milliseconds(20) - frameElapsed);
+    // "%.*s" bounds the read to the raster size since the buffer carries no
+    // terminating NUL; "\033[H\033[J" homes the cursor and clears the screen.
+    printf("\033[H\033[J%.*s\nReloaded file %zu times\n",
+           static_cast<int>(raster.size()), raster.data(), ticks);
+  }
+
+  // Unreachable while the loop above has no exit condition.
+  LOG(kDefLog, kInfo, "Done");
+}
diff --git a/examples/shadertui/shader.wgsl b/examples/shadertui/shader.wgsl
@@ -0,0 +1,27 @@
+// Output image: one f32 intensity per screen cell, indexed row-major as
+// y * screenwidth + x by main().
+@group(0) @binding(0) var<storage, read_write> out: array<f32>;
+// Per-frame parameters uploaded by the host (run.cpp) before each dispatch.
+@group(0) @binding(1) var<uniform> params: Params;
+
+// Field order mirrors the host-side Params struct in run.cpp:
+// elapsed time (the host passes seconds), then screen width and height in
+// cells.
+struct Params {
+ time: f32,
+ screenwidth: u32,
+ screenheight: u32,
+};
+
+// Signed distance from point p to the circle of radius r centred at c:
+// negative inside the circle, zero on its boundary, positive outside.
+fn sdf(p: vec2<f32>, c: vec2<f32>, r: f32) -> f32 {
+  let toCenter: f32 = distance(p, c);
+  return toCenter - r;
+}
+
+// Renders one screen cell per invocation: two animated circle outlines plus a
+// vertical sine gradient, written to out[] as a single intensity value.
+@compute @workgroup_size(16, 16, 1)
+fn main(@builtin(global_invocation_id) globalID : vec3<u32>) {
+  // The dispatch covers whole 16x16 workgroups, so some invocations fall
+  // outside the screen. Without this guard an out-of-range x would wrap into
+  // the next row and race with that cell's own write.
+  if (globalID.x >= params.screenwidth || globalID.y >= params.screenheight) {
+    return;
+  }
+  // Normalized cell coordinates in [0, 1).
+  let xy: vec2<f32> =
+      vec2<f32>(f32(globalID.x) / f32(params.screenwidth),
+                f32(globalID.y) / f32(params.screenheight));
+  // Divisor acts as an animation speed knob.
+  let t: f32 = params.time / 1.0;
+  let idx = globalID.y * params.screenwidth + globalID.x;
+  let center = vec2<f32>(0.5, 0.5 + 0.3 * sin(3.0 * t));
+  let center2 = vec2<f32>(0.5 + 0.2 * cos(3.0 * t), 0.5);
+  // out[idx] += 0.4 - min(5 * abs(sdf(xy, center, 0.2)), 0.4) + 0.5 * cos(xy.y + t) + 0.5 * sin(xy.x);
+  // Accumulate locally and store once instead of read-modify-writing out[].
+  var value: f32 = 0.3 - min(5.0 * abs(sdf(xy, center, 0.2)), 0.3);
+  value += 0.3 - min(5.0 * abs(sdf(xy, center2, 0.2)), 0.3);
+  value += 0.4 * sin(xy.y + t);
+  out[idx] = value;
+}
diff --git a/gpu.h b/gpu.h
@@ -171,10 +171,10 @@ inline std::string toString(const Shape &shape) {
 inline std::string toString(size_t value) { return std::to_string(value); }
 
 /**
- * @brief Represents a shader code.
- * workgroup size and precision are stored since they are specified in the
- * shader code and making the values available helps keep parameters
- * consistent.
+ * @brief Represents shader code. Wrapper type around the code string with
+ * additional metadata for workgroup size and precision since they are
+ * specified in the shader code. Additionally, label and entryPoint are used by
+ * `createKernel()` to specify the label and entry point of the shader.
  */
 struct ShaderCode {
  inline ShaderCode(const std::string &data = "", size_t workgroupSize = 256,
@@ -694,7 +694,7 @@ inline void toCPU(Context &ctx, Tensor &tensor, float *data,
  * @example toCPU(ctx, tensor, data);
  */
 template <size_t N>
-void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> data) {
+void toCPU(Context &ctx, Tensor &tensor, std::array<float, N>& data) {
  toCPU(ctx, tensor, data.data(), sizeof(data));
 }
 
@@ -922,8 +922,8 @@ inline Kernel createKernel(Context &ctx, const ShaderCode &shader,
  computePipelineDesc.layout = pipelineLayout;
  computePipelineDesc.compute.module =
  wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
- computePipelineDesc.compute.entryPoint = "main";
- computePipelineDesc.label = "compute pipeline";
+ computePipelineDesc.compute.entryPoint = shader.entryPoint.c_str();
+ computePipelineDesc.label = shader.label.c_str();
  op.computePipeline =
  wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
  }