diff --git a/example/cartpole/CMakeLists.txt b/example/cartpole/CMakeLists.txt
deleted file mode 100644
index 5f4156c..0000000
--- a/example/cartpole/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-cmake_minimum_required(VERSION 3.15)
-
-set(PROJECT_NAME "cartpole")
-
-set(PROJECT_INCLUDES
-    inc
-)
-
-set(PROJECT_SOURCES
-    main.c
-)
-
-project(${PROJECT_NAME})
-
-
-add_executable(${PROJECT_NAME} ${PROJECT_SOURCES})
-target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_INCLUDES})
-
-add_subdirectory(../../nn ./build/nn)
-target_link_libraries(${PROJECT_NAME} PUBLIC nn)
-
-
diff --git a/example/cartpole/README.md b/example/cartpole/README.md
deleted file mode 100644
index e69de29..0000000
diff --git a/example/cartpole/main.c b/example/cartpole/main.c
deleted file mode 100644
index e69de29..0000000
diff --git a/example/diffuse-loco/CMakeLists.txt b/example/diffuse-loco/CMakeLists.txt
new file mode 100644
index 0000000..96b2656
--- /dev/null
+++ b/example/diffuse-loco/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(diffuse-loco LANGUAGES C)
+
+
+add_executable(diffuse-loco main.c)
+target_include_directories(diffuse-loco PUBLIC inc)
+
+target_compile_features(diffuse-loco INTERFACE c_std_11)
+
+if (X86)
+    message("diffuse-loco: building for x86")
+    target_link_libraries(diffuse-loco PUBLIC target-x86)
+
+elseif (RISCV)
+    message("diffuse-loco: building for RISC-V")
+    # CMake toolchain definition for RISC-V GCC toolchain
+    set(CMAKE_SYSTEM_NAME "Generic" CACHE STRING "")
+    set(CMAKE_SYSTEM_PROCESSOR "riscv" CACHE STRING "")
+
+    set(TOOLCHAIN_PREFIX "riscv64-unknown-elf-")
+
+    set(CMAKE_C_COMPILER "${TOOLCHAIN_PREFIX}gcc")
+    set(CMAKE_ASM_COMPILER "${TOOLCHAIN_PREFIX}gcc")
+    set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PREFIX}g++")
+    set(CMAKE_AR "${TOOLCHAIN_PREFIX}ar")
+    set(CMAKE_LINKER "${TOOLCHAIN_PREFIX}ld")
+    set(CMAKE_OBJCOPY "${TOOLCHAIN_PREFIX}objcopy")
+    set(CMAKE_SIZE "${TOOLCHAIN_PREFIX}size")
+    set(CMAKE_STRIP "${TOOLCHAIN_PREFIX}strip")
+
+    target_link_libraries(diffuse-loco PUBLIC target-riscv)
+endif ()
+
+add_compile_options(-O3 -Wall -Wextra)
+
+target_compile_options(diffuse-loco PRIVATE -u _printf_float)
+
+add_subdirectory(../../ ./build/)
+
+add_subdirectory(../../nn/ ./build/nn)
+target_link_libraries(diffuse-loco PUBLIC nn)
+
+target_link_libraries(diffuse-loco PUBLIC m)
+
diff --git a/example/diffuse-loco/README.md b/example/diffuse-loco/README.md
new file mode 100644
index 0000000..87c4b46
--- /dev/null
+++ b/example/diffuse-loco/README.md
@@ -0,0 +1,4 @@
+# DiffuseLoco
+
+
+
diff --git a/example/diffuse-loco/main.c b/example/diffuse-loco/main.c
new file mode 100644
index 0000000..33b9e1b
--- /dev/null
+++ b/example/diffuse-loco/main.c
@@ -0,0 +1,50 @@
+/**
+ * @file main.c
+ *
+ * A simple example running the DiffuseLoco actor network.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "rv.h"
+#include "nn.h"
+#include "model.h"
+
+
+// static void enable_vector_operations() {
+//   unsigned long mstatus;
+//   asm volatile("csrr %0, mstatus" : "=r"(mstatus));
+//   mstatus |= 0x00000600 | 0x00006000 | 0x00018000;
+//   asm volatile("csrw mstatus, %0"::"r"(mstatus));
+// }
+
+int main() {
+
+  // enable_vector_operations();
+
+  Model *model = malloc(sizeof(Model));
+
+  size_t cycles = 0;
+
+  printf("initializing model...\n");
+  init(model);
+
+  printf("setting input data...\n");
+  NN_fill(&model->input_1, 1.0);
+
+  // cycles = READ_CSR("mcycle");
+  forward(model);
+  // cycles = READ_CSR("mcycle") - cycles;
+
+  printf("cycles: %lu\n", cycles);
+
+  // output tensor([[ 0.0258, -0.0050,  0.0902, -0.0022, -0.0924, -0.0574,  0.0328,  0.0386, -0.0277,  0.0788,  0.0603, -0.0085]])
+
+  printf("output:\n");
+  NN_printf(&model->actor_6);
+
+  return 0;
+}
diff --git a/example/diffuse-loco/model.bin b/example/diffuse-loco/model.bin
new file mode 100644
index 0000000..955f2ef
Binary files /dev/null and b/example/diffuse-loco/model.bin differ
diff --git a/example/diffuse-loco/model.h b/example/diffuse-loco/model.h
new file mode 100644
index 0000000..7edefba
--- /dev/null
+++ b/example/diffuse-loco/model.h
@@ -0,0 +1,100 @@
+#ifndef __MODEL_H
+#define __MODEL_H
+
+#include "nn.h"
+
+
+// load the weight data block from the model.bin file
+INCLUDE_FILE(".rodata", "../model.bin", model_weight);
+extern uint8_t model_weight_data[];
+extern size_t model_weight_start[];
+extern size_t model_weight_end[];
+
+typedef struct {
+  Tensor input_1;
+  Tensor actor_0_weight;
+  Tensor actor_0_bias;
+  Tensor actor_0;
+  Tensor actor_1;
+  Tensor actor_2_weight;
+  Tensor actor_2_bias;
+  Tensor actor_2;
+  Tensor actor_3;
+  Tensor actor_4_weight;
+  Tensor actor_4_bias;
+  Tensor actor_4;
+  Tensor actor_5;
+  Tensor actor_6_weight;
+  Tensor actor_6_bias;
+  Tensor actor_6;
+
+} Model;
+
+
+void init(Model *model);
+
+void forward(Model *model);
+
+/**
+ * Initialize the required tensors for the model
+ */
+void init(Model *model) {
+  float *array_pointer = (float *)model_weight_data;
+
+  NN_initTensor(&model->input_1, 2, (size_t[]){1, 48}, DTYPE_F32, NULL);
+
+  // : actor_0
+  NN_initTensor(&model->actor_0_weight, 2, (size_t[]){512, 48}, DTYPE_F32, array_pointer);
+  array_pointer += 24576;
+  NN_initTensor(&model->actor_0_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer);
+  array_pointer += 512;
+  NN_initTensor(&model->actor_0, 2, (size_t[]){1, 512}, DTYPE_F32, NULL);
+
+  // : actor_1
+  NN_initTensor(&model->actor_1, 2, (size_t[]){1, 512}, DTYPE_F32, NULL);
+
+  // : actor_2
+  NN_initTensor(&model->actor_2_weight, 2, (size_t[]){256, 512}, DTYPE_F32, array_pointer);
+  array_pointer += 131072;
+  NN_initTensor(&model->actor_2_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer);
+  array_pointer += 256;
+  NN_initTensor(&model->actor_2, 2, (size_t[]){1, 256}, DTYPE_F32, NULL);
+
+  // : actor_3
+  NN_initTensor(&model->actor_3, 2, (size_t[]){1, 256}, DTYPE_F32, NULL);
+
+  // : actor_4
+  NN_initTensor(&model->actor_4_weight, 2, (size_t[]){128, 256}, DTYPE_F32, array_pointer);
+  array_pointer += 32768;
+  NN_initTensor(&model->actor_4_bias, 1, (size_t[]){128}, DTYPE_F32, array_pointer);
+  array_pointer += 128;
+  NN_initTensor(&model->actor_4, 2, (size_t[]){1, 128}, DTYPE_F32, NULL);
+
+  // : actor_5
+  NN_initTensor(&model->actor_5, 2, (size_t[]){1, 128}, DTYPE_F32, NULL);
+
+  // : actor_6
+  NN_initTensor(&model->actor_6_weight, 2, (size_t[]){12, 128}, DTYPE_F32, array_pointer);
+  array_pointer += 1536;
+  NN_initTensor(&model->actor_6_bias, 1, (size_t[]){12}, DTYPE_F32, array_pointer);
+  array_pointer += 12;
+  NN_initTensor(&model->actor_6, 2, (size_t[]){1, 12}, DTYPE_F32, NULL);
+
+}
+
+
+/**
+ * Forward pass of the model
+ */
+void forward(Model *model) {
+  NN_Linear(&model->actor_0, &model->input_1, &model->actor_0_weight, &model->actor_0_bias);
+  NN_ELU(&model->actor_1, &model->actor_0, 1.0);
+  NN_Linear(&model->actor_2, &model->actor_1, &model->actor_2_weight, &model->actor_2_bias);
+  NN_ELU(&model->actor_3, &model->actor_2, 1.0);
+  NN_Linear(&model->actor_4, &model->actor_3, &model->actor_4_weight, &model->actor_4_bias);
+  NN_ELU(&model->actor_5, &model->actor_4, 1.0);
+  NN_Linear(&model->actor_6, &model->actor_5, &model->actor_6_weight, &model->actor_6_bias);
+
+}
+
+#endif
\ No newline at end of file
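The float offsets that `init()` walks through `model.bin` can be cross-checked against the layer shapes above. A minimal sketch (assuming `model.bin` holds nothing but the float32 weights and biases, in the order `init()` consumes them):

```python
# Sketch: recompute the hard-coded offsets in model.h from the layer shapes.
# Assumption: model.bin is just float32 weight + bias blocks, in forward order.
layers = [(512, 48), (256, 512), (128, 256), (12, 128)]  # (out_features, in_features)

offset = 0
for out_f, in_f in layers:
    print(f"weight at float offset {offset:6d}  ({out_f}x{in_f})")
    offset += out_f * in_f   # e.g. 512 * 48 = 24576, matching "array_pointer += 24576"
    print(f"bias   at float offset {offset:6d}  ({out_f})")
    offset += out_f

print(f"total: {offset} floats = {offset * 4} bytes")  # expected size of model.bin
```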
diff --git a/example/diffuse-loco/scripts/run.py b/example/diffuse-loco/scripts/run.py
new file mode 100644
index 0000000..2e1f34a
--- /dev/null
+++ b/example/diffuse-loco/scripts/run.py
@@ -0,0 +1,42 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+import barstools
+
+
+torch.manual_seed(0)
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.actor = nn.Sequential(
+            nn.Linear(48, 512, bias=True),
+            nn.ELU(alpha=1.0),
+            nn.Linear(512, 256, bias=True),
+            nn.ELU(alpha=1.0),
+            nn.Linear(256, 128, bias=True),
+            nn.ELU(alpha=1.0),
+            nn.Linear(128, 12, bias=True),
+        )
+
+    def forward(self, input):
+        output = self.actor.forward(input)
+        return output
+
+# Tracing the module
+m = Net()
+
+# m.load_state_dict(torch.load("model.pth", map_location=torch.device("cpu")))
+m.eval()
+
+test_input = torch.ones((48, )).unsqueeze(0)
+
+print(test_input)
+
+with torch.no_grad():
+    output = m.forward(test_input)
+    print("output", output)
+
+output = barstools.TorchConverter(m).convert(test_input, output_dir=".")
+
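The expected output hard-coded in the `main.c` comment (and printed by `run.py`) can also be reproduced straight from the exported weights. A rough sketch, assuming each layer in `model.bin` is a row-major `[out, in]` float32 weight block immediately followed by its bias (as the `model.h` offsets indicate) and that `NN_Linear` computes `x @ W.T + b`:

```python
import numpy as np

# Sketch: replay the actor forward pass directly from model.bin.
# Layout assumption (from model.h): per layer, a row-major [out, in] float32
# weight block immediately followed by its bias, in forward order.
data = np.fromfile("model.bin", dtype=np.float32)

def elu(x, alpha=1.0):
    return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

x = np.ones(48, dtype=np.float32)
offset = 0
shapes = [(512, 48), (256, 512), (128, 256), (12, 128)]
for i, (out_f, in_f) in enumerate(shapes):
    w = data[offset:offset + out_f * in_f].reshape(out_f, in_f)
    offset += out_f * in_f
    b = data[offset:offset + out_f]
    offset += out_f
    x = w @ x + b
    if i < len(shapes) - 1:  # the final Linear has no ELU after it
        x = elu(x)

print(x)  # should match the tensor in the main.c comment / run.py printout
```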
diff --git a/example/diffuse-loco/test_attn.py b/example/diffuse-loco/test_attn.py
new file mode 100644
index 0000000..f043a70
--- /dev/null
+++ b/example/diffuse-loco/test_attn.py
@@ -0,0 +1,144 @@
+
+import math
+
+import torch
+
+# seed
+torch.manual_seed(0)
+
+
+batch_size = 1
+dim = 4
+max_seq_len = 2
+
+q = torch.randn(batch_size, max_seq_len, dim)
+k = torch.randn(batch_size, max_seq_len, dim)
+v = torch.randn(batch_size, max_seq_len, dim)
+
+
+# class Attention(nn.Module):
+#     def __init__(self):
+#         super().__init__()
+#         n_heads = 1
+#         dim = 8
+#         max_seq_len = 4
+
+#         self.n_kv_heads = 1
+#         model_parallel_size = 1
+#         self.n_local_heads = n_heads // model_parallel_size
+#         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+#         self.n_rep = self.n_local_heads // self.n_local_kv_heads
+#         self.head_dim = dim // n_heads
+#         self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
+#         self.wk = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=False)
+#         self.wv = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=False)
+#         self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)
+
+#         print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
+#         mask = torch.full((1, 1, max_seq_len, max_seq_len), float("-inf"))
+#         mask = torch.triu(mask, diagonal=1)
+#         self.register_buffer("mask", mask)
+
+#     def forward(
+#         self,
+#         x: torch.Tensor,
+#         freqs_cos: torch.Tensor,
+#         freqs_sin: torch.Tensor,
+#     ):
+#         bsz, seqlen, _ = x.shape
+
+#         # QKV
+#         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+#         xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+#         xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+#         xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+#         # RoPE relative positional embeddings
+#         xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
+
+#         # grouped multiquery attention: expand out keys and values
+#         xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+#         xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+
+#         # make heads into a batch dimension
+#         xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+#         xk = xk.transpose(1, 2)
+#         xv = xv.transpose(1, 2)
+
+#         # flash implementation
+#         if self.flash:
+#             output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
+#         else:
+#             # manual implementation
+#             scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
+#             assert hasattr(self, 'mask')
+#             scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+#             scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+#             scores = self.attn_dropout(scores)
+#             output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
+
+#         # restore time as batch dimension and concat heads
+#         output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+
+#         # final projection into the residual stream
+#         output = self.wo(output)
+#         output = self.resid_dropout(output)
+#         return output
+
+
+
+
+
+def DotProductAttention(query, key, value, scale=0):
+    # q, k, v shape: (batch_size, seq_len, head_dim)
+
+    l = query.shape[-2]
+    s = key.shape[-2]
+
+    d_k = query.shape[-1]
+
+
+    scale_factor = 1 / math.sqrt(d_k) if scale == 0 else scale
+    attn_bias = torch.zeros(l, s, dtype=query.dtype)
+
+    attn_weight = query @ key.transpose(-2, -1) * scale_factor
+    attn_weight += attn_bias
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    # attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+
+    result = attn_weight @ value
+
+    return result
+
+
+# # Efficient implementation equivalent to the following:
+# def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
+#     L, S = query.size(-2), key.size(-2)
+#     scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+#     attn_bias = torch.zeros(L, S, dtype=query.dtype)
+#     if is_causal:
+#         assert attn_mask is None
+#         temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+#         attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+#         attn_bias.to(query.dtype)
+
+#     if attn_mask is not None:
+#         if attn_mask.dtype == torch.bool:
+#             attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+#         else:
+#             attn_bias += attn_mask
+#     attn_weight = query @ key.transpose(-2, -1) * scale_factor
+#     attn_weight += attn_bias
+#     attn_weight = torch.softmax(attn_weight, dim=-1)
+#     attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+#     return attn_weight @ value
+
+
+result = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None)
+
+result2 = DotProductAttention(q, k, v, scale=0)
+
+
+
+print(result)
+print(result2)
\ No newline at end of file
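Rather than comparing the two printouts by eye, `test_attn.py` could assert the equivalence numerically; a small follow-up sketch (the tolerance is an arbitrary choice):

```python
# The manual implementation should agree with F.scaled_dot_product_attention
# up to floating-point error.
assert torch.allclose(result, result2, atol=1e-6), "manual attention diverges from torch"
print("max abs diff:", (result - result2).abs().max().item())
```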