diff --git a/example/cartpole/CMakeLists.txt b/example/cartpole/CMakeLists.txt
deleted file mode 100644
index 5f4156c..0000000
--- a/example/cartpole/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-cmake_minimum_required(VERSION 3.15)
-
-set(PROJECT_NAME "cartpole")
-
-set(PROJECT_INCLUDES
-    inc
-)
-
-set(PROJECT_SOURCES
-    main.c
-)
-
-project(${PROJECT_NAME})
-
-
-add_executable(${PROJECT_NAME} ${PROJECT_SOURCES})
-target_include_directories(${PROJECT_NAME} PUBLIC ${PROJECT_INCLUDES})
-
-add_subdirectory(../../nn ./build/nn)
-target_link_libraries(${PROJECT_NAME} PUBLIC nn)
-
-
diff --git a/example/cartpole/README.md b/example/cartpole/README.md
deleted file mode 100644
index e69de29..0000000
diff --git a/example/cartpole/main.c b/example/cartpole/main.c
deleted file mode 100644
index e69de29..0000000
diff --git a/example/diffuse-loco/CMakeLists.txt b/example/diffuse-loco/CMakeLists.txt
new file mode 100644
index 0000000..96b2656
--- /dev/null
+++ b/example/diffuse-loco/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(diffuse-loco LANGUAGES C)
+
+
+add_executable(diffuse-loco main.c)
+target_include_directories(diffuse-loco PUBLIC inc)
+
+target_compile_features(diffuse-loco INTERFACE c_std_11)
+
+if (X86)
+    message("diffuse-loco: building for x86")
+    target_link_libraries(diffuse-loco PUBLIC target-x86)
+
+elseif (RISCV)
+    message("diffuse-loco: building for RISC-V")
+    # CMake toolchain definition for RISC-V GCC toolchain
+    set(CMAKE_SYSTEM_NAME "Generic" CACHE STRING "")
+    set(CMAKE_SYSTEM_PROCESSOR "riscv" CACHE STRING "")
+
+    set(TOOLCHAIN_PREFIX "riscv64-unknown-elf-")
+
+    set(CMAKE_C_COMPILER "${TOOLCHAIN_PREFIX}gcc")
+    set(CMAKE_ASM_COMPILER "${TOOLCHAIN_PREFIX}gcc")
+    set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PREFIX}g++")
+    set(CMAKE_AR "${TOOLCHAIN_PREFIX}ar")
+    set(CMAKE_LINKER "${TOOLCHAIN_PREFIX}ld")
+    set(CMAKE_OBJCOPY "${TOOLCHAIN_PREFIX}objcopy")
+    set(CMAKE_SIZE "${TOOLCHAIN_PREFIX}size")
+    set(CMAKE_STRIP "${TOOLCHAIN_PREFIX}strip")
+
+    target_link_libraries(diffuse-loco PUBLIC target-riscv)
+endif ()
+
+add_compile_options(-O3 -Wall -Wextra)
+
+target_compile_options(diffuse-loco PRIVATE -u _printf_float)
+
+add_subdirectory(../../ ./build/)
+
+add_subdirectory(../../nn/ ./build/nn)
+target_link_libraries(diffuse-loco PUBLIC nn)
+
+target_link_libraries(diffuse-loco PUBLIC m)
+
diff --git a/example/diffuse-loco/README.md b/example/diffuse-loco/README.md
new file mode 100644
index 0000000..87c4b46
--- /dev/null
+++ b/example/diffuse-loco/README.md
@@ -0,0 +1,4 @@
+# DiffuseLoco
+
+
+
diff --git a/example/diffuse-loco/main.c b/example/diffuse-loco/main.c
new file mode 100644
index 0000000..33b9e1b
--- /dev/null
+++ b/example/diffuse-loco/main.c
@@ -0,0 +1,50 @@
+/**
+ * @file main.c
+ *
+ * A simple example running the DiffuseLoco actor network.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "rv.h"
+#include "nn.h"
+#include "model.h"
+
+
+// static void enable_vector_operations() {
+//   unsigned long mstatus;
+//   asm volatile("csrr %0, mstatus" : "=r"(mstatus));
+//   mstatus |= 0x00000600 | 0x00006000 | 0x00018000;
+//   asm volatile("csrw mstatus, %0"::"r"(mstatus));
+// }
+
+int main() {
+
+  // enable_vector_operations();
+
+  Model *model = malloc(sizeof(Model));
+
+  size_t cycles = 0;
+
+  printf("initializing model...\n");
+  init(model);
+
+  printf("setting input data...\n");
+  NN_fill(&model->input_1, 1.0);
+
+  // cycles = READ_CSR("mcycle");
+  forward(model);
+  // cycles = READ_CSR("mcycle") - cycles;
+
+  printf("cycles: %lu\n", cycles);
+
+  // output tensor([[ 0.0258, -0.0050,  0.0902, -0.0022, -0.0924, -0.0574,  0.0328,  0.0386, -0.0277,  0.0788,  0.0603, -0.0085]])
+
+  printf("output:\n");
+  NN_printf(&model->actor_6);
+
+  return 0;
+}
diff --git a/example/diffuse-loco/model.bin b/example/diffuse-loco/model.bin
new file mode 100644
index 0000000..955f2ef
Binary files /dev/null and b/example/diffuse-loco/model.bin differ
diff --git a/example/diffuse-loco/model.h b/example/diffuse-loco/model.h
new file mode 100644
index 0000000..7edefba
--- /dev/null
+++ b/example/diffuse-loco/model.h
@@ -0,0 +1,100 @@
+#ifndef __MODEL_H
+#define __MODEL_H
+
+#include "nn.h"
+
+
+// load the weight data block from the model.bin file
+INCLUDE_FILE(".rodata", "../model.bin", model_weight);
+extern uint8_t model_weight_data[];
+extern size_t model_weight_start[];
+extern size_t model_weight_end[];
+
+typedef struct {
+  Tensor input_1;
+  Tensor actor_0_weight;
+  Tensor actor_0_bias;
+  Tensor actor_0;
+  Tensor actor_1;
+  Tensor actor_2_weight;
+  Tensor actor_2_bias;
+  Tensor actor_2;
+  Tensor actor_3;
+  Tensor actor_4_weight;
+  Tensor actor_4_bias;
+  Tensor actor_4;
+  Tensor actor_5;
+  Tensor actor_6_weight;
+  Tensor actor_6_bias;
+  Tensor actor_6;
+
+} Model;
+
+
+void init(Model *model);
+
+void forward(Model *model);
+
+/**
+ * Initialize the required tensors for the model
+ */
+void init(Model *model) {
+  float *array_pointer = (float *)model_weight_data;
+
+  NN_initTensor(&model->input_1, 2, (size_t[]){1, 48}, DTYPE_F32, NULL);
+
+  // : actor_0
+  NN_initTensor(&model->actor_0_weight, 2, (size_t[]){512, 48}, DTYPE_F32, array_pointer);
+  array_pointer += 24576;
+  NN_initTensor(&model->actor_0_bias, 1, (size_t[]){512}, DTYPE_F32, array_pointer);
+  array_pointer += 512;
+  NN_initTensor(&model->actor_0, 2, (size_t[]){1, 512}, DTYPE_F32, NULL);
+
+  // : actor_1
+  NN_initTensor(&model->actor_1, 2, (size_t[]){1, 512}, DTYPE_F32, NULL);
+
+  // : actor_2
+  NN_initTensor(&model->actor_2_weight, 2, (size_t[]){256, 512}, DTYPE_F32, array_pointer);
+  array_pointer += 131072;
+  NN_initTensor(&model->actor_2_bias, 1, (size_t[]){256}, DTYPE_F32, array_pointer);
+  array_pointer += 256;
+  NN_initTensor(&model->actor_2, 2, (size_t[]){1, 256}, DTYPE_F32, NULL);
+
+  // : actor_3
+  NN_initTensor(&model->actor_3, 2, (size_t[]){1, 256}, DTYPE_F32, NULL);
+
+  // : actor_4
+  NN_initTensor(&model->actor_4_weight, 2, (size_t[]){128, 256}, DTYPE_F32, array_pointer);
+  array_pointer += 32768;
+  NN_initTensor(&model->actor_4_bias, 1, (size_t[]){128}, DTYPE_F32, array_pointer);
+  array_pointer += 128;
+  NN_initTensor(&model->actor_4, 2, (size_t[]){1, 128}, DTYPE_F32, NULL);
+
+  // : actor_5
+  NN_initTensor(&model->actor_5, 2, (size_t[]){1, 128}, DTYPE_F32, NULL);
+
+  // : actor_6
+  NN_initTensor(&model->actor_6_weight, 2, (size_t[]){12, 128}, DTYPE_F32, array_pointer);
+  array_pointer += 1536;
+  NN_initTensor(&model->actor_6_bias, 1, (size_t[]){12}, DTYPE_F32, array_pointer);
+  array_pointer += 12;
+  NN_initTensor(&model->actor_6, 2, (size_t[]){1, 12}, DTYPE_F32, NULL);
+
+}
+
+
+/**
+ * Forward pass of the model
+ */
+void forward(Model *model) {
+  NN_Linear(&model->actor_0, &model->input_1, &model->actor_0_weight, &model->actor_0_bias);
+  NN_ELU(&model->actor_1, &model->actor_0, 1.0);
+  NN_Linear(&model->actor_2, &model->actor_1, &model->actor_2_weight, &model->actor_2_bias);
+  NN_ELU(&model->actor_3, &model->actor_2, 1.0);
+  NN_Linear(&model->actor_4, &model->actor_3, &model->actor_4_weight, &model->actor_4_bias);
+  NN_ELU(&model->actor_5, &model->actor_4, 1.0);
+  NN_Linear(&model->actor_6, &model->actor_5, &model->actor_6_weight, &model->actor_6_bias);
+
+}
+
+#endif
\ No newline at end of file
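The float offsets that `init()` walks through `model.bin` can be cross-checked against the layer shapes above. A minimal sketch (assuming `model.bin` holds nothing but the float32 weights and biases, in the order `init()` consumes them):

```python
# Sketch: recompute the hard-coded offsets in model.h from the layer shapes.
# Assumption: model.bin is just float32 weight + bias blocks, in forward order.
layers = [(512, 48), (256, 512), (128, 256), (12, 128)]  # (out_features, in_features)

offset = 0
for out_f, in_f in layers:
    print(f"weight at float offset {offset:6d}  ({out_f}x{in_f})")
    offset += out_f * in_f   # e.g. 512 * 48 = 24576, matching "array_pointer += 24576"
    print(f"bias   at float offset {offset:6d}  ({out_f})")
    offset += out_f

print(f"total: {offset} floats = {offset * 4} bytes")  # expected size of model.bin
```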
diff --git a/example/diffuse-loco/scripts/run.py b/example/diffuse-loco/scripts/run.py
new file mode 100644
index 0000000..2e1f34a
--- /dev/null
+++ b/example/diffuse-loco/scripts/run.py
@@ -0,0 +1,42 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+import barstools
+
+
+torch.manual_seed(0)
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.actor = nn.Sequential(
+            nn.Linear(48, 512, bias=True),
+            nn.ELU(alpha=1.0),
+            nn.Linear(512, 256, bias=True),
+            nn.ELU(alpha=1.0),
+            nn.Linear(256, 128, bias=True),
+            nn.ELU(alpha=1.0),
+            nn.Linear(128, 12, bias=True),
+        )
+
+    def forward(self, input):
+        output = self.actor.forward(input)
+        return output
+
+# Tracing the module
+m = Net()
+
+# m.load_state_dict(torch.load("model.pth", map_location=torch.device("cpu")))
+m.eval()
+
+test_input = torch.ones((48, )).unsqueeze(0)
+
+print(test_input)
+
+with torch.no_grad():
+    output = m.forward(test_input)
+    print("output", output)
+
+output = barstools.TorchConverter(m).convert(test_input, output_dir=".")
+
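The expected output hard-coded in the `main.c` comment (and printed by `run.py`) can also be reproduced straight from the exported weights. A rough sketch, assuming each layer in `model.bin` is a row-major `[out, in]` float32 weight block immediately followed by its bias (as the `model.h` offsets indicate) and that `NN_Linear` computes `x @ W.T + b`:

```python
import numpy as np

# Sketch: replay the actor forward pass directly from model.bin.
# Layout assumption (from model.h): per layer, a row-major [out, in] float32
# weight block immediately followed by its bias, in forward order.
data = np.fromfile("model.bin", dtype=np.float32)

def elu(x, alpha=1.0):
    return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

x = np.ones(48, dtype=np.float32)
offset = 0
shapes = [(512, 48), (256, 512), (128, 256), (12, 128)]
for i, (out_f, in_f) in enumerate(shapes):
    w = data[offset:offset + out_f * in_f].reshape(out_f, in_f)
    offset += out_f * in_f
    b = data[offset:offset + out_f]
    offset += out_f
    x = w @ x + b
    if i < len(shapes) - 1:  # the final Linear has no ELU after it
        x = elu(x)

print(x)  # should match the tensor in the main.c comment / run.py printout
```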
diff --git a/example/diffuse-loco/test_attn.py b/example/diffuse-loco/test_attn.py
new file mode 100644
index 0000000..f043a70
--- /dev/null
+++ b/example/diffuse-loco/test_attn.py
@@ -0,0 +1,144 @@
+
+import math
+
+import torch
+
+# seed
+torch.manual_seed(0)
+
+
+batch_size = 1
+dim = 4
+max_seq_len = 2
+
+q = torch.randn(batch_size, max_seq_len, dim)
+k = torch.randn(batch_size, max_seq_len, dim)
+v = torch.randn(batch_size, max_seq_len, dim)
+
+
+# class Attention(nn.Module):
+#     def __init__(self):
+#         super().__init__()
+#         n_heads = 1
+#         dim = 8
+#         max_seq_len = 4
+
+#         self.n_kv_heads = 1
+#         model_parallel_size = 1
+#         self.n_local_heads = n_heads // model_parallel_size
+#         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+#         self.n_rep = self.n_local_heads // self.n_local_kv_heads
+#         self.head_dim = dim // n_heads
+#         self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
+#         self.wk = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=False)
+#         self.wv = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=False)
+#         self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)
+
+#         print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
+#         mask = torch.full((1, 1, max_seq_len, max_seq_len), float("-inf"))
+#         mask = torch.triu(mask, diagonal=1)
+#         self.register_buffer("mask", mask)
+
+#     def forward(
+#         self,
+#         x: torch.Tensor,
+#         freqs_cos: torch.Tensor,
+#         freqs_sin: torch.Tensor,
+#     ):
+#         bsz, seqlen, _ = x.shape
+
+#         # QKV
+#         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+#         xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+#         xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+#         xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+#         # RoPE relative positional embeddings
+#         xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
+
+#         # grouped multiquery attention: expand out keys and values
+#         xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+#         xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+
+#         # make heads into a batch dimension
+#         xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+#         xk = xk.transpose(1, 2)
+#         xv = xv.transpose(1, 2)
+
+#         # flash implementation
+#         if self.flash:
+#             output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
+#         else:
+#             # manual implementation
+#             scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
+#             assert hasattr(self, 'mask')
+#             scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+#             scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+#             scores = self.attn_dropout(scores)
+#             output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
+
+#         # restore time as batch dimension and concat heads
+#         output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+
+#         # final projection into the residual stream
+#         output = self.wo(output)
+#         output = self.resid_dropout(output)
+#         return output
+
+
+
+
+
+def DotProductAttention(query, key, value, scale=0):
+    # q, k, v shape: (batch_size, seq_len, head_dim)
+
+    l = query.shape[-2]
+    s = key.shape[-2]
+
+    d_k = query.shape[-1]
+
+
+    scale_factor = 1 / math.sqrt(d_k) if scale == 0 else scale
+    attn_bias = torch.zeros(l, s, dtype=query.dtype)
+
+    attn_weight = query @ key.transpose(-2, -1) * scale_factor
+    attn_weight += attn_bias
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    # attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+
+    result = attn_weight @ value
+
+    return result
+
+
+# # Efficient implementation equivalent to the following:
+# def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
+#     L, S = query.size(-2), key.size(-2)
+#     scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+#     attn_bias = torch.zeros(L, S, dtype=query.dtype)
+#     if is_causal:
+#         assert attn_mask is None
+#         temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+#         attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+#         attn_bias.to(query.dtype)
+
+#     if attn_mask is not None:
+#         if attn_mask.dtype == torch.bool:
+#             attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+#         else:
+#             attn_bias += attn_mask
+#     attn_weight = query @ key.transpose(-2, -1) * scale_factor
+#     attn_weight += attn_bias
+#     attn_weight = torch.softmax(attn_weight, dim=-1)
+#     attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+#     return attn_weight @ value
+
+
+result = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None)
+
+result2 = DotProductAttention(q, k, v, scale=0)
+
+
+
+print(result)
+print(result2)
\ No newline at end of file
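Rather than comparing the two printouts by eye, `test_attn.py` could assert the equivalence numerically; a small follow-up sketch (the tolerance is an arbitrary choice):

```python
# The manual implementation should agree with F.scaled_dot_product_attention
# up to floating-point error.
assert torch.allclose(result, result2, atol=1e-6), "manual attention diverges from torch"
print("max abs diff:", (result - result2).abs().max().item())
```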