diff --git a/software/spmd/bsg_cuda_lite_runtime/jacobi/Makefile b/software/spmd/bsg_cuda_lite_runtime/jacobi/Makefile
new file mode 100644
index 000000000..79c41f5e2
--- /dev/null
+++ b/software/spmd/bsg_cuda_lite_runtime/jacobi/Makefile
@@ -0,0 +1,34 @@
+#########################################################
+# Network Configuration
+# If not configured, will use default values
+ bsg_global_X ?= $(bsg_tiles_X)
+ bsg_global_Y ?= $(bsg_tiles_Y)+1
+
+#########################################################
+# Tile group configuration
+# If not configured, will use default values
+ bsg_tiles_org_X ?= 0
+ bsg_tiles_org_Y ?= 1
+
+# If not configured, will use default values
+ bsg_tiles_X ?= 2
+ bsg_tiles_Y ?= 2
+
+
+all: main.run
+
+
+KERNEL_NAME ?=kernel_jacobi
+
+OBJECT_FILES=main.o kernel_jacobi.o bsg_barrier_amoadd.o bsg_cuda_lite_barrier.o
+
+include ../../Makefile.include
+
+
+main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) ../../common/crt.o
+	$(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS)
+
+
+main.o: Makefile
+
+include ../../../mk/Makefile.tail_rules
diff --git a/software/spmd/bsg_cuda_lite_runtime/jacobi/bsg_group_strider.hpp b/software/spmd/bsg_cuda_lite_runtime/jacobi/bsg_group_strider.hpp
new file mode 100644
index 000000000..b34b2c25a
--- /dev/null
+++ b/software/spmd/bsg_cuda_lite_runtime/jacobi/bsg_group_strider.hpp
@@ -0,0 +1,42 @@
+#ifndef __BSG_GROUP_STRIDER
+#define __BSG_GROUP_STRIDER
+#define BSG_TILE_GROUP_LOG_Y_DIM ((int)(log2(BSG_TILE_GROUP_Y_DIM)))
+#define BSG_TILE_GROUP_LOG_X_DIM ((int)(log2(BSG_TILE_GROUP_X_DIM)))
+#define MAKE_MASK(WIDTH) ((1UL << (WIDTH)) - 1UL)
+template // NOTE(review): the template parameter list is missing here; the body uses TG_X, TG_Y, S_X, S_Y and T -- restore from upstream
+class bsg_tile_group_strider{
+  static const unsigned int GROUP_EPA_WIDTH = 18; // low address bits; the X coordinate field starts right above them
+  static const unsigned int GROUP_X_CORD_WIDTH = 6;
+  static const unsigned int GROUP_Y_CORD_WIDTH = 5;
+  static const unsigned int GROUP_X_CORD_SHIFT = (GROUP_EPA_WIDTH);
+  static const unsigned int GROUP_Y_CORD_SHIFT = (GROUP_X_CORD_SHIFT+GROUP_X_CORD_WIDTH);
+  static const unsigned int GROUP_PREFIX_SHIFT = (GROUP_Y_CORD_SHIFT+GROUP_Y_CORD_WIDTH);
+  // *_STRIDE adds 1 to a coordinate field; *_MASK clears that field's bits at and above log2(TG_*), so a stride wraps modulo TG_* (assumes TG_* is a power of two)
+  static const unsigned int Y_STRIDE = (1 << GROUP_Y_CORD_SHIFT);
+  static const unsigned int X_STRIDE = (1 << GROUP_X_CORD_SHIFT);
+  static const unsigned int Y_MASK = ~(MAKE_MASK(GROUP_Y_CORD_WIDTH - (unsigned int)(log2(TG_Y))) << ((unsigned int)(log2(TG_Y)) + GROUP_Y_CORD_SHIFT));
+  static const unsigned int X_MASK = ~(MAKE_MASK(GROUP_X_CORD_WIDTH - (unsigned int)(log2(TG_X))) << ((unsigned int)(log2(TG_X)) + GROUP_X_CORD_SHIFT));
+
+protected:
+public:
+  T *ptr;
+  bsg_tile_group_strider(T *p, int x, int y){ // ptr = prefix bit | y field | x field | p; the cast assumes pointers fit in unsigned int
+    ptr =(T*)( ((1 << GROUP_PREFIX_SHIFT)
+                | (y << GROUP_Y_CORD_SHIFT)
+                | (x << GROUP_X_CORD_SHIFT)
+                | ((unsigned int) p)));
+  }
+  // Advance ptr by one tile and return it: along Y when S_X == 0, along X when S_Y == 0, otherwise along both
+  T* stride(){
+    if(S_X == 0){
+      return ptr = (T*)(((unsigned int) ptr + Y_STRIDE) & Y_MASK);
+    } else if(S_Y == 0){
+      return ptr = (T*)(((unsigned int) ptr + X_STRIDE) & X_MASK);
+    } else {
+      return ptr = (T*)(((((unsigned int) ptr + X_STRIDE) & X_MASK) + Y_STRIDE) & Y_MASK);
+    }
+  }
+
+};
+
+#endif
diff --git a/software/spmd/bsg_cuda_lite_runtime/jacobi/kernel_jacobi.cpp b/software/spmd/bsg_cuda_lite_runtime/jacobi/kernel_jacobi.cpp
new file mode 100644
index 000000000..8202158ba
--- /dev/null
+++ b/software/spmd/bsg_cuda_lite_runtime/jacobi/kernel_jacobi.cpp
@@ -0,0 +1,154 @@
+// Jacobi stencil kernel: each tile owns one (j,k) column of the grid and sweeps it along X in 64-element blocks
+
+#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include // NOTE(review): the include target is missing here; bsg_group_strider.hpp calls log2, so presumably a math header -- confirm
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_group_strider.hpp"
+#include "bsg_cuda_lite_barrier.h"
+
+// Copy 64 consecutive floats from src to dst, 16 per iteration: all 16 loads are issued before any of the 16 stores
+void copyXAxis64(float* src, float* dst) {
+  for (int i = 0; i < 4; i++) {
+    float tmp00 = src[0];
+    float tmp01 = src[1];
+    float tmp02 = src[2];
+    float tmp03 = src[3];
+    float tmp04 = src[4];
+    float tmp05 = src[5];
+    float tmp06 = src[6];
+    float tmp07 = src[7];
+    float tmp08 = src[8];
+    float tmp09 = src[9];
+    float tmp10 = src[10];
+    float tmp11 = src[11];
+    float tmp12 = src[12];
+    float tmp13 = src[13];
+    float tmp14 = src[14];
+    float tmp15 = src[15];
+    asm volatile("": : :"memory"); // compiler barrier: keeps the stores below from being moved ahead of the loads above
+    dst[0] = tmp00;
+    dst[1] = tmp01;
+    dst[2] = tmp02;
+    dst[3] = tmp03;
+    dst[4] = tmp04;
+    dst[5] = tmp05;
+    dst[6] = tmp06;
+    dst[7] = tmp07;
+    dst[8] = tmp08;
+    dst[9] = tmp09;
+    dst[10] = tmp10;
+    dst[11] = tmp11;
+    dst[12] = tmp12;
+    dst[13] = tmp13;
+    dst[14] = tmp14;
+    dst[15] = tmp15;
+    dst += 16;
+    src += 16;
+  }
+  return;
+}
+
+extern "C" __attribute__ ((noinline))
+int kernel_jacobi(int c0, int c1, float *A0, float * Anext,
+                  const int nx, const int ny, const int nz) {
+  // NOTE(review): c0/c1 are int but scale float terms below -- confirm the host passes integers, not float bit patterns. nz is never read.
+  bsg_barrier_hw_tile_group_init();
+  bsg_fence();
+  bsg_barrier_hw_tile_group_sync();
+  bsg_cuda_print_stat_kernel_start();
+
+  // Calculate 2D XY distribution: tile (__bsg_x, __bsg_y) handles column (j, k); the +1 presumably skips the boundary layer
+  const int j = __bsg_x + 1;
+  const int k = __bsg_y + 1;
+  // Idea - unroll Z-axis (k). By 64, which is the input size
+
+  // Check if additional load from DRAM is necessary: true when this tile is on that edge of the tile group
+  const bool x_l_bound = (__bsg_x == 0);
+  const bool x_h_bound = (__bsg_x == (bsg_tiles_X-1));
+  const bool y_l_bound = (__bsg_y == 0);
+  const bool y_h_bound = (__bsg_y == (bsg_tiles_Y-1));
+
+  // This tile's 64-element block of A0; non-edge neighbors read it through the remote pointers built below
+  float a_self[64] = {0.0f};
+
+  // Auxiliary buffers: filled from A0 in place of a neighbor tile's a_self when this tile is on an edge
+  float aux_left[64];
+  float aux_right[64];
+  float aux_up[64];
+  float aux_down[64];
+
+  // Construct remote pointers: each targets the neighbor tile's a_self, or the local aux buffer on an edge
+  float* a_up, *a_down, *a_left, *a_right;
+
+  if (x_l_bound) {
+    a_left = aux_left;
+  } else {
+    bsg_tile_group_strider r_left(a_self, __bsg_x-1, __bsg_y); // NOTE(review): template arguments appear to be missing here and in the three cases below -- confirm against upstream
+    a_left = r_left.ptr;
+  }
+  if (x_h_bound) {
+    a_right = aux_right;
+  } else {
+    bsg_tile_group_strider r_right(a_self, __bsg_x+1, __bsg_y);
+    a_right = r_right.ptr;
+  }
+  if (y_l_bound) {
+    a_up = aux_up;
+  } else {
+    bsg_tile_group_strider r_up(a_self, __bsg_x, __bsg_y-1);
+    a_up = r_up.ptr;
+  }
+  if (y_h_bound) {
+    a_down = aux_down;
+  } else {
+    bsg_tile_group_strider r_down(a_self, __bsg_x, __bsg_y+1);
+    a_down = r_down.ptr;
+  }
+
+  for (int ii = 1; ii < nx-1; ii += 62) {
+    // NOTE(review): every pass copies and writes a full 64-wide block regardless of nx, so nx-2 presumably must be a multiple of 62 -- confirm
+    // Initial load -- we load 64 and produce 62
+    if (x_l_bound) {
+      copyXAxis64(&(A0[Index3D (nx, ny, ii-1, j-1, k)]), a_left);
+    }
+    if (x_h_bound) {
+      copyXAxis64(&(A0[Index3D (nx, ny, ii-1, j+1, k)]), a_right);
+    }
+    if (y_l_bound) {
+      copyXAxis64(&(A0[Index3D (nx, ny, ii-1, j, k-1)]), a_up);
+    }
+    if (y_h_bound) {
+      copyXAxis64(&(A0[Index3D (nx, ny, ii-1, j, k+1)]), a_down);
+    }
+
+    copyXAxis64(&(A0[Index3D (nx, ny, ii-1, j, k)]), a_self);
+    bsg_barrier_hw_tile_group_sync(); // presumably ensures every tile has filled a_self before neighbors read it -- confirm
+
+    bsg_unroll(8)
+    for (int i = 1; i < 63; i++) {
+      // Load top
+      // top = A0[Index3D (nx, ny, i+1, j, k)];
+      float top = a_self[i+1];
+      float bottom = a_self[i-1];
+
+      float left = a_left[i];
+      float right = a_right[i];
+      float up = a_up[i];
+      float down = a_down[i];
+
+      // Jacobi update: c1 * (sum of the six neighbors) - c0 * center
+      float next = (top + bottom + left + right + up + down) * c1 - a_self[i] * c0;
+      Anext[Index3D (nx, ny, ii-1+i, j, k)] = next;
+    }
+    bsg_barrier_hw_tile_group_sync(); // presumably keeps a tile from reloading a_self while neighbors still read it -- confirm
+  }
+
+  bsg_cuda_print_stat_kernel_end();
+  bsg_fence();
+  bsg_barrier_hw_tile_group_sync();
+
+  return 0;
+}
diff --git a/software/spmd/bsg_cuda_lite_runtime/jacobi/main.c b/software/spmd/bsg_cuda_lite_runtime/jacobi/main.c
new file mode 120000
index 000000000..24daac669
--- /dev/null
+++ b/software/spmd/bsg_cuda_lite_runtime/jacobi/main.c
@@ -0,0 +1 @@
+../main/main.c
\ No newline at end of file