diff --git a/.clang-format-ignore b/.clang-format-ignore index c78f2e5fc2..ce95be05f0 100644 --- a/.clang-format-ignore +++ b/.clang-format-ignore @@ -4,3 +4,6 @@ # Ignore vendored third-party code ./sw/math/* +./target/snitch_cluster/sw/apps/transformer/src/transformer.c +./target/snitch_cluster/sw/apps/transformer/src/data.h +./sw/apps/transformer/src/transformer.h \ No newline at end of file diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6c4f91184b..c90fd5b6f4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -117,6 +117,7 @@ jobs: with: flake8-version: "6.0.0" max-line-length: "100" + exclude: "target/snitch_cluster/sw/apps/dnn/datagen.py" ###################### # Clang-Format Check # diff --git a/sw/blas/axpy/data/datagen.py b/sw/blas/axpy/data/datagen.py index f7ae7a6488..3f48e348d7 100755 --- a/sw/blas/axpy/data/datagen.py +++ b/sw/blas/axpy/data/datagen.py @@ -11,8 +11,8 @@ import os sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) -from data_utils import format_scalar_definition, format_vector_definition, \ - format_vector_declaration, format_ifdef_wrapper # noqa: E402 +from data_utils import format_scalar_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 MIN = -1000 MAX = +1000 @@ -47,16 +47,16 @@ def main(): a = np.random.uniform(MIN, MAX, 1) x = np.random.uniform(MIN, MAX, length) y = np.random.uniform(MIN, MAX, length) - z = np.zeros(length) g = golden_model(a, x, y) # Format header file l_str = format_scalar_definition('const uint32_t', 'l', length) a_str = format_scalar_definition('const double', 'a', a[0]) - x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) - y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) - z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section) - g_str = 
format_vector_definition('double', 'g', g) + x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) + y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) + z_str = format_array_declaration('double', 'z', [length], + alignment=BURST_ALIGNMENT, section=section) + g_str = format_array_definition('double', 'g', g) g_str = format_ifdef_wrapper('BIST', g_str) f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str]) f_str += '\n' diff --git a/sw/blas/axpy/verify.py b/sw/blas/axpy/verify.py index 80b195ff9c..a79f7c18e2 100755 --- a/sw/blas/axpy/verify.py +++ b/sw/blas/axpy/verify.py @@ -13,7 +13,7 @@ sys.path.append(str(Path(__file__).parent / '../../../util/sim/')) import verification # noqa: E402 from elf import Elf # noqa: E402 -from data_utils import bytes_to_doubles # noqa: E402 +from data_utils import bytes_to_float # noqa: E402 ERR_THRESHOLD = 1E-10 @@ -27,16 +27,16 @@ def main(): symbols_bin=args.symbols_bin, log=args.log, output_uids=['z']) - z_actual = np.array(bytes_to_doubles(raw_results['z'])) + z_actual = np.array(bytes_to_float(raw_results['z'], prec='64')) # Extract input operands from ELF file if args.symbols_bin: elf = Elf(args.symbols_bin) else: elf = Elf(args.snitch_bin) - a = np.array(bytes_to_doubles(elf.get_symbol_contents('a'))) - x = np.array(bytes_to_doubles(elf.get_symbol_contents('x'))) - y = np.array(bytes_to_doubles(elf.get_symbol_contents('y'))) + a = np.array(bytes_to_float(elf.get_symbol_contents('a'), prec='64')) + x = np.array(bytes_to_float(elf.get_symbol_contents('x'), prec='64')) + y = np.array(bytes_to_float(elf.get_symbol_contents('y'), prec='64')) # Verify results z_golden = golden_model(a, x, y) diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py index 0ccab83817..45a008eff2 100755 --- a/sw/blas/gemm/data/datagen.py +++ b/sw/blas/gemm/data/datagen.py @@ -15,7 +15,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), 
"../../../../util/sim/")) from data_utils import emit_license, format_scalar_definition, \ - format_vector_definition, format_ifdef_wrapper # noqa: E402 + format_array_definition, format_ifdef_wrapper # noqa: E402 np.random.seed(42) @@ -100,18 +100,18 @@ def emit_header(**kwargs): data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])] data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)] data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] if kwargs['prec'] == 8: - result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten()) + result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten()) else: - result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])], - 'result', - result.flatten()) + result_def = format_array_definition(C_TYPES[str(kwargs['prec'])], + 'result', + result.flatten()) data_str += [format_ifdef_wrapper('BIST', result_def)] data_str = '\n\n'.join(data_str) diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index baab570478..ea2c865636 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -24,7 +24,6 @@ typedef char v8f8 __attribute__((vector_size(8))); dump_float(gemm, 8); dump_uint(index, 9); - void 
gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA, uint32_t ta, double* B, uint32_t ldB, uint32_t tb, double* C, uint32_t ldC, double BETA) { @@ -74,24 +73,23 @@ void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, } /* params: -* M: number of rows of A and C -* N: number of columns of B and C -* K: number of columns of A and rows of B -* A: pointer to matrix A -* ldA: row stride of A -* ta: transpose A -* B: pointer to matrix B -* ldB: row stride of B -* tb: transpose B -* C: pointer to matrix C -* ldC: row stride of C -* ALPHA: scalar alpha -* A is MxK, B is KxN, C is MxN -*/ + * M: number of rows of A and C + * N: number of columns of B and C + * K: number of columns of A and rows of B + * A: pointer to matrix A + * ldA: row stride of A + * ta: transpose A + * B: pointer to matrix B + * ldB: row stride of B + * tb: transpose B + * C: pointer to matrix C + * ldC: row stride of C + * ALPHA: scalar alpha + * A is MxK, B is KxN, C is MxN + */ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, uint32_t ldA, uint32_t ta, float* B, uint32_t ldB, uint32_t tb, float* C, uint32_t ldC, float ALPHA) { - // float c0, c1, c2, c3 = 0; float c0 = 0.0f; float c1 = 0.0f; @@ -110,7 +108,7 @@ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, c1 = 0.0f; c2 = 0.0f; c3 = 0.0f; - for (uint32_t k = 0; k < K; k+=4) { + for (uint32_t k = 0; k < K; k += 4) { c0 += A[(k + 0) + m * ldA] * B[(k + 0) * ldB + n]; c1 += A[(k + 1) + m * ldA] * B[(k + 1) * ldB + n]; c2 += A[(k + 2) + m * ldA] * B[(k + 2) * ldB + n]; @@ -131,7 +129,7 @@ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, c1 = 0.0f; c2 = 0.0f; c3 = 0.0f; - for (uint32_t k = 0; k < K; k+=4) { + for (uint32_t k = 0; k < K; k += 4) { c0 += A[(k + 0) * M * ldA + m * ldA] * B[(k + 0) * ldB + n]; c1 += A[(k + 1) * M * ldA + m * ldA] * B[(k + 1) * ldB + n]; c2 += A[(k + 2) * M * ldA + m * ldA] * B[(k + 2) * ldB + n]; @@ -152,7 +150,7 @@ 
void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, c1 = 0.0f; c2 = 0.0f; c3 = 0.0f; - for (uint32_t k = 0; k < K; k+=4) { + for (uint32_t k = 0; k < K; k += 4) { // c0 += A[k + m * ldA] * B[k + n * ldB]; c0 += A[(k + 0) + m * ldA] * B[(k + 0) + n * ldB]; c1 += A[(k + 1) + m * ldA] * B[(k + 1) + n * ldB]; diff --git a/sw/blas/gemm/verify.py b/sw/blas/gemm/verify.py index b6f886b7b0..bd13836cec 100755 --- a/sw/blas/gemm/verify.py +++ b/sw/blas/gemm/verify.py @@ -13,7 +13,7 @@ sys.path.append(str(Path(__file__).parent / '../../../util/sim/')) import verification # noqa: E402 from elf import Elf # noqa: E402 -from data_utils import bytes_to_doubles, bytes_to_uint32s # noqa: E402 +from data_utils import bytes_to_float, bytes_to_int # noqa: E402 ERR_THRESHOLD = 0.001 @@ -27,21 +27,21 @@ def main(): symbols_bin=args.symbols_bin, log=args.log, output_uids=['c']) - c_actual = np.array(bytes_to_doubles(raw_results['c'])) + c_actual = np.array(bytes_to_float(raw_results['c'], prec='64')) # Extract input operands from ELF file if args.symbols_bin: elf = Elf(args.symbols_bin) else: elf = Elf(args.snitch_bin) - a = np.array(bytes_to_doubles(elf.get_symbol_contents('a'))) - b = np.array(bytes_to_doubles(elf.get_symbol_contents('b'))) - c = np.array(bytes_to_doubles(elf.get_symbol_contents('c'))) - beta = bytes_to_uint32s(elf.get_symbol_contents('BETA'))[0] - m = bytes_to_uint32s(elf.get_symbol_contents('M'))[0] - n = bytes_to_uint32s(elf.get_symbol_contents('N'))[0] - k = bytes_to_uint32s(elf.get_symbol_contents('K'))[0] - tb = bytes_to_uint32s(elf.get_symbol_contents('TB'))[0] + a = np.array(bytes_to_float(elf.get_symbol_contents('a'), prec='64')) + b = np.array(bytes_to_float(elf.get_symbol_contents('b'), prec='64')) + c = np.array(bytes_to_float(elf.get_symbol_contents('c'), prec='64')) + beta = bytes_to_int(elf.get_symbol_contents('BETA'), prec='32', signedness='unsigned')[0] + m = bytes_to_int(elf.get_symbol_contents('M'), prec='32', 
signedness='unsigned')[0] + n = bytes_to_int(elf.get_symbol_contents('N'), prec='32', signedness='unsigned')[0] + k = bytes_to_int(elf.get_symbol_contents('K'), prec='32', signedness='unsigned')[0] + tb = bytes_to_int(elf.get_symbol_contents('TB'), prec='32', signedness='unsigned')[0] a = np.reshape(a, (m, k)) if tb: b = np.reshape(b, (n, k)) diff --git a/sw/dnn/.gitignore b/sw/dnn/.gitignore new file mode 100644 index 0000000000..aed262ca8f --- /dev/null +++ b/sw/dnn/.gitignore @@ -0,0 +1 @@ +*/data/data.h diff --git a/sw/dnn/batchnorm/data/datagen.py b/sw/dnn/batchnorm/data/datagen.py new file mode 100755 index 0000000000..8dd0b1de73 --- /dev/null +++ b/sw/dnn/batchnorm/data/datagen.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +PRECISION_T = { + '64': 'FP64', + '32': 'FP32', + '16': 'FP16', + '8': 'FP8' +} + + +def golden_model(ifmap): + n, ci, ih, iw = ifmap.shape + bn = torch.nn.BatchNorm2d(ci) + bn.weight.requires_grad = False + bn.bias.requires_grad = False + running_mean = torch.randn_like(bn.running_mean, requires_grad=False) + running_var = torch.rand_like(bn.running_var, requires_grad=False) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) + beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) + ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) + return ofmap, gamma, beta + + +def emit_header(**kwargs): + + in_channels = kwargs['input_dim']['channels'] + in_height = kwargs['input_dim']['height'] + in_width = kwargs['input_dim']['width'] + tile_ci = kwargs['tile_ci'] + prec = str(kwargs['prec']) + + torch_type = data_utils.floating_point_torch_type(prec) + ctype = data_utils.floating_point_ctype(prec) + + ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type) + ofmap, gamma, beta = golden_model(ifmap) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + + n, ih, iw, ci = ifmap.shape + + ifmap_uid = 'ifmap' + ofmap_uid = 'ofmap' + beta_uid = 'beta' + gamma_uid = 'gamma' + + layer_cfg = { + 'CI': ci, + 'IH': ih, + 'IW': iw, + 'TILE_CI': tile_ci, + 'ifmap': ifmap_uid, + 'ofmap': ofmap_uid, + 'beta': beta_uid, + 'gamma': gamma_uid + } + + data_str = [emit_license()] + # Array forward declarations + data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)] + data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)] + data_str += [format_array_declaration(ctype, beta_uid, beta.shape)] + data_str += [format_array_declaration(ctype, gamma_uid, gamma.shape)] + # Layer struct + data_str += 
[format_struct_definition('batchnorm_layer_t', 'layer', layer_cfg)] + # Array definitions + data_str += [format_array_definition(ctype, ifmap_uid, ifmap)] + data_str += [format_array_definition(ctype, beta_uid, beta)] + data_str += [format_array_definition(ctype, gamma_uid, gamma)] + # Golden results for BIST + result_def = format_array_definition(ctype, 'golden', ofmap) + data_str += [format_ifdef_wrapper('BIST', result_def)] + data_str = '\n\n'.join(data_str) + + return data_str + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for batchnorm kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = hjson.loads(f.read()) + param['section'] = args.section + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(**param)) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/params.hjson b/sw/dnn/batchnorm/data/params.hjson similarity index 66% rename from target/snitch_cluster/sw/apps/dnn/batchnorm/src/params.hjson rename to sw/dnn/batchnorm/data/params.hjson index b8d774d0b8..34645f93e4 100644 --- a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/params.hjson +++ b/sw/dnn/batchnorm/data/params.hjson @@ -2,17 +2,12 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
// SPDX-License-Identifier: SHL-0.51 -// Parameters for a single BatchNorm layer - { - kernel: "BatchNorm" - channels: { - out: 32, - in: 32 - } input_dim: { + channels: 32 height: 8, width: 8 } + tile_ci: 32 prec: 64 } diff --git a/sw/dnn/src/batchnorm.h b/sw/dnn/batchnorm/src/batchnorm.h similarity index 82% rename from sw/dnn/src/batchnorm.h rename to sw/dnn/batchnorm/src/batchnorm.h index f1e8460646..4c8b5adc10 100644 --- a/sw/dnn/src/batchnorm.h +++ b/sw/dnn/batchnorm/src/batchnorm.h @@ -4,6 +4,18 @@ #include "snrt.h" +typedef struct { + uint32_t CI; + uint32_t IH; + uint32_t IW; + uint32_t TILE_CI; + double *ifmap; + double *ofmap; + double *gamma; + double *beta; + precision_t dtype; +} batchnorm_layer_t; + /** * @brief implementation of a FP64 batchnorm as a linear combination * y = gamma * x + beta @@ -50,12 +62,17 @@ static inline void batchnorm_fp64(double *ifmap, double *gamma, double *beta, snrt_ssr_disable(); } -static inline void batchnorm_layer(const conv_layer *l) { +static inline void batchnorm_layer(const batchnorm_layer_t *l) { const uint32_t cluster_num = snrt_cluster_num(); const uint32_t cluster_id = snrt_cluster_idx(); const uint32_t compute_num = snrt_cluster_compute_core_num(); const uint32_t compute_id = snrt_cluster_core_idx(); + // Calculate output dimensions + uint32_t OH = l->IH; + uint32_t OW = l->IW; + uint32_t CO = l->CI; + // Each cluster loads one tile of a row uint32_t ifmap_size = 2 * l->IW * l->TILE_CI; uint32_t weights_size = l->CI; @@ -78,7 +95,7 @@ static inline void batchnorm_layer(const conv_layer *l) { uint32_t prev_ow; uint32_t prev_ci; - for (uint32_t oh = cluster_id; oh < l->OH; oh += cluster_num) { + for (uint32_t oh = cluster_id; oh < OH; oh += cluster_num) { for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { if (snrt_is_dm_core()) { // Load weights once in the beginning @@ -112,16 +129,15 @@ static inline void batchnorm_layer(const conv_layer *l) { if (!(oh == cluster_id && ci == 0)) { if (l->TILE_CI == l->CI) { 
// data is stored consecutively - snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI], &ofmap[!read_buf * (ofmap_size / 2)], sizeof(double) * l->IW * l->CI); } else { // data is stored in interleaved layout snrt_dma_start_2d( - &l->ofmap[prev_oh * l->OW * l->CI + - prev_ci], /* dst */ - &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ + &l->ofmap[prev_oh * OW * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ sizeof(double) * l->CI, /* dst_stride */ sizeof(double) * l->TILE_CI, /* src_stride */ l->IW); /* repetitions */ @@ -146,7 +162,7 @@ static inline void batchnorm_layer(const conv_layer *l) { batchnorm_fp64(&ifmap[read_buf * ofmap_size / 2 + compute_id], &gamma[ci + compute_id], &beta[ci + compute_id], &ofmap[write_buf * ofmap_size / 2 + compute_id], - l->OW, l->TILE_CI, compute_num, setup_SSR); + OW, l->TILE_CI, compute_num, setup_SSR); write_buf = !write_buf; read_buf = !read_buf; @@ -160,18 +176,18 @@ static inline void batchnorm_layer(const conv_layer *l) { if (snrt_is_dm_core()) { if (l->TILE_CI == l->CI) { // data is stored consecutively - snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI], &ofmap[!read_buf * (ofmap_size / 2)], sizeof(double) * l->IW * l->CI); } else { // data is stored in interleaved layout snrt_dma_start_2d( - &l->ofmap[prev_oh * l->OW * l->CI + prev_ci], /* dst */ - &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ - sizeof(double) * l->CI, /* dst_stride */ - sizeof(double) * l->TILE_CI, /* src_stride */ - l->IW); /* repetitions */ + &l->ofmap[prev_oh * OW * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + l->IW); /* 
repetitions */ } snrt_dma_wait_all(); diff --git a/sw/dnn/batchnorm/src/main.c b/sw/dnn/batchnorm/src/main.c new file mode 100644 index 0000000000..789d3dd547 --- /dev/null +++ b/sw/dnn/batchnorm/src/main.c @@ -0,0 +1,15 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "dnn.h" + +#include "data.h" + +int main() { + batchnorm_layer(&layer); + + snrt_global_barrier(); + + return 0; +} diff --git a/sw/dnn/common.mk b/sw/dnn/common.mk new file mode 100644 index 0000000000..c6630d7652 --- /dev/null +++ b/sw/dnn/common.mk @@ -0,0 +1,31 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + +DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data) +SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src) +COMMON_SRC_DIR := $(realpath $(MK_DIR)/src) + +DATA_CFG ?= $(DATA_DIR)/params.hjson +SECTION ?= + +SRCS ?= $(realpath $(SRC_DIR)/main.c) +INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR) + +DATAGEN_PY := $(DATA_DIR)/datagen.py +DATA_H := $(DATA_DIR)/data.h + +$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) + $< -c $(DATA_CFG) --section="$(SECTION)" $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_H) + +clean: clean-data diff --git a/sw/dnn/src/conv2d.h b/sw/dnn/conv2d/conv2d.h similarity index 100% rename from sw/dnn/src/conv2d.h rename to sw/dnn/conv2d/conv2d.h diff --git a/sw/dnn/gelu/data/datagen.py b/sw/dnn/gelu/data/datagen.py new file mode 100755 index 0000000000..25d72b1055 --- /dev/null +++ b/sw/dnn/gelu/data/datagen.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. 
+# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +PRECISION_T = { + '64': 'FP64', + '32': 'FP32', + '16': 'FP16', + '8': 'FP8' +} + + +def golden_model(ifmap): + gelu = torch.nn.GELU() + return gelu(ifmap) + + +def emit_header(**kwargs): + + batch_size = kwargs['input_dim']['batch_size'] + seq_len = kwargs['input_dim']['seq_len'] + hidden_nodes = kwargs['input_dim']['hidden_nodes'] + prec = str(kwargs['prec']) + + torch_type = data_utils.floating_point_torch_type(prec) + ctype = data_utils.floating_point_ctype(prec) + + ifmap = torch.randn(batch_size, seq_len, hidden_nodes, requires_grad=False, dtype=torch_type) + ofmap = golden_model(ifmap) + + ifmap_uid = 'ifmap' + ofmap_uid = 'ofmap' + + layer_cfg = { + 'batch_size': batch_size, + 'seq_len': seq_len, + 'hidden_nodes': hidden_nodes, + 'ifmap': ifmap_uid, + 'ofmap': ofmap_uid, + 'dtype': PRECISION_T[prec] + } + + data_str = [emit_license()] + # Array forward declarations + data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)] + data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)] + # Layer struct + data_str += [format_struct_definition('gelu_layer_t', 'layer', layer_cfg)] + # Array definitions + data_str += [format_array_definition(ctype, ifmap_uid, ifmap)] + # Golden results for BIST + result_def = 
format_array_definition(ctype, 'golden', ofmap) + data_str += [format_ifdef_wrapper('BIST', result_def)] + data_str = '\n\n'.join(data_str) + + return data_str + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for gelu kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = hjson.loads(f.read()) + param['section'] = args.section + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(**param)) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/sw/apps/dnn/gelu/src/params.hjson b/sw/dnn/gelu/data/params.hjson similarity index 76% rename from target/snitch_cluster/sw/apps/dnn/gelu/src/params.hjson rename to sw/dnn/gelu/data/params.hjson index 6d4c2fe7c8..b290c78e5d 100644 --- a/target/snitch_cluster/sw/apps/dnn/gelu/src/params.hjson +++ b/sw/dnn/gelu/data/params.hjson @@ -2,13 +2,10 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
// SPDX-License-Identifier: SHL-0.51 -// Parameters for a single GELU layer - { - kernel: "GELU" input_dim: { batch_size: 3, - seq_len:8, + seq_len: 8, hidden_nodes: 4 } prec: 32 diff --git a/sw/dnn/src/gelu.h b/sw/dnn/gelu/src/gelu.h similarity index 77% rename from sw/dnn/src/gelu.h rename to sw/dnn/gelu/src/gelu.h index cb903a64ae..83e4516a34 100644 --- a/sw/dnn/src/gelu.h +++ b/sw/dnn/gelu/src/gelu.h @@ -6,34 +6,28 @@ #include "math.h" #include "snrt.h" -#include "utils.h" /** * @struct gelu_layer_struct * @brief This structure contains all parameters necessary * for computing the GELU activation function - * @var gelu_layer_struct::BATCH_SIZE + * @var gelu_layer_struct::batch_size * Size of each input sample - * @var gelu_layer_struct::SEQ_LEN + * @var gelu_layer_struct::seq_len * Size of each output sample - * @var gelu_layer_struct::HIDDEN_NODES + * @var gelu_layer_struct::hidden_nodes * Number of hidden dimensions * @var gelu_layer_struct::ifmap * Pointer to input feature map * @var gelu_layer_struct::ofmap * Pointer to output feature map - * @var gelu_layer_struct::result - * Pointer to the golden model output */ typedef struct gelu_layer_struct { - uint32_t BATCH_SIZE; - uint32_t SEQ_LEN; - uint32_t HIDDEN_NODES; - + uint32_t batch_size; + uint32_t seq_len; + uint32_t hidden_nodes; float *ifmap; float *ofmap; - float *result; - precision_t dtype; } gelu_layer_t; @@ -70,7 +64,7 @@ static inline void gelu_fp32(float *input, float *output, int32_t ldI, /** * @brief GELU layer * - * @param l gelu_layer struct that holds addresses and parameters + * @param l gelu_layer_t struct that holds addresses and parameters * */ static inline void gelu_layer(const gelu_layer_t *l) { @@ -80,7 +74,7 @@ static inline void gelu_layer(const gelu_layer_t *l) { uint32_t compute_id = snrt_cluster_compute_core_num(); uint32_t ifmap_size = - l->BATCH_SIZE * l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float); + l->batch_size * l->seq_len * l->hidden_nodes * sizeof(float); uint32_t 
ofmap_size = ifmap_size; void *ptr = (float *)snrt_l1_next(); @@ -92,9 +86,9 @@ static inline void gelu_layer(const gelu_layer_t *l) { // DMA transfer the ifmap into the cluster TCDM if (snrt_is_dm_core()) { snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( - ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), - l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), - l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float)); + ifmap, l->ifmap, l->batch_size * sizeof(float), + l->batch_size * sizeof(float), l->batch_size * sizeof(float), + l->seq_len * l->hidden_nodes * sizeof(float)); snrt_dma_wait_all(); } @@ -103,23 +97,23 @@ static inline void gelu_layer(const gelu_layer_t *l) { if (snrt_is_compute_core()) { // determine the row offset for each core - int32_t row_offset = compute_id * l->HIDDEN_NODES; + int32_t row_offset = compute_id * l->hidden_nodes; // determine the row stride of each matrix - int32_t ldI = compute_num * l->HIDDEN_NODES; + int32_t ldI = compute_num * l->hidden_nodes; // determine the batch offset for each core - int32_t batch_offset = l->SEQ_LEN * l->HIDDEN_NODES; + int32_t batch_offset = l->seq_len * l->hidden_nodes; // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); - for (int b = 0; b < l->BATCH_SIZE; b++) { + for (int b = 0; b < l->batch_size; b++) { // if (compute_id == 1) { // printf("BATCH: %d\n", b); // } gelu_fp32(&ifmap[row_offset + b * batch_offset], - &ofmap[row_offset + b * batch_offset], ldI, l->BATCH_SIZE, - l->SEQ_LEN / 8, l->HIDDEN_NODES); + &ofmap[row_offset + b * batch_offset], ldI, l->batch_size, + l->seq_len / 8, l->hidden_nodes); } snrt_cluster_hw_barrier(); diff --git a/sw/dnn/gelu/src/main.c b/sw/dnn/gelu/src/main.c new file mode 100644 index 0000000000..3e8c742cf6 --- /dev/null +++ b/sw/dnn/gelu/src/main.c @@ -0,0 +1,9 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "dnn.h" + +#include "data.h" + +int main() { gelu_layer(&layer); } \ No newline at end of file diff --git a/sw/dnn/gemm/data/datagen.py b/sw/dnn/gemm/data/datagen.py new file mode 100755 index 0000000000..e4338e72c1 --- /dev/null +++ b/sw/dnn/gemm/data/datagen.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +PRECISION_T = { + '64': 'FP64', + '32': 'FP32', + '16': 'FP16', + '8': 'FP8' +} + + +def rand_data_generator(shape, prec, alt=False): + if prec == '64': + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == '32': + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == '16': + if alt: + return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + else: + return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + elif prec == '8': + sign = torch.randint(0, 2, shape, + requires_grad=False, dtype=torch.uint8) # -1 or 1 + exponent = torch.randint(0, 16, shape, + requires_grad=False, dtype=torch.uint8) # < 0b01111 + mantissa = torch.randint(0, 4, shape, + requires_grad=False, dtype=torch.uint8) # can be arbitrary + bits = {'sign': sign, 'exponent': exponent, 'mantissa': mantissa} + # TODO: not actually correct + sign_val = (-1.0)**sign.double() + exp_val = (2.0**(exponent.double()-15.0)) + man_val = (1.0 + mantissa.double() / (2**2)) + val = sign_val*exp_val*man_val + return val, bits + + +def golden_model(alpha, A, B, C): + return alpha * C + torch.matmul(A, B) + + +def emit_header(**kwargs): + + M = kwargs['M'] + N = kwargs['N'] + K = kwargs['K'] + alpha = kwargs['alpha'] + expand = kwargs['expand'] + transpose_A = kwargs['transpose_A'] + transpose_B = kwargs['transpose_B'] + prec = str(kwargs['prec']) + + mat_A, bits_A = rand_data_generator((M, K), prec) + mat_B, bits_B = rand_data_generator((K, N), prec) + mat_C, bits_C = rand_data_generator((M, N), prec) + + result = golden_model(alpha, mat_A, mat_B, mat_C) + + if transpose_A: + mat_A = mat_A.T + if transpose_B: + mat_B = mat_B.T + + ctype = data_utils.floating_point_ctype(prec) + + A_uid = 'A' + B_uid = 'B' + C_uid = 'C' + + layer_cfg = { + 'M': M, + 'N': N, + 'K': K, + 'TA': int(transpose_A), + 'TB': 
int(transpose_B), + 'ALPHA': alpha, + 'expand': expand, + 'dtype': PRECISION_T[prec], + 'A': A_uid, + 'B': B_uid, + 'C': C_uid + } + + data_str = [emit_license()] + # Array forward declarations + data_str += [format_array_declaration(ctype, A_uid, mat_A.shape)] + data_str += [format_array_declaration(ctype, B_uid, mat_B.shape)] + data_str += [format_array_declaration(ctype, C_uid, mat_C.shape)] + # Layer struct + data_str += [format_struct_definition('gemm_layer_t', 'layer', layer_cfg)] + # Array definitions + if prec == 'FP8': + data_str += [format_array_definition(ctype, A_uid, bits_A)] + data_str += [format_array_definition(ctype, B_uid, bits_B)] + data_str += [format_array_definition(ctype, C_uid, bits_C)] + else: + data_str += [format_array_definition(ctype, A_uid, mat_A)] + data_str += [format_array_definition(ctype, B_uid, mat_B)] + data_str += [format_array_definition(ctype, C_uid, mat_C)] + # Golden results for BIST + result_def = format_array_definition(ctype, 'checksum', torch.sum(result, dim=-1)) + data_str += [format_ifdef_wrapper('BIST', result_def)] + data_str = '\n\n'.join(data_str) + + return data_str + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for gemm kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = hjson.loads(f.read()) + param['section'] = args.section + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(**param)) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/src/params.hjson b/sw/dnn/gemm/data/params.hjson similarity index 86% rename from 
target/snitch_cluster/sw/apps/dnn/gemm/src/params.hjson rename to sw/dnn/gemm/data/params.hjson index e3b54c274a..ce1506ae66 100644 --- a/target/snitch_cluster/sw/apps/dnn/gemm/src/params.hjson +++ b/sw/dnn/gemm/data/params.hjson @@ -2,10 +2,7 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// Parameters for a GEMM - { - kernel: "GEMM" M: 16, N: 16, K: 16, diff --git a/sw/dnn/src/gemm.h b/sw/dnn/gemm/src/gemm.h similarity index 96% rename from sw/dnn/src/gemm.h rename to sw/dnn/gemm/src/gemm.h index dd71c7dafd..cf2b2949e0 100644 --- a/sw/dnn/src/gemm.h +++ b/sw/dnn/gemm/src/gemm.h @@ -54,12 +54,12 @@ typedef struct gemm_layer_struct { uint32_t TILE_N; uint32_t TILE_K; - double *A; - double *B; - double *C; + void *A; + void *B; + void *C; uint32_t ALPHA; precision_t dtype; uint32_t expand; -} gemm_layer; +} gemm_layer_t; diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c b/sw/dnn/gemm/src/main.c similarity index 93% rename from target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c rename to sw/dnn/gemm/src/main.c index 5a709613eb..a94b0247ed 100644 --- a/target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c +++ b/sw/dnn/gemm/src/main.c @@ -5,7 +5,9 @@ // SW testbench for profiling GEMM kernels in different // floating point precisions (fp64, fp32, fp16), as well as // different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically + +// TODO(colluca): Add IPC verification and remove +#define BIST #include "dnn.h" #include "snrt.h" @@ -21,16 +23,10 @@ // banking conflicts in the beginning #define MAT_PADDING 0 -#define CHECK_RESULT - void *share_ptr; int main() { - gemm_l.A = (void *)gemm_A_dram; - gemm_l.B = (void *)gemm_B_dram; - gemm_l.C = (void *)gemm_C_dram; - - const gemm_layer l1_gemm_l = gemm_l; + const gemm_layer_t l1_gemm_l = layer; const uint32_t cluster_num = snrt_cluster_num(); const uint32_t cluster_id = snrt_cluster_idx(); @@ 
-163,39 +159,39 @@ int main() { } snrt_cluster_hw_barrier(); -#ifdef CHECK_RESULT +#ifdef BIST if (compute_id == 0) { if (l1_gemm_l.dtype == FP64) { for (uint32_t m = 0; m < l1_gemm_l.M; m++) { - double checksum = gemm_checksum[m]; + double check = checksum[m]; double sum = 0.0; for (uint32_t n = 0; n < l1_gemm_l.N; n++) { sum += ((double *)mat_C)[m * l1_gemm_l.N + n]; } - if (fabs(sum - checksum) > 0.001) { + if (fabs(sum - check) > 0.001) { errors += l1_gemm_l.N; } } } else if (l1_gemm_l.dtype == FP32) { for (uint32_t m = 0; m < l1_gemm_l.M; m++) { - float checksum = gemm_checksum[m]; + float check = checksum[m]; float sum = 0.0; for (uint32_t n = 0; n < l1_gemm_l.N; n++) { sum += ((float *)mat_C)[m * l1_gemm_l.N + n]; } - if (fabs(sum - checksum) > 0.001) { + if (fabs(sum - check) > 0.001) { errors += l1_gemm_l.N; } } } else if (l1_gemm_l.dtype == FP16) { for (uint32_t m = 0; m < l1_gemm_l.M; m++) { - __fp16 checksum = gemm_checksum[m]; + __fp16 check = checksum[m]; float sum = 0.0; for (uint32_t n = 0; n < l1_gemm_l.N; n++) { sum += ((__fp16 *)mat_C)[m * l1_gemm_l.N + n]; } - if (fabs(sum - checksum) > 0.05) { + if (fabs(sum - check) > 0.05) { errors += l1_gemm_l.N; } } diff --git a/sw/dnn/layernorm/data/datagen.py b/sw/dnn/layernorm/data/datagen.py new file mode 100755 index 0000000000..d1af8ea364 --- /dev/null +++ b/sw/dnn/layernorm/data/datagen.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tim Fischer
+# Viviane Potocnik
+# Luca Colagrande
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+                       format_struct_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+# Value emitted for the `dtype` field of the layer struct, indexed by the
+# precision in bits (as a string)
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, eps, shape, prec):
+    """Return the reference LayerNorm output for `ifmap`.
+
+    Args:
+        ifmap: Input tensor.
+        eps: Constant added to the variance for numerical stability.
+        shape: Shape of the trailing dimension(s) to normalize over.
+        prec: Unused, kept for interface compatibility with existing callers.
+    """
+    # Create the layer parameters in the input's own dtype: the default
+    # (float32) parameters do not match inputs of any other precision
+    ln = torch.nn.LayerNorm(shape, eps=eps, dtype=ifmap.dtype)
+    return ln(ifmap)
+
+
+def emit_header(**kwargs):
+    """Generate the contents of the data header file for the layernorm kernel.
+
+    Reads the 'input_dim' ('batch_size', 'seq_len', 'embeddings'), 'eps',
+    'prec' and 'n_tiles' entries of the configuration passed as keyword
+    arguments.
+
+    Returns:
+        The header file contents, as a string.
+    """
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    embeddings = kwargs['input_dim']['embeddings']
+    eps = kwargs['eps']
+    prec = str(kwargs['prec'])
+    n_tiles = kwargs['n_tiles']
+
+    assert (seq_len % n_tiles) == 0, 'Input dimension is not an integer multiple of tile size'
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ifmap = torch.randn(batch_size, seq_len, embeddings, requires_grad=False, dtype=torch_type)
+
+    ofmap = golden_model(ifmap, eps, embeddings, prec)
+    ofmap = ofmap.detach().numpy()
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        **kwargs['input_dim'],
+        'n_tiles': n_tiles,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid,
+        'eps': eps,
+        'dtype': PRECISION_T[prec]
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations, needed by the layer struct
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
+                                          alignment=BURST_ALIGNMENT)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
+                                          alignment=BURST_ALIGNMENT)]
+    # Layer struct
+    data_str += [format_struct_definition('layernorm_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
+                                         alignment=BURST_ALIGNMENT)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+    """Parse the command line and write the layernorm data header file."""
+
+    parser = argparse.ArgumentParser(description='Generate data for layernorm kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson b/sw/dnn/layernorm/data/params.hjson
similarity index 52%
rename from target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson
rename to sw/dnn/layernorm/data/params.hjson
index a9e3fca54a..7b60b21349 100644
--- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson
+++ b/sw/dnn/layernorm/data/params.hjson
@@ -1,16 +1,14 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
+// Copyright 2023 ETH Zurich and University of Bologna.
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51 -// Parameters for a single LayerNorm layer - { - kernel: "LayerNorm" input_dim: { - batch_size: 1, - seq_len: 32, + batch_size: 2, + seq_len: 64, embeddings: 32 } eps: 1e-5 prec: 32 + n_tiles: 2 } \ No newline at end of file diff --git a/sw/dnn/layernorm/layout.csv b/sw/dnn/layernorm/layout.csv new file mode 100644 index 0000000000..9fd0970bec --- /dev/null +++ b/sw/dnn/layernorm/layout.csv @@ -0,0 +1,3 @@ + , setup, dma in, compute tile, dma out, dma in, compute tile, dma out +"range(0,8)", 1, , 3, , , 5, +8 , 1, 2, , 4, 5, , 7
diff --git a/sw/dnn/layernorm/src/layernorm.h b/sw/dnn/layernorm/src/layernorm.h
new file mode 100644
index 0000000000..f5af22f457
--- /dev/null
+++ b/sw/dnn/layernorm/src/layernorm.h
@@ -0,0 +1,177 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+#include "snrt.h"
+// #include "printf.h"
+#include "dnn.h"
+
+/**
+ * @struct layernorm_layer_struct
+ * @brief This structure contains all parameters necessary
+ *        for computing the LayerNorm activation function
+ * @var layernorm_layer_struct::batch_size
+ * Number of samples in a batch
+ * @var layernorm_layer_struct::seq_len
+ * Number of rows in each sample
+ * @var layernorm_layer_struct::embeddings
+ * Number of elements in each row (the axis which is normalized)
+ * @var layernorm_layer_struct::n_tiles
+ * Number of tiles to split the data into
+ * @var layernorm_layer_struct::eps
+ * Constant added to the variance before taking its square root
+ * @var layernorm_layer_struct::ifmap
+ * Pointer to input feature map
+ * @var layernorm_layer_struct::ofmap
+ * Pointer to output feature map
+ * @var layernorm_layer_struct::dtype
+ * Precision of the layer
+ */
+typedef struct layernorm_layer_struct {
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t embeddings;
+    uint32_t n_tiles;
+    float eps;
+    void *ifmap;
+    void *ofmap;
+    precision_t dtype;
+} layernorm_layer_t;
+
+/**
+ * Single-cluster implementation of a layernorm tile (data assumed in TCDM)
+ *
+ * The rows of every batch are interleaved across the compute cores:
+ * consecutive rows are assigned to consecutive cores. Every row is
+ * normalized independently, along its `embeddings` elements.
+ *
+ * @param input pointer to the input tile
+ * @param output pointer to the output tile
+ * @param batch_size number of batches in the tile
+ * @param seq_len number of rows in every batch of the tile
+ * @param embeddings number of elements in every row
+ * @param eps constant added to the variance before taking its square root.
+ *            Must be a float: an integer parameter would truncate the
+ *            small values passed by the caller (e.g. 1e-5) to zero.
+ */
+static inline void layernorm_fp32(float *input, float *output,
+                                  int32_t batch_size, int32_t seq_len,
+                                  int32_t embeddings, float eps) {
+    if (snrt_is_compute_core()) {
+        // Get parameters for every core's tile
+        // offset: offset between data accessed by every core (for
+        //         corresponding iterations)
+        // stride: offset between data accessed by the same core in
+        //         consecutive iterations
+        // tile_seq_len: fraction of the sequence assigned to each core
+        uint32_t offset = snrt_cluster_core_idx() * embeddings;
+        uint32_t stride = snrt_cluster_compute_core_num() * embeddings;
+        uint32_t tile_seq_len = seq_len / snrt_cluster_compute_core_num();
+        float *core_itile = input + offset;
+        float *core_otile = output + offset;
+
+        // get derived layernorm quantities
+        uint32_t batch_offset = seq_len * embeddings;
+
+        // compute the mean and variance along the last dimension
+        float mean = 0.0;  // mean of the current row
+        float var = 0.0;   // variance of the current row
+        for (int32_t b = 0; b < batch_size; b++) {
+            for (int32_t s = 0; s < tile_seq_len; s++) {
+                mean = 0.0;
+                var = 0.0;
+
+                for (int32_t i = 0; i < embeddings; i++) {
+                    mean += core_itile[b * batch_offset + s * stride + i];
+                }
+                mean /= embeddings;
+
+                for (int32_t i = 0; i < embeddings; i++) {
+                    var +=
+                        (core_itile[b * batch_offset + s * stride + i] - mean) *
+                        (core_itile[b * batch_offset + s * stride + i] - mean);
+                }
+                var /= embeddings;
+
+                // compute the shifted value of the current row
+                for (int32_t i = 0; i < embeddings; i++) {
+                    core_otile[b * batch_offset + s * stride + i] =
+                        (core_itile[b * batch_offset + s * stride + i] - mean) /
+                        sqrtf(var + eps);
+                }
+            }
+        }
+
+        snrt_fpu_fence();
+    }
+}
+
+// /**
+//  * Implementation of the LayerNorm layer for the Transformer model for FP64.
+//  */
+// NOTE: the commented-out wrappers below call layernorm_fp64/layernorm_fp32
+// with an 8-argument signature (ldI, batch offset, ...) which does not match
+// the 6-argument layernorm_fp32 defined in this header.
+// static inline void transformer_layernorm_fp64(double *input, int32_t ldI,
+//                                               int32_t seq_len, int32_t
+//                                               embeddings, int32_t eps) {
+//     layernorm_fp64(input, input, ldI, 0, 1, seq_len, embeddings, eps);
+// }
+
+// /**
+//  * Implementation of the LayerNorm layer for the Transformer model for FP32.
+//  */
+// static inline void transformer_layernorm_fp32(float *input, int32_t ldI,
+//                                               int32_t seq_len, int32_t
+//                                               embeddings, int32_t eps) {
+//     layernorm_fp32(input, input, ldI, 0, 1, seq_len, embeddings, eps);
+// }
+
+/**
+ * Tiles the seq_len axis
+ *
+ * For every tile, the DM core copies the input tile from `l.ifmap` into
+ * the cluster's L1 memory, the compute cores normalize it with
+ * layernorm_fp32, and the DM core copies the output tile back to
+ * `l.ofmap`. The three steps are separated by cluster barriers and are
+ * not overlapped. The data is accessed as `float`, regardless of `l.dtype`.
+ *
+ * @param l layer configuration (passed by value)
+ */
+static inline void layernorm_layer(layernorm_layer_t l) {
+    snrt_mcycle();
+
+    // Compute the tiling parameters
+    // tile_seq_len: number of rows of every batch contained in a tile
+    // tile_size: number of elements in a tile (all batches)
+    // tile_offset: offset (in elements) between consecutive tiles in DRAM
+    uint32_t n_tiles = l.n_tiles;
+    uint32_t tile_seq_len = l.seq_len / n_tiles;
+    uint32_t tile_size = l.batch_size * tile_seq_len * l.embeddings;
+    uint32_t tile_offset = tile_seq_len * l.embeddings;
+
+    // Allocate space for arrays in TCDM: the output tile is placed
+    // right after the input tile
+    float *local_itile = (float *)snrt_l1_next();
+    float *local_otile = local_itile + tile_size;
+
+    // Get pointers to arrays in DRAM
+    float *remote_ifmap = (float *)l.ifmap;
+    float *remote_ofmap = (float *)l.ofmap;
+
+    // Iterate tiles
+    snrt_mcycle();
+    for (int tile_idx = 0; tile_idx < n_tiles; tile_idx++) {
+        // Copy input tile: one contiguous chunk of tile_seq_len rows per
+        // batch. Chunks are one full batch apart in DRAM and are packed
+        // back to back in the local tile.
+        if (snrt_is_dm_core()) {
+            float *remote_itile = remote_ifmap + tile_idx * tile_offset;
+            snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
+                local_itile,                                 /* dst */
+                remote_itile,                                /* src */
+                tile_seq_len * l.embeddings * sizeof(float), /* size */
+                tile_seq_len * l.embeddings * sizeof(float), /* dst_stride */
+                l.seq_len * l.embeddings * sizeof(float),    /* src_stride */
+                l.batch_size                                 /* repetitions */
+            );
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        // Compute cores wait here for the input tile to be loaded
+        snrt_cluster_hw_barrier();
+
+        // Compute layernorm tile (layernorm_fp32 only does work on the
+        // compute cores)
+        if (snrt_is_compute_core()) snrt_mcycle();
+        layernorm_fp32(local_itile, local_otile, l.batch_size, tile_seq_len,
+                       l.embeddings, l.eps);
+        if (snrt_is_compute_core()) snrt_mcycle();
+
+        // The DM core waits here for the output tile to be computed
+        snrt_cluster_hw_barrier();
+
+        // DMA transfer the ofmap to DRAM: mirrors the input transfer, with
+        // source and destination strides swapped
+        if (snrt_is_dm_core()) {
+            snrt_mcycle();
+            float *remote_otile = remote_ofmap + tile_idx * tile_offset;
+            snrt_dma_txid_t txid_ofmap = snrt_dma_start_2d(
+                remote_otile,                                /* dst */
+                local_otile,                                 /* src */
+                tile_seq_len * l.embeddings * sizeof(float), /* size */
+                l.seq_len * l.embeddings * sizeof(float),    /* dst_stride */
+                tile_seq_len * l.embeddings * sizeof(float), /* src_stride */
+                l.batch_size                                 /* repetitions */
+            );
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+    }
+
+    snrt_global_barrier();
+}
diff --git a/sw/dnn/layernorm/src/main.c b/sw/dnn/layernorm/src/main.c
new file mode 100644
index 0000000000..21fea7cf56
--- /dev/null
+++ b/sw/dnn/layernorm/src/main.c
@@ -0,0 +1,14 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "dnn.h"
+
+#include "data.h"
+
+// Runs the layernorm layer on the `layer` configuration from data.h
+int main() {
+    layernorm_layer(layer);
+    return 0;
+}
diff --git a/sw/dnn/layernorm/verify.py b/sw/dnn/layernorm/verify.py
new file mode 100755
index 0000000000..f7ac49e30e
--- /dev/null
+++ b/sw/dnn/layernorm/verify.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_float, bytes_to_struct  # noqa: E402
+
+
+# Maximum tolerated absolute error on any output element
+ERR_THRESHOLD = 0.001
+
+# Precision (in bits, as a string) for every value of the layer's `dtype` field
+PRECISION_T = {
+    8: '64',
+    4: '32',
+    2: '16',
+    1: '8'
+}
+
+# Numpy type for every precision
+NUMPY_T = {
+    '64': np.float64,
+    '32': np.float32,
+    '16': np.float16
+}
+
+
+def main():
+    """Simulate the layernorm binary and check its output.
+
+    The layer configuration and the input feature map are read back from the
+    ELF file, the golden model is evaluated on them and compared against the
+    simulated 'ofmap'. On a mismatch, the golden and actual results are
+    dumped to 'layernorm_results.csv' in the current directory.
+
+    Returns:
+        1 if any element exceeds ERR_THRESHOLD, 0 otherwise.
+    """
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    sim_outputs = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['ofmap'])
+
+    # Symbols are looked up in the dedicated binary, when one is provided
+    elf = Elf(args.symbols_bin if args.symbols_bin else args.snitch_bin)
+
+    # Extract the layer configuration from the ELF file
+    layer_format = {
+        'batch_size': 'I',
+        'seq_len': 'I',
+        'embeddings': 'I',
+        'n_tiles': 'I',
+        'eps': 'f',
+        'ifmap_ptr': 'I',
+        'ofmap_ptr': 'I',
+        'dtype': 'I'
+    }
+    cfg = bytes_to_struct(elf.get_symbol_contents('layer'), layer_format)
+    dims = (cfg['batch_size'], cfg['seq_len'], cfg['embeddings'])
+    prec = PRECISION_T[cfg['dtype']]
+
+    # Extract the input operand from the ELF file
+    ifmap_values = bytes_to_float(elf.get_symbol_contents('ifmap'), prec)
+    ifmap = torch.from_numpy(np.array(ifmap_values, dtype=NUMPY_T[prec]).reshape(*dims))
+
+    # Verify results
+    ofmap_actual = np.array(bytes_to_float(sim_outputs['ofmap'], prec), dtype=NUMPY_T[prec])
+    ofmap_golden = golden_model(ifmap, cfg['eps'], cfg['embeddings'], prec)
+    ofmap_golden = ofmap_golden.detach().numpy().flatten()
+
+    absolute_err = np.absolute(ofmap_golden - ofmap_actual)
+    if not np.any(absolute_err > ERR_THRESHOLD):
+        return 0
+
+    verification.dump_results_to_csv([ofmap_golden, ofmap_actual, absolute_err],
+                                     Path.cwd() / 'layernorm_results.csv')
+    return 1
+
+
+if __name__
== "__main__":
+    sys.exit(main())
diff --git a/sw/dnn/linear/data/datagen.py b/sw/dnn/linear/data/datagen.py
new file mode 100755
index 0000000000..67c7934a7e
--- /dev/null
+++ b/sw/dnn/linear/data/datagen.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tim Fischer
+# Viviane Potocnik
+# Luca Colagrande
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+                       format_struct_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, weights, bias):
+    """Return the reference output of the linear layer.
+
+    All dimensions of `ifmap` after the first are flattened, then
+    `ifmap @ weights.T + bias` is computed.
+    """
+    ifmap = ifmap.flatten(1)
+    return torch.matmul(ifmap, weights.T) + bias
+
+
+def emit_header(**kwargs):
+    """Generate the contents of the data header file for the linear kernel.
+
+    Reads the 'channels' ('out'), 'input_dim' ('height', 'width') and 'prec'
+    entries of the configuration passed as keyword arguments.
+
+    Returns:
+        The header file contents, as a string.
+    """
+
+    out_channels = kwargs['channels']['out']
+    in_height = kwargs['input_dim']['height']
+    in_width = kwargs['input_dim']['width']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap = torch.randn(in_height, in_width, requires_grad=False, dtype=torch_type)
+    weights = torch.randn(out_channels, in_width, requires_grad=False, dtype=torch_type)
+    bias = torch.randn(out_channels, requires_grad=False, dtype=torch_type)
+    ofmap = golden_model(ifmap, weights, bias)
+
+    ch, ci = ifmap.shape
+    _, co = ofmap.shape
+
+    ifmap_uid = 'ifmap'
+    weights_uid = 'weights'
+    bias_uid = 'bias'
+    ofmap_uid = 'ofmap'
+
+    # The weights and bias arrays are emitted below, so the layer struct must
+    # point to them as well: fields left out of the struct definition would
+    # not be initialized to these arrays.
+    # NOTE(review): unlike the other DNN kernels, `dtype` is not set here,
+    # although linear_layer_t has such a field. Confirm whether
+    # `'dtype': PRECISION_T[prec]` should be added.
+    layer_cfg = {
+        'CO': co,
+        'CI': ci,
+        'CH': ch,
+        'CW': ci,
+        'ifmap': ifmap_uid,
+        'weights': weights_uid,
+        'bias': bias_uid,
+        'ofmap': ofmap_uid
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
+    data_str += [format_array_declaration(ctype, weights_uid, weights.shape)]
+    data_str += [format_array_declaration(ctype, bias_uid, bias.shape)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('linear_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
+    data_str += [format_array_definition(ctype, weights_uid, weights)]
+    data_str += [format_array_definition(ctype, bias_uid, bias)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+    """Parse the command line and write the linear data header file."""
+
+    parser = argparse.ArgumentParser(description='Generate data for linear kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/linear/src/params.hjson b/sw/dnn/linear/data/params.hjson
similarity index 81%
rename from target/snitch_cluster/sw/apps/dnn/linear/src/params.hjson
rename to sw/dnn/linear/data/params.hjson
index 00b5bda648..8b52bfdbfa 100644
--- a/target/snitch_cluster/sw/apps/dnn/linear/src/params.hjson
+++
b/sw/dnn/linear/data/params.hjson @@ -2,10 +2,7 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// Parameters for a single linear layer - { - kernel: "Linear" channels: { out: 16, } diff --git a/sw/dnn/src/linear.h b/sw/dnn/linear/src/linear.h similarity index 80% rename from sw/dnn/src/linear.h rename to sw/dnn/linear/src/linear.h index 3b1b7c3f76..82d174fcc9 100644 --- a/sw/dnn/src/linear.h +++ b/sw/dnn/linear/src/linear.h @@ -36,7 +36,6 @@ typedef struct linear_layer_struct { float *weights; float *bias; float *ofmap; - float *result; precision_t dtype; } linear_layer_t; @@ -127,33 +126,4 @@ static inline void linear_layer(const linear_layer_t *l) { } snrt_cluster_hw_barrier(); - - if (snrt_is_dm_core()) { - snrt_dma_txid_t txid_result = snrt_dma_start_2d( - result, l->result, l->CH * sizeof(float), l->CH * sizeof(float), - l->CH * sizeof(float), l->CO * sizeof(float)); - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - // TODO: fix this, wrong values for ofmap printed - if (compute_id == 0) { - // compare result with ofmap - float tolerance = 1e-6; - int error = 0; - for (int i = 0; i < l->CH; i++) { - for (int j = 0; j < l->CO; j++) { - if (result[i * l->CO + j] - ofmap[i * l->CO + j] > tolerance) { - printf( - "MISMATCH: result[%d][%d] = %f, ofmap[%d][%d] = %f\n", - i, j, result[i * l->CO + j], i, j, - ofmap[i * l->CO + j]); - error += 1; - } - } - } - - printf("[%d/%d] mismatches\n", error, l->CH * l->CO); - } } diff --git a/sw/dnn/linear/src/main.c b/sw/dnn/linear/src/main.c new file mode 100644 index 0000000000..8a10396a1f --- /dev/null +++ b/sw/dnn/linear/src/main.c @@ -0,0 +1,39 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO(colluca): add IPC test and remove this flag
+#define BIST
+
+#include "dnn.h"
+
+#include "data.h"
+
+// Runs the linear layer and, when BIST is defined, checks its output
+// against the golden results on the first core. Returns the number of
+// mismatching elements from that core, 0 otherwise.
+int main() {
+    linear_layer(&layer);
+
+#ifdef BIST
+    // TODO: fix this, wrong values for ofmap printed
+    if (snrt_global_core_idx() == 0) {
+        // compare result with ofmap. The counter starts at the total number
+        // of results and is decremented for every matching element, so that
+        // it ends up holding the number of mismatches.
+        uint32_t n_results = layer.CH * layer.CO;
+        uint32_t n_errors = n_results;
+        float tolerance = 1e-6;
+        for (int i = 0; i < layer.CH; i++) {
+            for (int j = 0; j < layer.CO; j++) {
+                // Check the magnitude of the error: comparing the signed
+                // difference alone would accept any result larger than
+                // the golden value
+                double diff =
+                    golden[i * layer.CO + j] - ofmap[i * layer.CO + j];
+                if (diff > tolerance || diff < -tolerance) {
+                    printf(
+                        "MISMATCH: golden[%d][%d] = %f, ofmap[%d][%d] = %f\n",
+                        i, j, golden[i * layer.CO + j], i, j,
+                        ofmap[i * layer.CO + j]);
+                } else {
+                    n_errors--;
+                }
+            }
+        }
+        printf("[%d/%d] mismatches\n", n_errors, n_results);
+        return n_errors;
+    }
+#endif
+
+    // Cores which do not run the check (and builds without BIST)
+    // report success
+    return 0;
+}
\ No newline at end of file
diff --git a/sw/dnn/maxpool/data/datagen.py b/sw/dnn/maxpool/data/datagen.py
new file mode 100755
index 0000000000..818930090d
--- /dev/null
+++ b/sw/dnn/maxpool/data/datagen.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tim Fischer
+# Viviane Potocnik
+# Luca Colagrande
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+                       format_struct_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries.
To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, kernel):
+    """Return the reference 2D max pooling of `ifmap` with the given kernel size."""
+    max_pool = torch.nn.MaxPool2d(kernel_size=kernel)
+    return max_pool(ifmap)
+
+
+def emit_header(**kwargs):
+    """Generate the contents of the data header file for the maxpool kernel.
+
+    Reads the 'channels' ('in'), 'input_dim' ('height', 'width'),
+    'kernel_size', 'tile_ci' and 'prec' entries of the configuration passed
+    as keyword arguments.
+
+    Returns:
+        The header file contents, as a string.
+    """
+
+    in_channels = kwargs['channels']['in']
+    in_height = kwargs['input_dim']['height']
+    in_width = kwargs['input_dim']['width']
+    kernel_size = kwargs['kernel_size']
+    tile_ci = kwargs['tile_ci']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
+    ofmap = golden_model(ifmap, kernel_size)
+
+    # convert from CHW to HWC format
+    ifmap = ifmap.permute(0, 2, 3, 1)
+    ofmap = ofmap.permute(0, 2, 3, 1)
+
+    n, ih, iw, ci = ifmap.shape
+    _, oh, ow, co = ofmap.shape
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    # The same kernel size is used for both filter dimensions
+    layer_cfg = {
+        'CO': co,
+        'CI': ci,
+        'IH': ih,
+        'IW': iw,
+        'OH': oh,
+        'OW': ow,
+        'FH': kernel_size,
+        'FW': kernel_size,
+        'tile_ci': tile_ci,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('maxpool_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+    """Parse the command line and write the maxpool data header file."""
+
+    parser = argparse.ArgumentParser(description='Generate data for maxpool kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/maxpool/src/params.hjson b/sw/dnn/maxpool/data/params.hjson similarity index 83% rename from target/snitch_cluster/sw/apps/dnn/maxpool/src/params.hjson rename to sw/dnn/maxpool/data/params.hjson index 1826a9f571..c81bdb0a53 100644 --- a/target/snitch_cluster/sw/apps/dnn/maxpool/src/params.hjson +++ b/sw/dnn/maxpool/data/params.hjson @@ -2,10 +2,7 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// Parameters for a single Conv2d layer - { - kernel: "MaxPool" channels: { out: 32, in: 32 @@ -15,5 +12,6 @@ width: 8 } kernel_size: 2 + tile_ci: 32 prec: 64 } diff --git a/sw/dnn/maxpool/src/main.c b/sw/dnn/maxpool/src/main.c new file mode 100644 index 0000000000..0205c8b13b --- /dev/null +++ b/sw/dnn/maxpool/src/main.c @@ -0,0 +1,17 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// SW testbench for profiling MaxPool Layer
+// NOTE(review): no result check is visible in main(); confirm elsewhere
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    maxpool_layer(&layer);
+
+    snrt_global_barrier();
+    return 0;
+}
diff --git a/sw/dnn/src/maxpool.h b/sw/dnn/maxpool/src/maxpool.h similarity index 68% rename from sw/dnn/src/maxpool.h rename to sw/dnn/maxpool/src/maxpool.h index ad86782f5c..afba2a074a 100644 --- a/sw/dnn/src/maxpool.h +++ b/sw/dnn/maxpool/src/maxpool.h @@ -4,6 +4,50 @@ #include "snrt.h"
+/**
+ * @struct maxpool_layer_struct
+ * @brief This structure contains all parameters necessary for MaxPool
+ * layers
+ * @var maxpool_layer_struct::CO
+ * Number of output channels
+ * @var maxpool_layer_struct::CI
+ * Number of input channels
+ * @var maxpool_layer_struct::IH
+ * Height of input feature map
+ * @var maxpool_layer_struct::IW
+ * Width of input feature map
+ * @var maxpool_layer_struct::OH
+ * Height of output feature map
+ * @var maxpool_layer_struct::OW
+ * Width of output feature map
+ * @var maxpool_layer_struct::FH
+ * Height of filter
+ * @var maxpool_layer_struct::FW
+ * Width of filter
+ * @var maxpool_layer_struct::ifmap
+ * Pointer to input feature map
+ * @var maxpool_layer_struct::ofmap
+ * Pointer to output feature map
+ * @var maxpool_layer_struct::tile_ci
+ * Tiling factor of input channel
+ * @var maxpool_layer_struct::dtype
+ * Precision of MaxPool layer
+ */
+typedef struct maxpool_layer_struct {
+    uint32_t CO;
+    uint32_t CI;
+    uint32_t IH;
+    uint32_t IW;
+    uint32_t OH;
+    uint32_t OW;
+    uint32_t FH;
+    uint32_t FW;
+    uint32_t tile_ci;
+    double *ifmap;
+    double *ofmap;
+    precision_t dtype;
+} maxpool_layer_t;
+
/** * @brief implementation of FP64 maxpooling * @@ -30,15 +74,15 @@ static inline void maxpool_fp64(double *ifmap, double *ofmap, uint32_t CI, } } -static inline void maxpool_layer(const conv_layer *l) { +static inline void maxpool_layer(const maxpool_layer_t *l) { uint32_t cluster_num =
snrt_cluster_num(); uint32_t cluster_id = snrt_cluster_idx(); uint32_t compute_num = snrt_cluster_compute_core_num(); uint32_t compute_id = snrt_global_core_idx(); // Each cluster loads one tile of kernel size - uint32_t ifmap_size = 2 * l->FH * l->FW * l->TILE_CI; - uint32_t ofmap_size = 2 * l->TILE_CI; + uint32_t ifmap_size = 2 * l->FH * l->FW * l->tile_ci; + uint32_t ofmap_size = 2 * l->tile_ci; double *ptr = (double *)snrt_l1_next(); double *ifmap = ptr; @@ -56,29 +100,29 @@ static inline void maxpool_layer(const conv_layer *l) { // tiles are distributed across clusters for (uint32_t tile = cluster_id; tile < l->OH * l->OW; tile += cluster_num) { - for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { + for (uint32_t ci = 0; ci < l->CI; ci += l->tile_ci) { uint32_t oh = tile / l->OW; uint32_t ow = tile % l->OW; if (snrt_is_dm_core()) { for (uint32_t fh = 0; fh < l->FH; fh++) { - if (l->TILE_CI == l->CI) { + if (l->tile_ci == l->CI) { snrt_dma_start_1d( &ifmap[write_buf * (ifmap_size / 2) + - fh * l->FW * l->TILE_CI], /* dst */ + fh * l->FW * l->tile_ci], /* dst */ &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) * l->CI], /* src */ - sizeof(double) * l->TILE_CI * l->FW /* size */); + sizeof(double) * l->tile_ci * l->FW /* size */); } else { // printf("bubu\n"); snrt_dma_start_2d( &ifmap[write_buf * (ifmap_size / 2) + - fh * l->FW * l->TILE_CI], /* dst */ + fh * l->FW * l->tile_ci], /* dst */ &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) * l->CI + ci], /* src */ - sizeof(double) * l->TILE_CI, /* size */ - sizeof(double) * l->TILE_CI, /* dst_stride */ + sizeof(double) * l->tile_ci, /* size */ + sizeof(double) * l->tile_ci, /* dst_stride */ sizeof(double) * l->CI, /* src_stride */ l->FW /* repetitions */); } @@ -93,9 +137,9 @@ static inline void maxpool_layer(const conv_layer *l) { &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + prev_ci], /* dst */ &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * 
l->tile_ci, /* size */ sizeof(double) * l->CI, /* dst_stride */ - sizeof(double) * l->TILE_CI, /* src_stride */ + sizeof(double) * l->tile_ci, /* src_stride */ 1 /* repetitions */); } @@ -113,7 +157,7 @@ static inline void maxpool_layer(const conv_layer *l) { maxpool_fp64(&ifmap[read_buf * ifmap_size / 2 + compute_id], &ofmap[write_buf * ofmap_size / 2 + compute_id], - l->TILE_CI, l->FH, l->FW, compute_num); + l->tile_ci, l->FH, l->FW, compute_num); write_buf = !write_buf; read_buf = !read_buf; @@ -127,9 +171,9 @@ static inline void maxpool_layer(const conv_layer *l) { snrt_dma_start_2d( &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + prev_ci], /* dst */ &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->tile_ci, /* size */ sizeof(double) * l->CI, /* dst_stride */ - sizeof(double) * l->TILE_CI, /* src_stride */ + sizeof(double) * l->tile_ci, /* src_stride */ 1 /* repetitions */); snrt_dma_wait_all(); } diff --git a/sw/dnn/src/softmax.h b/sw/dnn/softmax/softmax.h similarity index 100% rename from sw/dnn/src/softmax.h rename to sw/dnn/softmax/softmax.h diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h index 537f488cd9..d1d190a968 100644 --- a/sw/dnn/src/dnn.h +++ b/sw/dnn/src/dnn.h @@ -194,16 +194,13 @@ typedef struct network_single_cluster_t_ { } network_single_cluster_t; -// Must be included before batchnorm since the batchnorm layer -// uses the conv_layer struct. This is bad design. 
-// TODO Fix this, union types should be preferred -#include "conv2d.h" - -#include "batchnorm.h" -#include "gelu.h" -#include "gemm.h" -#include "layernorm.h" -#include "linear.h" -#include "maxpool.h" -#include "softmax.h" -#include "utils.h" +// #include "conv2d.h" + +#include "../batchnorm/src/batchnorm.h" +#include "../gelu/src/gelu.h" +#include "../gemm/src/gemm.h" +#include "../layernorm/src/layernorm.h" +#include "../linear/src/linear.h" +#include "../maxpool/src/maxpool.h" +// #include "softmax.h" +// #include "utils.h" diff --git a/sw/dnn/src/layernorm.h b/sw/dnn/src/layernorm.h deleted file mode 100644 index b875303146..0000000000 --- a/sw/dnn/src/layernorm.h +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" -#include "snrt.h" -// #include "printf.h" -#include "utils.h" - -// add dump function for layernorm -dump_float(ln, 5); - -/** - * @struct layernorm_layer_struct - * @brief This structure contains all parameters necessary - * for computing the LayerNorm activation function - * @var layernorm_layer_struct::BATCH_SIZE - * Size of each input sample - * @var layernorm_layer_struct::SEQ_LEN - * Size of each output sample - * @var layernorm_layer_struct::EMBEDDINGS - * Number of hidden dimensions - * @var layernorm_layer_struct::ifmap - * Pointer to input feature map - * @var layernorm_layer_struct::ofmap - * Pointer to output feature map - * @var layernorm_layer_struct::result - * Pointer to the golden model output - */ -typedef struct layernorm_layer_struct { - uint32_t BATCH_SIZE; - uint32_t SEQ_LEN; - uint32_t EMBEDDINGS; - uint32_t EPS; - - float *ifmap; - float *ofmap; - float *result; - - precision_t dtype; -} layernorm_layer_t; - -/** - * Implementation of the LayerNorm layer. 
- */ -static inline void layernorm_fp32(float *input, float *output, int32_t ldI, - int32_t batch_offset, int32_t batch_size, - int32_t seq_len, int32_t embeddings, - int32_t eps) { - float mean = 0.0; // max value of the current core - float var = 0.0; // sum of the exp values of the current core - - uint32_t compute_id = snrt_global_core_idx(); - uint32_t num_cores = snrt_cluster_compute_core_num(); - - // compute the mean and variance along the last dimension - for (int32_t b = 0; b < batch_size; b++) { - for (int32_t s = 0; s < seq_len; s++) { - mean = 0.0; - var = 0.0; - - for (int32_t i = 0; i < embeddings; i++) { - mean += input[b * batch_offset + s * ldI + i]; - } - mean /= embeddings; - - // printf("mean[%d] = %f\n", b, mean); - - for (int32_t i = 0; i < embeddings; i++) { - var += (input[b * batch_offset + s * ldI + i] - mean) * - (input[b * batch_offset + s * ldI + i] - mean); - } - var /= embeddings; - - // printf("var[%d] = %f\n", b, var); - - // compute the shifted value of the current row - for (int32_t i = 0; i < embeddings; i++) { - output[b * batch_offset + s * ldI + i] = - (input[b * batch_offset + s * ldI + i] - mean) / - sqrtf(var + eps); - // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i, - // output[b * batch_offset + s * ldI + i]); - } - } - } - - snrt_cluster_hw_barrier(); -} - -/** - * Implementation of the LayerNorm layer for the Transformer model for FP64. 
- */ -static inline void transformer_layernorm_fp64(double *input, int32_t ldI, - int32_t seq_len, int32_t embeddings, - int32_t eps) { - double mean = 0.0; // max value of the current core - double var = 0.0; // sum of the exp values of the current core - - uint32_t compute_id = snrt_global_core_idx(); - uint32_t num_cores = snrt_cluster_compute_core_num(); - - for (int32_t s = 0; s < seq_len; s++) { - mean = 0.0; - var = 0.0; - - for (int32_t i = 0; i < embeddings; i++) { - mean += input[s * ldI + i]; - } - mean /= embeddings; - - // printf("mean[%d] = %f\n", b, mean); - - for (int32_t i = 0; i < embeddings; i++) { - var += (input[s * ldI + i] - mean) * - (input[s * ldI + i] - mean); - } - var /= embeddings; - - // printf("var[%d] = %f\n", b, var); - - // compute the shifted value of the current row - for (int32_t i = 0; i < embeddings; i++) { - input[s * ldI + i] = - (input[s * ldI + i] - mean) / - sqrtf(var + eps); - // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i, - // output[s * ldI + i]); - // dump_ln(input[s * ldI + i]); - } - } - - snrt_cluster_hw_barrier(); -} - - -/** - * Implementation of the LayerNorm layer for the Transformer model for FP32. 
- */ -static inline void transformer_layernorm_fp32(float *input, int32_t ldI, - int32_t seq_len, int32_t embeddings, - int32_t eps) { - float mean = 0.0; // max value of the current core - float var = 0.0; // sum of the exp values of the current core - - uint32_t compute_id = snrt_global_core_idx(); - uint32_t num_cores = snrt_cluster_compute_core_num(); - - for (int32_t s = 0; s < seq_len; s++) { - mean = 0.0; - var = 0.0; - - for (int32_t i = 0; i < embeddings; i++) { - mean += input[s * ldI + i]; - } - mean /= embeddings; - - // printf("mean[%d] = %f\n", b, mean); - - for (int32_t i = 0; i < embeddings; i++) { - var += (input[s * ldI + i] - mean) * - (input[s * ldI + i] - mean); - } - var /= embeddings; - - // printf("var[%d] = %f\n", b, var); - - // compute the shifted value of the current row - for (int32_t i = 0; i < embeddings; i++) { - input[s * ldI + i] = - (input[s * ldI + i] - mean) / - sqrtf(var + eps); - // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i, - // output[s * ldI + i]); - // dump_ln(input[s * ldI + i]); - } - } - - snrt_cluster_hw_barrier(); -} - -/** - * @brief layernorm layer - * - * @param l layernorm_layer struct that holds addresses and parameters - * - */ -static inline void layernorm_layer(const layernorm_layer_t *l) { - uint32_t cluster_num = snrt_cluster_num(); - uint32_t cluster_id = snrt_cluster_idx(); - uint32_t compute_num = snrt_cluster_compute_core_num(); - uint32_t compute_id = snrt_global_core_idx(); - - uint32_t ifmap_size = - l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float); - uint32_t ofmap_size = ifmap_size; - - void *ptr = (float *)snrt_l1_next(); - float *ifmap = ptr; - ptr += ifmap_size; - float *ofmap = ptr; - ptr += ofmap_size; - - // DMA transfer the ifmap into the cluster TCDM - if (snrt_is_dm_core()) { - snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( - ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), - l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), - l->SEQ_LEN * l->EMBEDDINGS 
* sizeof(float)); - - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - if (snrt_is_compute_core()) { - // determine the row offset for each core - int32_t row_offset = compute_id * l->EMBEDDINGS; - - // determine the row stride of each matrix - int32_t ldI = compute_num * l->EMBEDDINGS; - - // determine the batch offset for each core - int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS; - - // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); - layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, - batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8, - l->EMBEDDINGS, l->EPS); - - } else { - snrt_cluster_hw_barrier(); - } - - snrt_global_barrier(); -} \ No newline at end of file diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 7b38bbad6a..44bdc2b6dc 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -56,7 +56,7 @@ VLT_AR = ${VLT_BUILDDIR}/Vtestharness__ALL.a # (LRU) config, all targets depending on the configuration file have # to be rebuilt. This file is used to express this condition as a # prerequisite for other rules. -DEFAULT_CFG = cfg/default.hjson +DEFAULT_CFG = cfg/divsqrt.hjson CFG = cfg/lru.hjson ##################### diff --git a/target/snitch_cluster/cfg/divsqrt.hjson b/target/snitch_cluster/cfg/divsqrt.hjson new file mode 100644 index 0000000000..f4b63104bf --- /dev/null +++ b/target/snitch_cluster/cfg/divsqrt.hjson @@ -0,0 +1,127 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. 
+{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_axi_req_fifo_depth: 3, + dma_req_fifo_depth: 3, + // Timing parameters + timing: { + lat_comp_fp32: 3, + lat_comp_fp64: 3, + lat_comp_fp16: 2, + lat_comp_fp16_alt: 2, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 1, + lat_sdotp: 2, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. 
+ compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // Enable division/square root unit + Xdiv_sqrt: true, + }, + dma_core_template: { + isa: "rv32imafd", + Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index 0410fb1cb4..e5d8c8be5e 100644 --- a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -6,18 +6,18 @@ SUBDIRS = lto SUBDIRS += nop -SUBDIRS += transformer +# SUBDIRS += transformer SUBDIRS += blas/axpy SUBDIRS += blas/gemm SUBDIRS += dnn/batchnorm -SUBDIRS += dnn/conv2d -SUBDIRS += dnn/fusedconv +# SUBDIRS += dnn/conv2d +# SUBDIRS += dnn/fusedconv SUBDIRS += dnn/gelu SUBDIRS += dnn/gemm SUBDIRS += dnn/layernorm SUBDIRS += dnn/linear SUBDIRS += dnn/maxpool -SUBDIRS += dnn/softmax +# SUBDIRS += dnn/softmax SUBDIRS += montecarlo/pi_estimation .PHONY: all clean $(SUBDIRS) diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index 8e1950860e..94eb35236a 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -37,6 +37,7 @@ INCDIRS += $(SNRT_DIR)/api INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(ROOT)/sw/blas INCDIRS += 
$(ROOT)/sw/deps/riscv-opcodes INCDIRS += $(ROOT)/sw/math/include diff --git a/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile b/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile index e5521ec799..f84fccea61 100644 --- a/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = batchnorm +APP ?= batchnorm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c b/target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c deleted file mode 100644 index 07eb0502b2..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling BatchNorm Layer -// Automatically checks the correctness of the results - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - batchnorm_l.ifmap = (double *)batchnorm_ifmap_dram; - batchnorm_l.ofmap = (double *)batchnorm_ofmap_dram; - batchnorm_l.gamma = (double *)batchnorm_gamma_dram; - batchnorm_l.beta = (double *)batchnorm_beta_dram; - batchnorm_l.TILE_CI = 32; - - batchnorm_layer(&batchnorm_l); - - snrt_global_barrier(); - - // TODO: fix check layer implementation to avoid DRAM overriding by other - // cores uint32_t errors = check_layer(&batchnorm_l, (double - // *)batchnorm_checksum); - - snrt_global_barrier(); - - return 0; - - // return errors; -} diff --git a/target/snitch_cluster/sw/apps/dnn/gelu/Makefile b/target/snitch_cluster/sw/apps/dnn/gelu/Makefile index fae8b36817..4225a8cd89 100644 --- a/target/snitch_cluster/sw/apps/dnn/gelu/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/gelu/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = gelu +APP ?= gelu -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c b/target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c deleted file mode 100644 index e2e4471920..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling linear kernels in different -// floating point precisions (fp64, fp32, fp16), as well as -// different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - gelu_l.ifmap = (float*)gelu_ifmap_dram; - // gelu_l.result = (float*)gelu_ofmap_dram; - - // checksum = (float*)gelu_checksum; - - gelu_layer(&gelu_l); - - // uint32_t error = check_gelu_layer(&linear_l, (float*)linear_checksum); - - return 0; -} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/Makefile b/target/snitch_cluster/sw/apps/dnn/gemm/Makefile index 0a821adefc..48a31215e5 100644 --- a/target/snitch_cluster/sw/apps/dnn/gemm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/gemm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = gemm +APP ?= gemm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile index 87fa026c70..f8df5a08ac 100644 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = layernorm +APP ?= layernorm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk $(DEP): $(DATA_H) diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c b/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c deleted file mode 100644 index fa776940f6..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling linear kernels in different -// floating point precisions (fp64, fp32, fp16), as well as -// different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - layernorm_l.ifmap = (float*)layernorm_ifmap_dram; - layernorm_l.result = (float*)layernorm_ofmap_dram; - - // checksum = (float*)layernorm_checksum; - - // printf("Starting layernorm layer\n"); - - layernorm_layer(&layernorm_l); - - // uint32_t error = check_layernorm_layer(&linear_l, - // (float*)linear_checksum); - - return 0; -} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/linear/Makefile b/target/snitch_cluster/sw/apps/dnn/linear/Makefile index 2e14e33dfe..7b43893846 100644 --- a/target/snitch_cluster/sw/apps/dnn/linear/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/linear/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = linear +APP ?= linear -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/linear/src/linear.c b/target/snitch_cluster/sw/apps/dnn/linear/src/linear.c deleted file mode 100644 index 7135d30ff3..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/linear/src/linear.c +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling linear kernels in different -// floating point precisions (fp64, fp32, fp16), as well as -// different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - linear_l.ifmap = (float*)linear_ifmap_dram; - linear_l.weights = (float*)linear_weights_dram; - linear_l.bias = (float*)linear_bias_dram; - linear_l.result = (float*)linear_ofmap_dram; - - linear_layer(&linear_l); - - // uint32_t error = check_linear_layer(&linear_l, (float*)linear_checksum); - - return 0; -} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile b/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile index fd01d270b4..e83838ca4e 100644 --- a/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = maxpool +APP ?= maxpool -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c b/target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c deleted file mode 100644 index c3b91394b7..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling MaxPool Layer -// Automatically checks the correctness of the results - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - maxpool_l.ifmap = (double*)maxpool_ifmap_dram; - maxpool_l.ofmap = (double*)maxpool_ofmap_dram; - maxpool_l.TILE_CI = 32; - - maxpool_layer(&maxpool_l); - - snrt_global_barrier(); - - // FIXME: The checksum is overwritten in DRAM by the - // output of the cores. This is a bug. 
- - // uint32_t error = check_layer(&maxpool_l, (double*)maxpool_checksum); - - // snrt_global_barrier(); - - // return error; - - return 0; -} diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index f25ea76418..0c712fa552 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -77,11 +77,10 @@ runs: - elf: apps/dnn/linear/build/linear.elf - elf: apps/dnn/maxpool/build/maxpool.elf - elf: apps/dnn/gemm/build/gemm.elf + - elf: apps/dnn/layernorm/build/layernorm.elf # Illegal FDIV without FDIV unit # - elf: apps/dnn/gelu/build/gelu.elf # seems like it stalls # - elf: apps/dnn/conv2d/build/conv2d.elf # fails with exit code 32 # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly - # - elf: apps/dnn/layernorm/build/layernorm.elf - # throws illegal instruction on FDIV in simulation # - elf: apps/dnn/softmax/build/softmax.elf # throws illegal instruction on FDIV in simulation - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py index 2ed260d3f1..ab28fdb1dc 100644 --- a/util/sim/data_utils.py +++ b/util/sim/data_utils.py @@ -6,15 +6,45 @@ import struct from datetime import datetime +import torch +import numpy as np def emit_license(): s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n" f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" - f"// SPDX-License-Identifier: Apache-2.0\n\n") + f"// SPDX-License-Identifier: Apache-2.0\n") return s +def floating_point_torch_type(precision): + prec_to_torch_type_map = { + '64': torch.float64, + '32': torch.float32, + '16': torch.float16, + '8': None + } + return prec_to_torch_type_map[precision] + + +# Returns the C type representing a floating-point value of the specified precision +def floating_point_ctype(precision): + prec_to_fp_type_map = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': '__fp8' + } + return 
prec_to_fp_type_map[precision] + + +def flatten(array): + if isinstance(array, np.ndarray): + return array.flatten() + if isinstance(array, torch.Tensor): + return array.numpy().flatten() + + def variable_attributes(alignment=None, section=None): attributes = '' if alignment: @@ -24,27 +54,60 @@ def variable_attributes(alignment=None, section=None): return attributes -def format_vector_definition(type, uid, vector, alignment=None, section=None): +def alias_dtype(dtype): + if dtype == '__fp8': + return 'char' + else: + return dtype + + +def format_array_declaration(dtype, uid, shape, alignment=None, section=None): attributes = variable_attributes(alignment, section) - s = f'{type} {uid}[{len(vector)}] {attributes} = ' + '{\n' - for el in vector: - if type != 'char': - el_str = f'{el}' - else: + s = f'{alias_dtype(dtype)} {uid}' + for dim in shape: + s += f'[{dim}]' + if attributes: + s += f' {attributes};' + else: + s += ';' + return s + + +# In the case of dtype __fp8, array field expects a dictionary of +# sign, exponent and mantissa arrays +def format_array_definition(dtype, uid, array, alignment=None, section=None): + # Definition starts with the declaration stripped off of the terminating semicolon + s = format_array_declaration(dtype, uid, array.shape, alignment, section)[:-1] + s += ' = {\n' + # Flatten array + if dtype == '__fp8': + array = zip(flatten(array['sign']), + flatten(array['exponent']), + flatten(array['mantissa'])) + else: + array = flatten(array) + # Format array elements + for el in array: + if dtype == '__fp8': + sign, exp, mant = el + el = sign * 2**7 + exp * 2**2 + mant el_str = f'0x{el:02x}' + else: + el_str = f'{el}' s += f'\t{el_str},\n' s += '};' return s -def format_vector_declaration(type, uid, vector, alignment=None, section=None): - attributes = variable_attributes(alignment, section) - s = f'{type} {uid}[{len(vector)}] {attributes};' +def format_scalar_definition(dtype, uid, scalar): + s = f'{alias_dtype(dtype)} {uid} = {scalar};' 
return s -def format_scalar_definition(type, uid, scalar): - s = f'{type} {uid} = {scalar};' +def format_struct_definition(dtype, uid, map): + s = f'{alias_dtype(dtype)} {uid} = {{\n' + s += ',\n'.join([f'\t.{key} = {value}' for (key, value) in map.items()]) + s += '\n};' return s @@ -56,20 +119,43 @@ def format_ifdef_wrapper(macro, body): # bytearray assumed little-endian -def bytes_to_doubles(byte_array): - double_size = struct.calcsize('d') # Size of a double in bytes - num_doubles = len(byte_array) // double_size +def bytes_to_struct(byte_array, struct_map): + struct_fields = struct_map.keys() + fmt_specifiers = struct_map.values() + fmt_string = ''.join(fmt_specifiers) + field_values = struct.unpack(f'<{fmt_string}', byte_array) + return dict(zip(struct_fields, field_values)) + - # Unpack the byte array into a list of doubles - doubles = [] - for i in range(num_doubles): - double_bytes = byte_array[i * double_size:(i + 1) * double_size] - double = struct.unpack('