From 1707019629dab697a7f36106424708bab750776e Mon Sep 17 00:00:00 2001 From: SharzyL Date: Fri, 3 May 2024 20:32:33 +0800 Subject: [PATCH 01/10] [nix] use single-float abi compilerrt --- nix/overlay.nix | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nix/overlay.nix b/nix/overlay.nix index aab82d1ea..5367484aa 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -43,11 +43,14 @@ in major = final.lib.versions.major rv32_buildPkgs.${llvmForRVV_attrName}.release_version; # compiler-rt requires the compilation flag -fforce-enable-int128, only clang provides that - compilerrt = rv32_pkgs.${llvmForRVV_attrName}.compiler-rt.override { + compilerrt = (rv32_pkgs.${llvmForRVV_attrName}.compiler-rt.override { stdenv = rv32_pkgs.overrideCC rv32_pkgs.stdenv rv32_buildPkgs.${llvmForRVV_attrName}.clangNoCompilerRt; - }; + }).overrideAttrs (oldAttrs: { + NIX_DEBUG = 1; + env.NIX_CFLAGS_COMPILE = "-march=rv32gcv -mabi=ilp32f"; + }); # newlib is built with double float point abi by default, override it newlib = rv32_pkgs.stdenv.cc.libc.overrideAttrs (oldAttrs: { From 989a24db62ac799a8249c7f1c9193e2c7a2c49e3 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Fri, 3 May 2024 20:33:00 +0800 Subject: [PATCH 02/10] [tests] fix t1.ld DDR map --- tests/t1.ld | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/t1.ld b/tests/t1.ld index d94135b6e..1a8248c01 100644 --- a/tests/t1.ld +++ b/tests/t1.ld @@ -31,5 +31,5 @@ SECTIONS { .vbss (TYPE = SHT_NOBITS) : { *(.vbss .vbss.*) } >SRAM __stacktop = ORIGIN(SCALAR) + LENGTH(SCALAR); /* put stack on the top of SCALAR */ - __heapbegin = ORIGIN(SCALAR); /* put heap on the begin of DDR */ + __heapbegin = ORIGIN(DDR); /* put heap on the begin of DDR */ } From fe9a4b2516f3704ca4ed2c6495d4e4c9a529cb11 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Fri, 3 May 2024 20:37:15 +0800 Subject: [PATCH 03/10] [emurt] support uart/print, export header --- tests/emurt/default.nix | 23 +++++++-- tests/emurt/emurt.c | 95 ++++++++++++++++++++++++++++++-------- tests/emurt/emurt.h | 20 ++++++++ tests/intrinsic/uart.h | 2 +- tests/intrinsic/uarttest.c | 4 +- 5 files changed, 117 insertions(+), 27 deletions(-) create mode 100644 tests/emurt/emurt.h diff --git a/tests/emurt/default.nix b/tests/emurt/default.nix index cbbab1d36..54f523271 100644 --- a/tests/emurt/default.nix +++ b/tests/emurt/default.nix @@ -1,4 +1,4 @@ -{ stdenv, bintools }: +{ lib, stdenv, bintools }: stdenv.mkDerivation { name = "emurt"; @@ -6,9 +6,24 @@ stdenv.mkDerivation { NIX_CFLAGS_COMPILE = "-mabi=ilp32f -march=rv32gcv -fno-PIC"; - buildCommand = '' - mkdir -p $out/lib - ${stdenv.targetPlatform.config}-cc ${./emurt.c} -c -o emurt.o + src = with lib.fileset; toSource { + root = ./.; + fileset = fileFilter (file: file.name != "default.nix") ./.; + }; + + buildPhase = '' + runHook preBuild + ${stdenv.targetPlatform.config}-cc emurt.c -c -o emurt.o + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + mkdir -p $out/{lib,include} + + cp *.h $out/include/ ${stdenv.targetPlatform.config}-ar rcs $out/lib/libemurt.a emurt.o + + runHook postInstall ''; } diff --git a/tests/emurt/emurt.c b/tests/emurt/emurt.c index 8fd5ac84b..9f4a7a4df 100644 --- a/tests/emurt/emurt.c +++ b/tests/emurt/emurt.c @@ -1,31 +1,86 @@ -#include -#include - -struct uartlite_regs { - volatile unsigned int rx_fifo; - volatile unsigned int tx_fifo; - volatile unsigned int status; - volatile unsigned int control; -}; +#include "emurt.h" struct uartlite_regs *const ttyUL0 = (struct uartlite_regs 
*)0x10000000; -void uart_put_c(const char c) { - while (ttyUL0->status & (1<<3) /* transmit FIFO full */); - ttyUL0->tx_fifo = c; +#define SR_TX_FIFO_FULL (1<<3) /* transmit FIFO full */ +#define SR_TX_FIFO_EMPTY (1<<2) /* transmit FIFO empty */ +#define SR_RX_FIFO_VALID_DATA (1<<0) /* data in receive FIFO */ +#define SR_RX_FIFO_FULL (1<<1) /* receive FIFO full */ + +#define ULITE_CONTROL_RST_TX 0x01 +#define ULITE_CONTROL_RST_RX 0x02 + +void uart_put_c(char c) { + while (ttyUL0->status & SR_TX_FIFO_FULL); + ttyUL0->tx_fifo = c; } -void uart_put_s(const char *s) { - while (*s) { - uart_put_c(*s++); - } +char uart_check_read() { // 1: data ready, 0: no data + return (ttyUL0->status & SR_RX_FIFO_VALID_DATA) != 0; +} + +char uart_get_c() { + return ttyUL0->rx_fifo; +} + +void get(char *s, int n) { + char debug[20] = "Enter get()"; + print_s(debug); + int i = 0; + while (uart_check_read() != 1); + while (uart_check_read() && i < n - 1) { + print_s(debug); + char c = uart_get_c(); + uart_put_c(c); + if (c == '\r' || c == '\n') { + break; // Break if carriage return or newline is encountered + } + *(s + i) = c; + i++; + } + s[i] = '\0'; +} + +void print_s(const char *c) { + while (*c) { + uart_put_c(*c); + c ++; + } +} + +void print_long(long x) { + char buffer[30]; + if (x < 0) { + uart_put_c('-'); + x = -x; + } + int idx = 0; + while (x) { + long new_x = x / 10; + long rem_x = x % 10; + buffer[idx ++] = '0' + rem_x; + x = new_x; + } + if (idx == 0) uart_put_c('0'); + else while (idx) uart_put_c(buffer[--idx]); +} + +void print_digit(unsigned char x) { + uart_put_c('0'+x); } -void uart_put_hex_d(uint64_t x) { - for (int i=15;i>=0;i--) { - uint64_t res = (x >> (i * 4)) & 0xf; - uart_put_c(res >= 10 ? 'a' + res - 10 : '0' + res); +void dump_hex(unsigned long x) { + uart_put_c('0'); + uart_put_c('x'); + char buffer[16]; + for (int i=0;i<16;i++) { + unsigned long cur = x & 0xf; + buffer[i] = cur < 10 ? ('0' + cur) : ('a' + cur - 10); + x >>= 4; } + for (int i=15;i>=0;i--) uart_put_c(buffer[i]); + uart_put_c('\r'); + uart_put_c('\n'); } extern char* __heapbegin; diff --git a/tests/emurt/emurt.h b/tests/emurt/emurt.h new file mode 100644 index 000000000..53eb22de8 --- /dev/null +++ b/tests/emurt/emurt.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +struct uartlite_regs { + volatile unsigned int rx_fifo; + volatile unsigned int tx_fifo; + volatile unsigned int status; + volatile unsigned int control; +}; + +void uart_put_c(char c); +char uart_check_read(); +char uart_get_c(); +void get(char *s, int n); +void print_s(const char *c); +void print_long(long x); +void print_digit(unsigned char x); +void dump_hex(unsigned long x); + diff --git a/tests/intrinsic/uart.h b/tests/intrinsic/uart.h index 60fe5f5df..9ac052cd7 100644 --- a/tests/intrinsic/uart.h +++ b/tests/intrinsic/uart.h @@ -27,4 +27,4 @@ void uart_put_hex_d(uint64_t x) { uint64_t res = (x >> (i * 4)) & 0xf; uart_put_c(res >= 10 ? 
'a' + res - 10 : '0' + res); } -} \ No newline at end of file +} diff --git a/tests/intrinsic/uarttest.c b/tests/intrinsic/uarttest.c index 8146924ed..493dfcb89 100644 --- a/tests/intrinsic/uarttest.c +++ b/tests/intrinsic/uarttest.c @@ -1,8 +1,8 @@ -#include "uart.h" +#include void test() { uart_put_s("Test Begin from UART!\n"); uart_put_hex_d(0xdeadbeef); uart_put_c('\n'); uart_put_s("Test End from UART!\n"); -} \ No newline at end of file +} From b7510bc5694fd2a027ac7e83d6c6bc0dda22e2d3 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Fri, 3 May 2024 20:38:30 +0800 Subject: [PATCH 04/10] [cases] add perf.llama --- nix/t1/testcases/make-intrinsic-case.nix | 11 +- tests/default.nix | 1 + tests/perf/llama/default.nix | 47 ++ tests/perf/llama/extern_data.S | 25 + tests/perf/llama/extern_data.h | 5 + tests/perf/llama/run.c | 936 +++++++++++++++++++++++ tests/perf/llama/trap.c | 56 ++ tests/perf/llama/trap.h | 5 + tests/perf/llama/utils.h | 45 ++ 9 files changed, 1128 insertions(+), 3 deletions(-) create mode 100644 tests/perf/llama/default.nix create mode 100644 tests/perf/llama/extern_data.S create mode 100644 tests/perf/llama/extern_data.h create mode 100644 tests/perf/llama/run.c create mode 100644 tests/perf/llama/trap.c create mode 100644 tests/perf/llama/trap.h create mode 100644 tests/perf/llama/utils.h diff --git a/nix/t1/testcases/make-intrinsic-case.nix b/nix/t1/testcases/make-intrinsic-case.nix index 8ea25d065..099ee955b 100644 --- a/nix/t1/testcases/make-intrinsic-case.nix +++ b/nix/t1/testcases/make-intrinsic-case.nix @@ -2,10 +2,12 @@ { caseName, xLen ? 32, vLen ? 1024, fp ? false, ... }@inputs: -stdenv.mkDerivation (rec { - name = "intrinsic.${caseName}"; +stdenv.mkDerivation (self: rec { + casePrefix = "intrinsic"; + name = "${self.casePrefix}.${caseName}"; unpackPhase = '' + runHook preUnpack if [ -z "''${srcs:-}" ]; then if [ -z "''${src:-}" ]; then echo 'variable $src or $srcs should point to the source' @@ -13,6 +15,7 @@ stdenv.mkDerivation (rec { fi srcs="$src" fi + runHook postUnpack ''; NIX_CFLAGS_COMPILE = [ @@ -51,7 +54,7 @@ stdenv.mkDerivation (rec { jq --null-input \ --arg name ${caseName} \ - --arg type intrinsic \ + --arg type ${self.casePrefix} \ --argjson xLen ${toString xLen} \ --argjson vLen ${toString vLen} \ --argjson fp ${lib.boolToString fp} \ @@ -62,5 +65,7 @@ stdenv.mkDerivation (rec { runHook postInstall ''; + dontFixup = true; + meta.description = "Test case '${caseName}', written in C intrinsic."; } // inputs) diff --git a/tests/default.nix b/tests/default.nix index 8433ec938..e7cd355e7 100644 --- a/tests/default.nix +++ b/tests/default.nix @@ -54,6 +54,7 @@ let mlir = searchAndCallPackage ./mlir; intrinsic = searchAndCallPackage ./intrinsic; asm = searchAndCallPackage ./asm; + perf = searchAndCallPackage ./perf; # nix build .#t1.cases.codegen.vaadd-vv -L # codegen case are using xLen=32,vLen=1024 by default diff --git a/tests/perf/llama/default.nix b/tests/perf/llama/default.nix new file mode 100644 index 000000000..f7f8ed533 --- /dev/null +++ b/tests/perf/llama/default.nix @@ -0,0 +1,47 @@ +{ lib +, emurt +, fetchurl +, _caseBuilders +}: + +let + checkpoint_bin = fetchurl { + url = "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin"; + sha256 = "sha256-zVkGRNljhnorbloRB/UfrWY8QdecFJ++y7sflfqB9Jo="; + }; + + tokenizer_bin = fetchurl { + url = "https://github.com/karpathy/llama2.c/raw/b3c4b6c3c4bbff42e5211293280307019368ccb5/tokenizer.bin"; + sha256 = "sha256-UKUu+CLunoPeXOnQvgoCWnc9AZQ39Ytf+dyvsGPs42E="; + }; +in + 
+_caseBuilders.mkIntrinsicCase { + casePrefix = "perf"; + caseName = "llama"; + + buildInputs = [ emurt ]; + + src = with lib.fileset; toSource { + root = ./.; + fileset = fileFilter (file: file.name != "default.nix") ./.; + }; + + unpackPhase = '' + cp $src -rT . + chmod -R +w . + ''; + + postPatch = '' + substituteInPlace extern_data.S \ + --replace-fail '{{checkpoint_bin}}' ${checkpoint_bin} \ + --replace-fail '{{tokenizer_bin}}' ${tokenizer_bin} + ''; + + srcs = [ + "run.c" + "trap.c" + "extern_data.S" + ../../t1_main.S + ]; +} diff --git a/tests/perf/llama/extern_data.S b/tests/perf/llama/extern_data.S new file mode 100644 index 000000000..72dfa7ef1 --- /dev/null +++ b/tests/perf/llama/extern_data.S @@ -0,0 +1,25 @@ + .section .rodata + .global checkpoint_data + .type checkpoint_data, @object + .align 4 +checkpoint_data: + .incbin "{{checkpoint_bin}}" # will be replaced on nix build +checkpoint_end: + .global checkpoint_size + .type checkpoint_size, @object + .align 4 +checkpoint_size: + .int checkpoint_end - checkpoint_data + + .section .rodata + .global tokenizer_data + .type tokenizer_data, @object + .align 4 +tokenizer_data: + .incbin "{{tokenizer_bin}}" # will be replaced on nix build +tokenizer_end: + .global tokenizer_size + .type tokenizer_size, @object + .align 4 +tokenizer_size: + .int tokenizer_end - tokenizer_data diff --git a/tests/perf/llama/extern_data.h b/tests/perf/llama/extern_data.h new file mode 100644 index 000000000..472de5b2b --- /dev/null +++ b/tests/perf/llama/extern_data.h @@ -0,0 +1,5 @@ +extern int checkpoint_size; +extern float checkpoint_data[]; + +extern int tokenizer_size; +extern float tokenizer_data[]; diff --git a/tests/perf/llama/run.c b/tests/perf/llama/run.c new file mode 100644 index 000000000..d0926a090 --- /dev/null +++ b/tests/perf/llama/run.c @@ -0,0 +1,936 @@ +/* Inference for Llama-2 Transformer model in pure C */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "trap.h" +#include "utils.h" + +#include "extern_data.h" + +// ---------------------------------------------------------------------------- +// Transformer model + + +typedef struct { + int dim; // transformer dimension + int hidden_dim; // for ffn layers + int n_layers; // number of layers + int n_heads; // number of query heads + int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) + int vocab_size; // vocabulary size, usually 256 (byte-level) + int seq_len; // max sequence length +} Config; + +typedef struct { + // token embedding table + float* token_embedding_table; // (vocab_size, dim) + // weights for rmsnorms + float* rms_att_weight; // (layer, dim) rmsnorm weights + float* rms_ffn_weight; // (layer, dim) + // weights for matmuls. 
note dim == n_heads * head_size + float* wq; // (layer, dim, n_heads * head_size) + float* wk; // (layer, dim, n_kv_heads * head_size) + float* wv; // (layer, dim, n_kv_heads * head_size) + float* wo; // (layer, n_heads * head_size, dim) + // weights for ffn + float* w1; // (layer, hidden_dim, dim) + float* w2; // (layer, dim, hidden_dim) + float* w3; // (layer, hidden_dim, dim) + // final rmsnorm + float* rms_final_weight; // (dim,) + // (optional) classifier weights for the logits, on the last layer + float* wcls; +} TransformerWeights; + +typedef struct { + // current wave of activations + float *x; // activation at current time stamp (dim,) + float *xb; // same, but inside a residual branch (dim,) + float *xb2; // an additional buffer just for convenience (dim,) + float *hb; // buffer for hidden dimension in the ffn (hidden_dim,) + float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,) + float *q; // query (dim,) + float *k; // key (dim,) + float *v; // value (dim,) + float *att; // buffer for scores/attention values (n_heads, seq_len) + float *logits; // output logits + // kv cache + float* key_cache; // (layer, seq_len, dim) + float* value_cache; // (layer, seq_len, dim) +} RunState; + +typedef struct { + Config config; // the hyperparameters of the architecture (the blueprint) + TransformerWeights weights; // the weights of the model + RunState state; // buffers for the "wave" of activations in the forward pass + // some more state needed to properly clean up the memory mapping (sigh) + int fd; // file descriptor for memory mapping + float* data; // memory mapped data pointer + ssize_t file_size; // size of the checkpoint file in bytes +} Transformer; + +void malloc_run_state(RunState* s, Config* p) { + // we calloc instead of malloc to keep valgrind happy + int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads; + s->x = calloc(p->dim, sizeof(float)); + s->xb = calloc(p->dim, sizeof(float)); + s->xb2 = calloc(p->dim, sizeof(float)); + s->hb = calloc(p->hidden_dim, sizeof(float)); + s->hb2 = calloc(p->hidden_dim, sizeof(float)); + s->q = calloc(p->dim, sizeof(float)); + s->key_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float)); + s->value_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float)); + s->att = calloc(p->n_heads * p->seq_len, sizeof(float)); + s->logits = calloc(p->vocab_size, sizeof(float)); + // ensure all mallocs went fine + if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q + || !s->key_cache || !s->value_cache || !s->att || !s->logits) { + fprintf(stderr, "malloc failed!\n"); + exit(EXIT_FAILURE); + } +} + +void free_run_state(RunState* s) { + free(s->x); + free(s->xb); + free(s->xb2); + free(s->hb); + free(s->hb2); + free(s->q); + free(s->att); + free(s->logits); + free(s->key_cache); + free(s->value_cache); +} + +void memory_map_weights(TransformerWeights *w, Config* p, float* ptr, int shared_weights) { + int head_size = p->dim / p->n_heads; + // make sure the multiplications below are done in 64bit to fit the parameter counts of 13B+ models + unsigned long long n_layers = p->n_layers; + w->token_embedding_table = ptr; + ptr += p->vocab_size * p->dim; + w->rms_att_weight = ptr; + ptr += n_layers * p->dim; + w->wq = ptr; + ptr += n_layers * p->dim * (p->n_heads * head_size); + w->wk = ptr; + ptr += n_layers * p->dim * (p->n_kv_heads * head_size); + w->wv = ptr; + ptr += n_layers * p->dim * (p->n_kv_heads * head_size); + w->wo = ptr; + ptr += n_layers * (p->n_heads * head_size) * p->dim; + w->rms_ffn_weight = ptr; + ptr += 
n_layers * p->dim; + w->w1 = ptr; + ptr += n_layers * p->dim * p->hidden_dim; + w->w2 = ptr; + ptr += n_layers * p->hidden_dim * p->dim; + w->w3 = ptr; + ptr += n_layers * p->dim * p->hidden_dim; + w->rms_final_weight = ptr; + ptr += p->dim; + ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_real (for RoPE) + ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_imag (for RoPE) + w->wcls = shared_weights ? w->token_embedding_table : ptr; +} + +void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights, + int* fd, float** data, ssize_t* file_size) { + BIN *file = bopen((char *) checkpoint_data, checkpoint_size); + // read in the config header + bread(config, sizeof(Config), 1, file); + // negative vocab size is hacky way of signaling unshared weights. bit yikes. + int shared_weights = config->vocab_size > 0 ? 1 : 0; + config->vocab_size = abs(config->vocab_size); + // figure out the file size + bclose(file); + + float* weights_ptr = checkpoint_data + sizeof(Config)/sizeof(float); + + memory_map_weights(weights, config, weights_ptr, shared_weights); +} + +void build_transformer(Transformer *t, char* checkpoint_path) { + // read in the Config and the Weights from the checkpoint + read_checkpoint(checkpoint_path, &t->config, &t->weights, &t->fd, &t->data, &t->file_size); + // allocate the RunState buffers + malloc_run_state(&t->state, &t->config); +} + +void free_transformer(Transformer* t) { + // free the RunState buffers + free_run_state(&t->state); +} + +// ---------------------------------------------------------------------------- +// neural net blocks; the dynamics of the Transformer + +void rmsnorm(float* o, float* x, float* weight, int size) { + // calculate sum of squares + float ss = 0.0f; + for (int j = 0; j < size; j++) { + ss += x[j] * x[j]; + } + ss /= size; + ss += 1e-5f; + ss = 1.0f / sqrtf(ss); + // normalize and scale + for (int j = 0; j < size; j++) { + o[j] = weight[j] * (ss * x[j]); + } +} + +void softmax(float* x, int size) { + // find max value (for numerical stability) + float max_val = x[0]; + for (int i = 1; i < size; i++) { + if (x[i] > max_val) { + max_val = x[i]; + } + } + // exp and sum + float sum = 0.0f; + for (int i = 0; i < size; i++) { + x[i] = expf(x[i] - max_val); + sum += x[i]; + } + // normalize + for (int i = 0; i < size; i++) { + x[i] /= sum; + } +} + +void matmul(float* xout, float* x, float* w, int n, int d) { + // W (d,n) @ x (n,) -> xout (d,) + // by far the most amount of time is spent inside this little function + int i; + #pragma omp parallel for private(i) + for (i = 0; i < d; i++) { + float val = 0.0f; + for (int j = 0; j < n; j++) { + val += w[i * n + j] * x[j]; + } + xout[i] = val; + } +} + +float* forward(Transformer* transformer, int token, int pos) { + + // a few convenience variables + Config* p = &transformer->config; + TransformerWeights* w = &transformer->weights; + RunState* s = &transformer->state; + float *x = s->x; + int dim = p->dim; + int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads; + int kv_mul = p->n_heads / p->n_kv_heads; // integer multiplier of the kv sharing in multiquery + int hidden_dim = p->hidden_dim; + int head_size = dim / p->n_heads; + + // copy the token embedding into x + float* content_row = w->token_embedding_table + token * dim; + memcpy(x, content_row, dim*sizeof(*x)); + + // forward all the layers + for(unsigned long long l = 0; l < p->n_layers; l++) { + + // attention rmsnorm + rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim); + + // key and value 
point to the kv cache + int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience + s->k = s->key_cache + loff + pos * kv_dim; + s->v = s->value_cache + loff + pos * kv_dim; + + // qkv matmuls for this position + matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim); + matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim); + matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim); + + // RoPE relative positional encoding: complex-valued rotate q and k in each head + for (int i = 0; i < dim; i+=2) { + int head_dim = i % head_size; + float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size); + float val = pos * freq; + float fcr = cosf(val); + float fci = sinf(val); + int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only + for (int v = 0; v < rotn; v++) { + float* vec = v == 0 ? s->q : s->k; // the vector to rotate (query or key) + float v0 = vec[i]; + float v1 = vec[i+1]; + vec[i] = v0 * fcr - v1 * fci; + vec[i+1] = v0 * fci + v1 * fcr; + } + } + + // multihead attention. iterate over all heads + int h; + #pragma omp parallel for private(h) + for (h = 0; h < p->n_heads; h++) { + // get the query vector for this head + float* q = s->q + h * head_size; + // attention scores for this head + float* att = s->att + h * p->seq_len; + // iterate over all timesteps, including the current one + for (int t = 0; t <= pos; t++) { + // get the key vector for this head and at this timestep + float* k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size; + // calculate the attention score as the dot product of q and k + float score = 0.0f; + for (int i = 0; i < head_size; i++) { + score += q[i] * k[i]; + } + score /= sqrtf(head_size); + // save the score to the attention buffer + att[t] = score; + } + + // softmax the scores to get attention weights, from 0..pos inclusively + softmax(att, pos + 1); + + // weighted sum of the values, store back into xb + float* xb = s->xb + h * head_size; + memset(xb, 0, head_size * sizeof(float)); + for (int t = 0; t <= pos; t++) { + // get the value vector for this head and at this timestep + float* v = s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size; + // get the attention weight for this timestep + float a = att[t]; + // accumulate the weighted value into xb + for (int i = 0; i < head_size; i++) { + xb[i] += a * v[i]; + } + } + } + + // final matmul to get the output of the attention + matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim); + + // residual connection back into x + for (int i = 0; i < dim; i++) { + x[i] += s->xb2[i]; + } + + // ffn rmsnorm + rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim); + + // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x)) + // first calculate self.w1(x) and self.w3(x) + matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim); + matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim); + + // SwiGLU non-linearity + for (int i = 0; i < hidden_dim; i++) { + float val = s->hb[i]; + // silu(x)=x*σ(x), where σ(x) is the logistic sigmoid + val *= (1.0f / (1.0f + expf(-val))); + // elementwise multiply with w3(x) + val *= s->hb2[i]; + s->hb[i] = val; + } + + // final matmul to get the output of the ffn + matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim); + + // residual connection + for (int i = 0; i < dim; i++) { + x[i] += s->xb[i]; + } + } + + // final rmsnorm + rmsnorm(x, x, w->rms_final_weight, dim); + + // classifier into logits + matmul(s->logits, x, w->wcls, p->dim, p->vocab_size); + return s->logits; +} + +// 
---------------------------------------------------------------------------- +// The Byte Pair Encoding (BPE) Tokenizer that translates strings <-> tokens + +typedef struct { + char *str; + int id; +} TokenIndex; + +typedef struct { + char** vocab; + float* vocab_scores; + TokenIndex *sorted_vocab; + int vocab_size; + unsigned int max_token_length; + unsigned char byte_pieces[512]; // stores all single-byte strings +} Tokenizer; + +int compare_tokens(const void *a, const void *b) { + return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str); +} + +void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) { + // i should have written the vocab_size into the tokenizer file... sigh + t->vocab_size = vocab_size; + // malloc space to hold the scores and the strings + t->vocab = (char**)malloc(vocab_size * sizeof(char*)); + t->vocab_scores = (float*)malloc(vocab_size * sizeof(float)); + t->sorted_vocab = NULL; // initialized lazily + for (int i = 0; i < 256; i++) { + t->byte_pieces[i * 2] = (unsigned char)i; + t->byte_pieces[i * 2 + 1] = '\0'; + } + // read in the file + BIN *file = bopen((char *) tokenizer_data, tokenizer_size); + bread(&t->max_token_length, sizeof(int), 1, file); + int len; + for (int i = 0; i < vocab_size; i++) { + bread(t->vocab_scores + i, sizeof(float), 1, file); + bread(&len, sizeof(int), 1, file); + t->vocab[i] = (char *)malloc(len + 1); + bread(t->vocab[i], len, 1, file); + t->vocab[i][len] = '\0'; // add the string terminating token + } + bclose(file); +} + +void free_tokenizer(Tokenizer* t) { + for (int i = 0; i < t->vocab_size; i++) { free(t->vocab[i]); } + free(t->vocab); + free(t->vocab_scores); + free(t->sorted_vocab); +} + +char* decode(Tokenizer* t, int prev_token, int token) { + char *piece = t->vocab[token]; + // following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89) + if (prev_token == 1 && piece[0] == ' ') { piece++; } + // careful, some tokens designate raw bytes, and look like e.g. '<0x01>' + // parse this and convert and return the actual byte + unsigned char byte_val; + if (sscanf(piece, "<0x%02hhX>", &byte_val) == 1) { + piece = (char*)t->byte_pieces + byte_val * 2; + } + return piece; +} + +void safe_printf(char *piece) { + // piece might be a raw byte token, and we only want to print printable chars or whitespace + // because some of the other bytes can be various control codes, backspace, etc. + if (piece == NULL) { return; } + if (piece[0] == '\0') { return; } + if (piece[1] == '\0') { + unsigned char byte_val = piece[0]; + if (!(isprint(byte_val) || isspace(byte_val))) { + return; // bad byte, don't print it + } + } +} + +int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) { + // efficiently find the perfect match for str in vocab, return its index or -1 if not found + TokenIndex tok = { .str = str }; // acts as the key to search for + TokenIndex *res = bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens); + return res != NULL ? 
res->id : -1; +} + +void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) { + // encode the string text (input) into an upper-bound preallocated tokens[] array + // bos != 0 means prepend the BOS token (=1), eos != 0 means append the EOS token (=2) + if (text == NULL) { fprintf(stderr, "cannot encode NULL text\n"); exit(EXIT_FAILURE); } + + if (t->sorted_vocab == NULL) { + // lazily malloc and sort the vocabulary + t->sorted_vocab = malloc(t->vocab_size * sizeof(TokenIndex)); + for (int i = 0; i < t->vocab_size; i++) { + t->sorted_vocab[i].str = t->vocab[i]; + t->sorted_vocab[i].id = i; + } + qsort(t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens); + } + + // create a temporary buffer that will store merge candidates of always two consecutive tokens + // *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_length is 1) + char* str_buffer = malloc((t->max_token_length*2 +1 +2) * sizeof(char)); + size_t str_len = 0; + + // start at 0 tokens + *n_tokens = 0; + + // add optional BOS (=1) token, if desired + if (bos) tokens[(*n_tokens)++] = 1; + + // add_dummy_prefix is true by default + // so prepend a dummy prefix token to the input string, but only if text != "" + // TODO: pretty sure this isn't correct in the general case but I don't have the + // energy to read more of the sentencepiece code to figure out what it's doing + if (text[0] != '\0') { + int dummy_prefix = str_lookup(" ", t->sorted_vocab, t->vocab_size); + tokens[(*n_tokens)++] = dummy_prefix; + } + + // Okay UTF-8 time. This will get messy. Here is the reference from Wikipedia: + // Code point ↔ UTF-8 conversion + // First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4 + // U+0000 U+007F 0xxxxxxx + // U+0080 U+07FF 110xxxxx 10xxxxxx + // U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx + // U+10000 U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + + // process the raw (UTF-8) byte sequence of the input string + for (char *c = text; *c != '\0'; c++) { + + // reset buffer if the current byte is ASCII or a leading byte + // 0xC0 is 11000000, so (*c & 0xC0) keeps the first 2 bits and zeros the rest + // 0x80 is 10000000 + // in UTF-8, all continuation bytes start with "10" in first two bits + // so in English this is: "if this byte is not a continuation byte" + if ((*c & 0xC0) != 0x80) { + // this byte must be either a leading byte (11...) or an ASCII char (0x...) + // => reset our location, as we're starting a new UTF-8 codepoint + str_len = 0; + } + + // append the current byte to the buffer + str_buffer[str_len++] = *c; // ++ is post-increment, incremented after this line + str_buffer[str_len] = '\0'; + + // while the next character is a continuation byte, continue appending + // but if there are too many of them, just stop to avoid overruning str_buffer size. 
+ if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) { + continue; + } + + // ok c+1 is not a continuation byte, so we've read in a full codepoint + int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size); + + if (id != -1) { + // we found this codepoint in vocab, add it as a token + tokens[(*n_tokens)++] = id; + } else { + // byte_fallback encoding: just encode each byte as a token + // +3 is here because the first 3 vocab elements are , , + // so the individual bytes only start at index 3 + for (int i=0; i < str_len; i++) { + tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3; + } + } + str_len = 0; // protect against a sequence of stray UTF8 continuation bytes + } + + // merge the best consecutive pair each iteration, according the scores in vocab_scores + while (1) { + float best_score = -1e10; + int best_id = -1; + int best_idx = -1; + + for (int i=0; i < (*n_tokens-1); i++) { + // check if we can merge the pair (tokens[i], tokens[i+1]) + sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]); + int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size); + if (id != -1 && t->vocab_scores[id] > best_score) { + // this merge pair exists in vocab! record its score and position + best_score = t->vocab_scores[id]; + best_id = id; + best_idx = i; + } + } + + if (best_idx == -1) { + break; // we couldn't find any more pairs to merge, so we're done + } + + // merge the consecutive pair (best_idx, best_idx+1) into new token best_id + tokens[best_idx] = best_id; + // delete token at position best_idx+1, shift the entire sequence back 1 + for (int i = best_idx+1; i < (*n_tokens-1); i++) { + tokens[i] = tokens[i+1]; + } + (*n_tokens)--; // token length decreased + } + + // add optional EOS (=2) token, if desired + if (eos) tokens[(*n_tokens)++] = 2; + + free(str_buffer); +} + +// ---------------------------------------------------------------------------- +// The Sampler, which takes logits and returns a sampled token +// sampling can be done in a few ways: greedy argmax, sampling, top-p sampling + +typedef struct { + float prob; + int index; +} ProbIndex; // struct used when sorting probabilities during top-p sampling + +typedef struct { + int vocab_size; + ProbIndex* probindex; // buffer used in top-p sampling + float temperature; + float topp; + unsigned long long rng_state; +} Sampler; + +int sample_argmax(float* probabilities, int n) { + // return the index that has the highest probability + int max_i = 0; + float max_p = probabilities[0]; + for (int i = 1; i < n; i++) { + if (probabilities[i] > max_p) { + max_i = i; + max_p = probabilities[i]; + } + } + return max_i; +} + +int sample_mult(float* probabilities, int n, float coin) { + // sample index from probabilities (they must sum to 1!) + // coin is a random number in [0, 1), usually from random_f32() + float cdf = 0.0f; + for (int i = 0; i < n; i++) { + cdf += probabilities[i]; + if (coin < cdf) { + return i; + } + } + return n - 1; // in case of rounding errors +} + +int compare(const void* a, const void* b) { + ProbIndex* a_ = (ProbIndex*) a; + ProbIndex* b_ = (ProbIndex*) b; + if (a_->prob > b_->prob) return -1; + if (a_->prob < b_->prob) return 1; + return 0; +} + +int sample_topp(float* probabilities, int n, float topp, ProbIndex* probindex, float coin) { + // top-p sampling (or "nucleus sampling") samples from the smallest set of + // tokens that exceed probability topp. This way we never sample tokens that + // have very low probabilities and are less likely to go "off the rails". 
+ // coin is a random number in [0, 1), usually from random_f32() + + int n0 = 0; + // quicksort indices in descending order of probabilities + // values smaller than (1 - topp) / (n - 1) cannot be part of the result + // so for efficiency we crop these out as candidates before sorting + const float cutoff = (1.0f - topp) / (n - 1); + for (int i = 0; i < n; i++) { + if (probabilities[i] >= cutoff) { + probindex[n0].index = i; + probindex[n0].prob = probabilities[i]; + n0++; + } + } + qsort(probindex, n0, sizeof(ProbIndex), compare); + + // truncate the list where cumulative probability exceeds topp + float cumulative_prob = 0.0f; + int last_idx = n0 - 1; // in case of rounding errors consider all elements + for (int i = 0; i < n0; i++) { + cumulative_prob += probindex[i].prob; + if (cumulative_prob > topp) { + last_idx = i; + break; // we've exceeded topp by including last_idx + } + } + + // sample from the truncated list + float r = coin * cumulative_prob; + float cdf = 0.0f; + for (int i = 0; i <= last_idx; i++) { + cdf += probindex[i].prob; + if (r < cdf) { + return probindex[i].index; + } + } + return probindex[last_idx].index; // in case of rounding errors +} + +void build_sampler(Sampler* sampler, int vocab_size, float temperature, float topp, unsigned long long rng_seed) { + sampler->vocab_size = vocab_size; + sampler->temperature = temperature; + sampler->topp = topp; + sampler->rng_state = rng_seed; + // buffer only used with nucleus sampling; may not need but it's ~small + sampler->probindex = malloc(sampler->vocab_size * sizeof(ProbIndex)); +} + +void free_sampler(Sampler* sampler) { + free(sampler->probindex); +} + +unsigned int random_u32(unsigned long long *state) { + // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A + *state ^= *state >> 12; + *state ^= *state << 25; + *state ^= *state >> 27; + return (*state * 0x2545F4914F6CDD1Dull) >> 32; +} +float random_f32(unsigned long long *state) { // random float32 in [0,1) + return (random_u32(state) >> 8) / 16777216.0f; +} + +int sample(Sampler* sampler, float* logits) { + // sample the token given the logits and some hyperparameters + int next; + if (sampler->temperature == 0.0f) { + // greedy argmax sampling: take the token with the highest probability + next = sample_argmax(logits, sampler->vocab_size); + } else { + // apply the temperature to the logits + for (int q=0; qvocab_size; q++) { logits[q] /= sampler->temperature; } + // apply softmax to the logits to get the probabilities for next token + softmax(logits, sampler->vocab_size); + // flip a (float) coin (this is our source of entropy for sampling) + float coin = random_f32(&sampler->rng_state); + // we sample from this distribution to get the next token + if (sampler->topp <= 0 || sampler->topp >= 1) { + // simply sample from the predicted probability distribution + next = sample_mult(logits, sampler->vocab_size, coin); + } else { + // top-p (nucleus) sampling, clamping the least likely tokens to zero + next = sample_topp(logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); + } + } + return next; +} + +// ---------------------------------------------------------------------------- +// utilities: time + +long time_in_ms() { + return 0; +} + +// ---------------------------------------------------------------------------- +// generation loop + +void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, char *prompt, int steps) { + char *empty_prompt = ""; + if (prompt == NULL) { prompt = empty_prompt; } + + // encode 
the (string) prompt into tokens sequence + int num_prompt_tokens = 0; + int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int)); // +3 for '\0', ?BOS, ?EOS + encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens); + if (num_prompt_tokens < 1) { + fprintf(stderr, "something is wrong, expected at least 1 prompt token\n"); + exit(EXIT_FAILURE); + } + + // start the main loop + long start = 0; // used to time our code, only initialized after first iteration + int next; // will store the next token in the sequence + int token = prompt_tokens[0]; // kick off with the first token in the prompt + int pos = 0; // position in the sequence + while (pos < steps) { + + // forward the transformer to get logits for the next token + float* logits = forward(transformer, token, pos); + + // advance the state machine + if (pos < num_prompt_tokens - 1) { + // if we are still processing the input prompt, force the next prompt token + next = prompt_tokens[pos + 1]; + } else { + // otherwise sample the next token from the logits + next = sample(sampler, logits); + } + pos++; + + // data-dependent terminating condition: the BOS (=1) token delimits sequences + if (next == 1) { break; } + + // print the token as string, decode it with the Tokenizer object + char* piece = decode(tokenizer, token, next); + safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes + token = next; + + // init the timer here because the first iteration can be slower + if (start == 0) { start = time_in_ms(); } + } + + // report achieved tok/s (pos-1 because the timer starts after first iteration) + if (pos > 1) { + long end = time_in_ms(); + fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000); + } + + free(prompt_tokens); +} + +void read_stdin(const char* guide, char* buffer, size_t bufsize) { + // read a line from stdin, up to but not including \n + printf("%s", guide); + if (fgets(buffer, bufsize, stdin) != NULL) { + size_t len = strlen(buffer); + if (len > 0 && buffer[len - 1] == '\n') { + buffer[len - 1] = '\0'; // strip newline + } + } +} + +// ---------------------------------------------------------------------------- +// chat loop +// I manually inspected the tokens for a few chat conversations compared to +// python reference and that seemed ok, but this was not thoroughly tested and +// is not safely implemented, it's more a proof of concept atm. + +void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + char *cli_user_prompt, char *cli_system_prompt, int steps) { + + // buffers for reading the system prompt and user prompt from stdin + // you'll notice they are soomewhat haphazardly and unsafely set atm + char system_prompt[512]; + char user_prompt[512]; + char rendered_prompt[1152]; + int num_prompt_tokens = 0; + int* prompt_tokens = (int*)malloc(1152 * sizeof(int)); + int user_idx; + + // start the main loop + int8_t user_turn = 1; // user starts + int next; // will store the next token in the sequence + int token; // stores the current token to feed into the transformer + int prev_token; + int pos = 0; // position in the sequence + while (pos < steps) { + + // when it is the user's turn to contribute tokens to the dialog... 
+ if (user_turn) { + // get the (optional) system prompt at position 0 + if (pos == 0) { + // at position 0, the user can also contribute a system prompt + if (cli_system_prompt == NULL) { + // system prompt was not passed in, attempt to get it from stdin + read_stdin("Enter system prompt (optional): ", system_prompt, sizeof(system_prompt)); + } else { + // system prompt was passed in, use it + strcpy(system_prompt, cli_system_prompt); + } + } + // get the user prompt + if (pos == 0 && cli_user_prompt != NULL) { + // user prompt for position 0 was passed in, use it + strcpy(user_prompt, cli_user_prompt); + } else { + // otherwise get user prompt from stdin + read_stdin("User: ", user_prompt, sizeof(user_prompt)); + } + // render user/system prompts into the Llama 2 Chat schema + if (pos == 0 && system_prompt[0] != '\0') { + char system_template[] = "[INST] <>\n%s\n<>\n\n%s [/INST]"; + sprintf(rendered_prompt, system_template, system_prompt, user_prompt); + } else { + char user_template[] = "[INST] %s [/INST]"; + sprintf(rendered_prompt, user_template, user_prompt); + } + // encode the rendered prompt into tokens + encode(tokenizer, rendered_prompt, 1, 0, prompt_tokens, &num_prompt_tokens); + user_idx = 0; // reset the user index + user_turn = 0; + printf("Assistant: "); + } + + // determine the token to pass into the transformer next + if (user_idx < num_prompt_tokens) { + // if we are still processing the input prompt, force the next prompt token + token = prompt_tokens[user_idx++]; + } else { + // otherwise use the next token sampled from previous turn + token = next; + } + // EOS (=2) token ends the Assistant turn + if (token == 2) { user_turn = 1; } + + // forward the transformer to get logits for the next token + float* logits = forward(transformer, token, pos); + next = sample(sampler, logits); + pos++; + + if (user_idx >= num_prompt_tokens && next != 2) { + // the Assistant is responding, so print its output + char* piece = decode(tokenizer, token, next); + safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes + fflush(stdout); + } + if (next == 2) { printf("\n"); } + } + printf("\n"); + free(prompt_tokens); +} + + +// ---------------------------------------------------------------------------- +// CLI, include only if not testing +#ifndef TESTING + +void error_usage() { + fprintf(stderr, "Usage: run [options]\n"); + fprintf(stderr, "Example: run model.bin -n 256 -i \"Once upon a time\"\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); + fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1] default 0.9\n"); + fprintf(stderr, " -s random seed, default time(NULL)\n"); + fprintf(stderr, " -n number of steps to run for, default 256. 0 = max_seq_len\n"); + fprintf(stderr, " -i input prompt\n"); + fprintf(stderr, " -z optional path to custom tokenizer\n"); + fprintf(stderr, " -m mode: generate|chat, default: generate\n"); + fprintf(stderr, " -y (optional) system prompt in chat mode\n"); + exit(EXIT_FAILURE); +} + +int test(int argc, char *argv[]) { + setup_mtvec(); + // default parameters + char checkpoint_path[64] = "./model/stories15M.bin"; // e.g. out/model.bin + char *tokenizer_path = "tokenizer.bin"; + float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher + float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 
0.9 works well, but slower + int steps = 256; // number of steps to run for + char *prompt = NULL; // prompt string + unsigned long long rng_seed = 0; // seed rng with time by default + char *mode = "generate"; // generate|chat + char *system_prompt = NULL; // the (optional) system prompt to use in chat mode + + // build the Transformer via the model .bin file + Transformer transformer; + build_transformer(&transformer, checkpoint_path); + if (steps == 0 || steps > transformer.config.seq_len) steps = transformer.config.seq_len; // override to ~max length + + + // build the Tokenizer via the tokenizer .bin file + Tokenizer tokenizer; + build_tokenizer(&tokenizer, tokenizer_path, transformer.config.vocab_size); + + + // build the Sampler + Sampler sampler; + build_sampler(&sampler, transformer.config.vocab_size, temperature, topp, rng_seed); + // run! + if (strcmp(mode, "generate") == 0) { + generate(&transformer, &tokenizer, &sampler, prompt, steps); + } else if (strcmp(mode, "chat") == 0) { + chat(&transformer, &tokenizer, &sampler, prompt, system_prompt, steps); + } else { + fprintf(stderr, "unknown mode: %s\n", mode); + error_usage(); + } + + // memory and file handles cleanup + free_sampler(&sampler); + free_tokenizer(&tokenizer); + free_transformer(&transformer); + return 0; +} +#endif diff --git a/tests/perf/llama/trap.c b/tests/perf/llama/trap.c new file mode 100644 index 000000000..01bab6740 --- /dev/null +++ b/tests/perf/llama/trap.c @@ -0,0 +1,56 @@ +#include "trap.h" +#include + +void __attribute__((aligned(4))) trap_handler() { + unsigned long mcause, mtval, mepc; + asm volatile( + "csrr %0, mcause" + : "=r" (mcause) + ); + asm volatile( + "csrr %0, mtval" + : "=r" (mtval) + ); + asm volatile( + "csrr %0, mepc" + : "=r" (mepc) + ); + print_s("Exception: "); + print_s("\nmcause: "); + dump_hex(mcause); + print_s("mtval: "); + dump_hex(mtval); + print_s("mepc: "); + dump_hex(mepc); + while(1); +} + +void setup_mtvec() { + void* ptr = &trap_handler; + print_s("setting mtvec to "); + dump_hex((unsigned long)ptr); + print_s("\n"); + if (((unsigned long)ptr & 0b11)) { + print_s("Error! 
mtvec does not aligned to 4B\n"); + } + asm volatile( + "csrw mtvec, %0" + : + : "r" (ptr) + ); +} + +void enter_smode() { + asm volatile("csrc mstatus, %0" : : "r" (0x1800)); // clear mpp to zero + asm volatile("csrs mstatus, %0" : : "r" (0x0800)); // set mpp to s-mode + asm volatile( + ".option arch, -c\n" + "auipc a0, 0\n" + "addi a0, a0, 16\n" + "csrw mepc, a0\n" + "mret\n" + : + : + : "a0" + ); +} diff --git a/tests/perf/llama/trap.h b/tests/perf/llama/trap.h new file mode 100644 index 000000000..7165e52c8 --- /dev/null +++ b/tests/perf/llama/trap.h @@ -0,0 +1,5 @@ +#include + +void setup_mtvec(); +void trap_handler(); +void enter_smode(); diff --git a/tests/perf/llama/utils.h b/tests/perf/llama/utils.h new file mode 100644 index 000000000..3cacdc726 --- /dev/null +++ b/tests/perf/llama/utils.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include + +// provide FILE-like operations for on-memory data + +typedef struct { + const char *data_begin; + size_t size; + const char *cursor; +} BIN; + +inline BIN *bopen(const char *data_begin, size_t size) { + BIN *bin = (BIN *) malloc(sizeof(BIN)); + bin->cursor = bin->data_begin = data_begin; + bin->size = size; + return bin; +} + +inline void bclose(BIN *bin) { + free(bin); +} + +inline size_t bread(void *buffer, size_t size, size_t count, BIN *stream) { + memcpy(buffer, (void *) stream->cursor, size * count); + stream->cursor += size * count; + return count; +} + +inline size_t bseek(BIN *bin, long offset, int origin) { + switch (origin) { + case SEEK_SET: + bin->cursor = bin->data_begin + offset; break; + case SEEK_CUR: + bin->cursor += offset; break; + case SEEK_END: + bin->cursor = bin->data_begin + bin->size - offset; break; + default: + return 1; + } + return 0; +} + From 4ff9380f684fafb0e141e321b489f2d16e09b402 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Sun, 5 May 2024 16:22:09 +0800 Subject: [PATCH 05/10] [emurt] cleanup --- nix/overlay.nix | 5 +-- script/src/Main.scala | 2 +- tests/emurt/emurt.c | 70 ++++++++++++++--------------------------- tests/emurt/emurt.h | 6 ---- tests/perf/llama/trap.c | 45 +++++++++----------------- 5 files changed, 42 insertions(+), 86 deletions(-) diff --git a/nix/overlay.nix b/nix/overlay.nix index 5367484aa..bdaf9dff4 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -42,17 +42,18 @@ in let major = final.lib.versions.major rv32_buildPkgs.${llvmForRVV_attrName}.release_version; + # By default, compiler-rt and newlib for rv32 are built with double float point abi by default. 
+ # We need to override it with `-mabi=ilp32f` + # compiler-rt requires the compilation flag -fforce-enable-int128, only clang provides that compilerrt = (rv32_pkgs.${llvmForRVV_attrName}.compiler-rt.override { stdenv = rv32_pkgs.overrideCC rv32_pkgs.stdenv rv32_buildPkgs.${llvmForRVV_attrName}.clangNoCompilerRt; }).overrideAttrs (oldAttrs: { - NIX_DEBUG = 1; env.NIX_CFLAGS_COMPILE = "-march=rv32gcv -mabi=ilp32f"; }); - # newlib is built with double float point abi by default, override it newlib = rv32_pkgs.stdenv.cc.libc.overrideAttrs (oldAttrs: { CFLAGS_FOR_TARGET = "-march=rv32gcv -mabi=ilp32f"; }); diff --git a/script/src/Main.scala b/script/src/Main.scala index 2412f7a70..d31ab9b1a 100644 --- a/script/src/Main.scala +++ b/script/src/Main.scala @@ -177,7 +177,7 @@ object Main: ) noLog: Flag = Flag(false), @arg( name = "no-file-logging", - doc = "prevent emulator print log to console" + doc = "prevent emulator print log to file" ) noFileLog: Flag = Flag(true), @arg( name = "no-console-logging", diff --git a/tests/emurt/emurt.c b/tests/emurt/emurt.c index 9f4a7a4df..f52a57d36 100644 --- a/tests/emurt/emurt.c +++ b/tests/emurt/emurt.c @@ -1,5 +1,9 @@ #include "emurt.h" +/////////////////////// +// uart +/////////////////////// + struct uartlite_regs *const ttyUL0 = (struct uartlite_regs *)0x10000000; #define SR_TX_FIFO_FULL (1<<3) /* transmit FIFO full */ @@ -41,6 +45,19 @@ void get(char *s, int n) { s[i] = '\0'; } +int _write(int file, char* ptr, int len) { + int i = 0; + for (; i < len; i++) { + uart_put_c(ptr[i]); + } + return i; +} + +void _exit(int code) { + __asm__("csrwi 0x7cc, 0"); + __builtin_unreachable(); +} + void print_s(const char *c) { while (*c) { uart_put_c(*c); @@ -48,40 +65,9 @@ void print_s(const char *c) { } } -void print_long(long x) { - char buffer[30]; - if (x < 0) { - uart_put_c('-'); - x = -x; - } - int idx = 0; - while (x) { - long new_x = x / 10; - long rem_x = x % 10; - buffer[idx ++] = '0' + rem_x; - x = new_x; - } - if (idx == 0) uart_put_c('0'); - else while (idx) uart_put_c(buffer[--idx]); -} - -void print_digit(unsigned char x) { - uart_put_c('0'+x); -} - -void dump_hex(unsigned long x) { - uart_put_c('0'); - uart_put_c('x'); - char buffer[16]; - for (int i=0;i<16;i++) { - unsigned long cur = x & 0xf; - buffer[i] = cur < 10 ? 
('0' + cur) : ('a' + cur - 10); - x >>= 4; - } - for (int i=15;i>=0;i--) uart_put_c(buffer[i]); - uart_put_c('\r'); - uart_put_c('\n'); -} +/////////////////////// +// allocation +/////////////////////// extern char* __heapbegin; char *heap_top; @@ -97,6 +83,10 @@ char *_sbrk(int nbytes) { return base; } +/////////////////////// +// unimplemented +/////////////////////// + // We don't support FS int _isatty(int file) { return -1; @@ -117,11 +107,6 @@ int _fstat(int file, struct stat* st) { return -1; } -void _exit(int code) { - __asm__("csrwi 0x7cc, 0"); - __builtin_unreachable(); -} - // We don't support close int _close(int file) { return -1; @@ -137,10 +122,3 @@ int _read(int file, char* ptr, int len) { return -1; } -int _write(int file, char* ptr, int len) { - int i = 0; - for (; i < len; i++) { - uart_put_c(ptr[i]); - } - return i; -} diff --git a/tests/emurt/emurt.h b/tests/emurt/emurt.h index 53eb22de8..b31c1b8f9 100644 --- a/tests/emurt/emurt.h +++ b/tests/emurt/emurt.h @@ -9,12 +9,6 @@ struct uartlite_regs { volatile unsigned int control; }; -void uart_put_c(char c); -char uart_check_read(); -char uart_get_c(); void get(char *s, int n); void print_s(const char *c); -void print_long(long x); -void print_digit(unsigned char x); -void dump_hex(unsigned long x); diff --git a/tests/perf/llama/trap.c b/tests/perf/llama/trap.c index 01bab6740..1cede6e87 100644 --- a/tests/perf/llama/trap.c +++ b/tests/perf/llama/trap.c @@ -1,38 +1,22 @@ -#include "trap.h" #include +#include + +#include "trap.h" void __attribute__((aligned(4))) trap_handler() { unsigned long mcause, mtval, mepc; asm volatile( - "csrr %0, mcause" - : "=r" (mcause) - ); - asm volatile( - "csrr %0, mtval" - : "=r" (mtval) - ); - asm volatile( - "csrr %0, mepc" - : "=r" (mepc) + "csrr %0, mcause\n" + "csrr %1, mtval\n" + "csrr %2, mepc\n" + : "=r" (mcause), "=r" (mtval), "=r" (mepc) ); - print_s("Exception: "); - print_s("\nmcause: "); - dump_hex(mcause); - print_s("mtval: "); - dump_hex(mtval); - print_s("mepc: "); - dump_hex(mepc); + printf("Exception: mcause=%08lx, mtval=%08lx, mepc=%08lx", mcause, mtval, mepc); while(1); } void setup_mtvec() { void* ptr = &trap_handler; - print_s("setting mtvec to "); - dump_hex((unsigned long)ptr); - print_s("\n"); - if (((unsigned long)ptr & 0b11)) { - print_s("Error! mtvec does not aligned to 4B\n"); - } asm volatile( "csrw mtvec, %0" : @@ -41,16 +25,15 @@ void setup_mtvec() { } void enter_smode() { - asm volatile("csrc mstatus, %0" : : "r" (0x1800)); // clear mpp to zero - asm volatile("csrs mstatus, %0" : : "r" (0x0800)); // set mpp to s-mode asm volatile( + "csrc mstatus, %0\n" + "csrs mstatus, %1\n" ".option arch, -c\n" - "auipc a0, 0\n" - "addi a0, a0, 16\n" - "csrw mepc, a0\n" + "auipc %2, 0\n" + "addi %2, %2, 16\n" + "csrw mepc, %2\n" "mret\n" : - : - : "a0" + : "r" (0x1800), "r" (0x0800), "r" (0x10) ); } From dca81eb707191a6e8c88e487f93081f498f91664 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Sun, 5 May 2024 16:23:55 +0800 Subject: [PATCH 06/10] [doc] add doc for perf cases --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3f3feeea3..a1c268906 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,7 @@ The `tests/` contains the testcases. There are four types of testcases: - intrinsic - mlir - codegen +- perf To add new testcases for asm/intrinsic/mlir, create a new directory with `default.nix` and source files. Refer to the existing code for more information on how to write the nix file. 
From ce765fe942f21528d4691ed1908a132e95feff3f Mon Sep 17 00:00:00 2001 From: SharzyL Date: Sun, 5 May 2024 16:22:52 +0800 Subject: [PATCH 07/10] [cosim] support print call frame --- ipemu/csrc/elf.cc | 138 ++++++++++++++++++++++++++++++------- ipemu/csrc/simple_sim.h | 18 ++++- ipemu/csrc/spdlog_ext.cc | 53 ++++++++------ ipemu/csrc/vbridge_impl.cc | 31 ++++++++- ipemu/csrc/vbridge_impl.h | 9 +++ 5 files changed, 199 insertions(+), 50 deletions(-) diff --git a/ipemu/csrc/elf.cc b/ipemu/csrc/elf.cc index 223783cf0..c6a1e3f40 100644 --- a/ipemu/csrc/elf.cc +++ b/ipemu/csrc/elf.cc @@ -2,40 +2,126 @@ #include #include +#include -simple_sim::load_elf_result_t simple_sim::load_elf(const std::string &fname) { +// convert little-endian integral type to host-endian +template +T from_le(T value) { + static_assert(std::is_integral::value, "T must be an integral type"); + + if constexpr (sizeof(T) == 1) { + return value; + } else if constexpr (sizeof(T) == 2) { + return le16toh(value); + } else if constexpr (sizeof(T) == 4) { + return le32toh(value); + } else if constexpr (sizeof(T) == 8) { + return le64toh(value); + } else { + static_assert(sizeof(T) <= 8, "Unsupported type size"); + } +} + +void copy_from_fs(std::ifstream &ifs, std::streamoff offset, std::streamoff size, void *dst) { + ifs.clear(); + ifs.seekg(offset); + ifs.read(reinterpret_cast(dst), size); +} + +template +T read_from_fs(std::ifstream &ifs, std::streamoff offset) { + T t{}; + copy_from_fs(ifs, offset, sizeof(T), &t); + return t; +} + +simple_sim::load_elf_result_t simple_sim::load_elf32_little_endian(const std::string &fname) { try { std::ifstream fs(fname, std::ios::binary); fs.exceptions(std::ios::failbit); - Elf32_Ehdr ehdr; - fs.read(reinterpret_cast(&ehdr), sizeof(ehdr)); - CHECK(ehdr.e_machine == EM_RISCV && ehdr.e_type == ET_EXEC && - ehdr.e_ident[EI_CLASS] == ELFCLASS32, - "ehdr check failed when loading elf"); - CHECK_EQ(ehdr.e_phentsize, sizeof(elf32_phdr), - "ehdr.e_phentsize does not equal to elf32_phdr"); - - for (size_t i = 0; i < ehdr.e_phnum; i++) { - auto phdr_offset = ehdr.e_phoff + i * ehdr.e_phentsize; - Elf32_Phdr phdr; - fs.seekg((long)phdr_offset) - .read(reinterpret_cast(&phdr), sizeof(phdr)); - if (phdr.p_type == PT_LOAD) { - CHECK(phdr.p_paddr + phdr.p_filesz < mem_size, + auto ehdr = read_from_fs(fs, 0); + CHECK(std::memcmp(ehdr.e_ident, ELFMAG, SELFMAG) == 0, "elf magic not match"); + CHECK(ehdr.e_machine == EM_RISCV, "elf not in RISCV"); + CHECK(ehdr.e_type == ET_EXEC, "elf not executable"); + CHECK(ehdr.e_ident[EI_DATA] == ELFDATA2LSB, "elf not little endian"); + CHECK(ehdr.e_ident[EI_CLASS] == ELFCLASS32, "elf not in 32bit"); + + for (size_t i = 0; i < from_le(ehdr.e_phnum); i++) { + auto phdr_offset = from_le(ehdr.e_phoff) + i * from_le(ehdr.e_phentsize); + auto phdr = read_from_fs(fs, (std::streamoff) phdr_offset); + if (from_le(phdr.p_type) == PT_LOAD) { + auto paddr = from_le(phdr.p_paddr); + auto filesz = from_le(phdr.p_filesz); + + CHECK(paddr + filesz < mem_size, "phdr p_paddr + p_filesz check failed"); - fs.seekg((long)phdr.p_offset) - .read(reinterpret_cast(&mem[phdr.p_paddr]), phdr.p_filesz); - Log("LoadElfResult") - .with("segment", i) - .with("phdr_offset", fmt::format("{:08X}", phdr.p_offset)) - .with("paddr_range", fmt::format("{:08X}-{:08X}", phdr.p_paddr, - phdr.p_paddr + phdr.p_memsz)) - .trace(); + fs.clear(); + fs.seekg(from_le(phdr.p_offset)) + .read(reinterpret_cast(&mem[paddr]), filesz); + Log("LoadElf") + .with("segment", i) + .with("phdr_offset", fmt::format("{:08X}", 
phdr.p_offset)) + .with("paddr_range", fmt::format("{:08X}-{:08X}", phdr.p_paddr, + phdr.p_paddr + phdr.p_memsz)) + .warn(); } } + + // read section string section + auto shoff = from_le(ehdr.e_shoff); + auto shentsize = from_le(ehdr.e_shentsize); + auto shstrndx = from_le(ehdr.e_shstrndx); + auto section_string_shdr_offset = shoff + shstrndx * shentsize; + auto section_string_shdr = read_from_fs(fs, section_string_shdr_offset); + std::vector section_string_table(from_le(section_string_shdr.sh_size)); + copy_from_fs(fs, + from_le(section_string_shdr.sh_offset), + from_le(section_string_shdr.sh_size), + section_string_table.data()); + + // iterate over section headers to find the symbol string table + std::vector string_table; + for (int i = 0; i < from_le(ehdr.e_shnum); ++i) { + auto shdr = read_from_fs(fs, shoff + i * shentsize); + if (from_le(shdr.sh_type) == SHT_STRTAB && + std::string(§ion_string_table[from_le(shdr.sh_name)]) == ".strtab") { + Log("size").with("size", shdr.sh_size).warn(); + string_table.resize(from_le(shdr.sh_size)); + copy_from_fs(fs, from_le(shdr.sh_offset), from_le(shdr.sh_size), string_table.data()); + } + } + + if (string_table.empty()) { + Log("LoadElf").warn("failed to find .strtab"); + } else { + // iterate over section headers to find the symbol table + for (int i = 0; i < from_le(ehdr.e_shnum); ++i) { + auto shdr = read_from_fs(fs, shoff + i * shentsize); + if (from_le(shdr.sh_type) == SHT_SYMTAB && std::string(§ion_string_table[shdr.sh_name]) == ".symtab") { + auto entsize = from_le(shdr.sh_entsize); + unsigned int num_sym = from_le(shdr.sh_size) / entsize; + for (int j = 0; j < num_sym; ++j) { + auto offset = from_le(shdr.sh_offset) + j * entsize; + auto sym = read_from_fs(fs, (std::streamoff) offset); + + if (ELF32_ST_TYPE(from_le(sym.st_info)) == STT_FUNC) { // Only considering function symbols + // read the name from the string table + std::string name(&string_table.at(from_le(sym.st_name))); + function_symtab[from_le(sym.st_value)] = {.name = name, .info = from_le(sym.st_info)}; // Add to map + } + } + break; + } + } + } + return {.entry_addr = ehdr.e_entry}; - } catch (std::ios_base::failure &) { - throw std::system_error{errno, std::generic_category(), fname}; + } catch (std::ios_base::failure &f) { + Log("LoadElf") + .with("errno", errno) + .with("fname", fname) + .with("reason", f.what()) + .fatal(); } } diff --git a/ipemu/csrc/simple_sim.h b/ipemu/csrc/simple_sim.h index d9063b8c6..175350bae 100644 --- a/ipemu/csrc/simple_sim.h +++ b/ipemu/csrc/simple_sim.h @@ -15,6 +15,12 @@ class simple_sim : public simif_t { uartlite uart; reg_t uart_addr = 0x10000000; + struct function_sym { + std::string name; + uint8_t info; + }; + std::map function_symtab; + public: explicit simple_sim(size_t mem_size) : mem_size(mem_size) { mem = new char[mem_size]; @@ -25,7 +31,8 @@ class simple_sim : public simif_t { struct load_elf_result_t { uint32_t entry_addr; }; - load_elf_result_t load_elf(const std::string &fname); + + load_elf_result_t load_elf32_little_endian(const std::string &fname); // should return NULL for MMIO addresses char *addr_to_mem(reg_t addr) override { @@ -71,5 +78,12 @@ class simple_sim : public simif_t { // maybe nothing to do } - const char *get_symbol(uint64_t addr) override { FATAL("Unimplemented"); } + const char *get_symbol(uint64_t addr) override { + auto find = this->function_symtab.find(addr); + if (find != this->function_symtab.end()) { + return find->second.name.c_str(); + } else { + return nullptr; + } + } }; diff --git 
a/ipemu/csrc/spdlog_ext.cc b/ipemu/csrc/spdlog_ext.cc index 75822d237..21c91ac5c 100644 --- a/ipemu/csrc/spdlog_ext.cc +++ b/ipemu/csrc/spdlog_ext.cc @@ -60,6 +60,7 @@ ConsoleSink::ConsoleSink() { whitelist = get_set_from_env("EMULATOR_LOG_WHITELIST", ','); whitelist.insert("DPIInitCosim"); whitelist.insert("SpikeStep"); + whitelist.insert("FunctionCall"); whitelist.insert("SimulationExit"); whitelist.insert("DPIPeekIssue"); whitelist.insert("DPIPokeInst"); @@ -75,6 +76,10 @@ inline bool ConsoleSink::is_module_enabled(const std::string &module) { void ConsoleSink::sink_it_(const spdlog::details::log_msg &msg) { json payload = json::parse(msg.payload); + if (msg.level < this->level()) { + return; + } + // filter message matching the current level if (msg.level == this->level()) { if (!is_module_enabled(payload["_module"])) return; @@ -82,37 +87,45 @@ void ConsoleSink::sink_it_(const spdlog::details::log_msg &msg) { fmt::text_style level_color; switch (msg.level) { - case spdlog::level::debug: - case spdlog::level::trace: - level_color = fmt::fg(fmt::color::gray); - break; - case spdlog::level::info: - level_color = fmt::fg(fmt::color::white); - break; - case spdlog::level::warn: - level_color = fmt::fg(fmt::color::yellow); - break; - case spdlog::level::err: - level_color = fmt::fg(fmt::color::red); + case spdlog::level::debug: + case spdlog::level::trace: + level_color = fmt::fg(fmt::color::gray); + break; + case spdlog::level::info: + level_color = fmt::fg(fmt::color::white); + break; + case spdlog::level::warn: + level_color = fmt::fg(fmt::color::yellow); + break; + case spdlog::level::err: + level_color = fmt::fg(fmt::color::red); + break; + case spdlog::level::critical: + level_color = fmt::bg(fmt::color::red) | fmt::fg(fmt::color::white); + break; + default: + level_color = fmt::fg(fmt::color::white); break; - case spdlog::level::critical: - level_color = fmt::bg(fmt::color::red) | fmt::fg(fmt::color::white); - break; - default: - level_color = fmt::fg(fmt::color::white); - break; } std::cerr << fmt::format("{} {}", fmt::styled(payload["_cycle"].get(), level_color), fmt::styled(payload["_module"].get(), fmt::fg(fmt::color::violet)) - ); + ); if (payload.contains("_msg")) { std::cerr << fmt::format(" {}", fmt::styled(payload["_msg"].get(), fmt::fg(fmt::color::green))); } if (payload.contains("_with")) { std::cerr << fmt::format(" {}", fmt::styled(payload["_with"].dump(), fmt::fg(fmt::color::gray))); } + if (msg.level > spdlog::level::err) { + std::cerr << "\n"; + const auto frames = vbridge_impl_instance.frames; + for (auto frame = frames.rbegin(); frame != frames.rend(); frame++) { + std::cerr << fmt::format(fmt::fg(fmt::color::gray), " call by {}(...) 
at {:08X}\n", + frame->func_name, frame->func_addr); + } + } std::cerr << "\n"; } @@ -160,7 +173,7 @@ JsonLogger::JsonLogger(bool no_logging, bool no_file_logging, bool no_console_lo } } -JsonLogger::JsonLogger(): do_logging(false) { } +JsonLogger::JsonLogger() : do_logging(false) {} // We can only implement a class method with template inside the class // declaration diff --git a/ipemu/csrc/vbridge_impl.cc b/ipemu/csrc/vbridge_impl.cc index c4eeb78bd..93636925a 100644 --- a/ipemu/csrc/vbridge_impl.cc +++ b/ipemu/csrc/vbridge_impl.cc @@ -40,7 +40,7 @@ void VBridgeImpl::dpiInitCosim() { proc.get_state()->sstatus->write(proc.get_state()->sstatus->read() | SSTATUS_VS | SSTATUS_FS); - auto load_result = sim.load_elf(bin); + auto load_result = sim.load_elf32_little_endian(bin); proc.get_state()->pc = load_result.entry_addr; @@ -369,6 +369,7 @@ std::optional VBridgeImpl::spike_step() { clear_state(proc); + reg_t old_pc = state->pc; reg_t new_pc; if (event) { auto &se = event.value(); @@ -386,13 +387,39 @@ std::optional VBridgeImpl::spike_step() { new_pc = fetch.func(&proc, fetch.insn, state->pc); se.log_arch_changes(); } else { + auto disasm = proc.get_disassembler()->disassemble(fetch.insn); Log("SpikeStep") .with("pc", fmt::format("{:08X}", state->pc)) .with("bits", fmt::format("{:08X}", fetch.insn.bits())) - .with("disasm", proc.get_disassembler()->disassemble(fetch.insn)) + .with("disasm", disasm) .with("spike_cycles", spike_cycles) .info("spike run scalar insn"); new_pc = fetch.func(&proc, fetch.insn, state->pc); + + if (disasm == "ret" && new_pc == frames.back().return_addr) { + Log("FunctionCall") + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .info("return"); + frames.pop_back(); + } + } + + if (new_pc - state->pc != 2 && new_pc - state->pc != 4) { + auto sym_find = sim.get_symbol(new_pc); + if (sym_find != nullptr) { + Log("FunctionCall") + .with("func_name", sym_find) + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .info("call"); + reg_t return_addr = state->XPR[1]; + frames.emplace_back(CallFrame{sym_find, new_pc, return_addr, spike_cycles}); + } } // Bypass CSR insns commitlog stuff. 
diff --git a/ipemu/csrc/vbridge_impl.h b/ipemu/csrc/vbridge_impl.h index d560e8fe0..64e065204 100644 --- a/ipemu/csrc/vbridge_impl.h +++ b/ipemu/csrc/vbridge_impl.h @@ -265,6 +265,15 @@ class VBridgeImpl { int64_t spike_cycles = 0; + struct CallFrame { + std::string func_name; + reg_t func_addr; + reg_t return_addr; + int64_t spike_cycle; + }; + std::vector frames; + friend class ConsoleSink; + std::optional create_spike_event(insn_fetch_t fetch); std::optional spike_step(); From f5fdef9eb01794a2ea524b90a5b0a71edd22f0f2 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Mon, 6 May 2024 19:54:22 +0800 Subject: [PATCH 08/10] [cosim] fix handling of elf symbols --- ipemu/csrc/vbridge_impl.cc | 2 +- tests/asm/mmm/mmm.asm | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ipemu/csrc/vbridge_impl.cc b/ipemu/csrc/vbridge_impl.cc index 93636925a..6991f3496 100644 --- a/ipemu/csrc/vbridge_impl.cc +++ b/ipemu/csrc/vbridge_impl.cc @@ -396,7 +396,7 @@ std::optional VBridgeImpl::spike_step() { .info("spike run scalar insn"); new_pc = fetch.func(&proc, fetch.insn, state->pc); - if (disasm == "ret" && new_pc == frames.back().return_addr) { + if (disasm == "ret" && !frames.empty() && new_pc == frames.back().return_addr) { Log("FunctionCall") .with("old_pc", fmt::format("{:08X}", old_pc)) .with("new_pc", fmt::format("{:08X}", new_pc)) diff --git a/tests/asm/mmm/mmm.asm b/tests/asm/mmm/mmm.asm index 1343fe4ce..9fc80504a 100644 --- a/tests/asm/mmm/mmm.asm +++ b/tests/asm/mmm/mmm.asm @@ -7,6 +7,7 @@ .text .balign 16 .globl test +.type test, @function # assume VLEN >= 512, BN = 256, SEW = 16 * 2 = 32 # we only support LMUL = 1 for now # P, A, B, AB should have 32 elements From 22fe716d5d90f763f74c8d5a6f89d1231f62217b Mon Sep 17 00:00:00 2001 From: SharzyL Date: Tue, 7 May 2024 00:14:33 +0800 Subject: [PATCH 09/10] [cosim] less verbose logging --- ipemu/csrc/elf.cc | 3 +-- ipemu/csrc/spdlog_ext.cc | 15 +++++++++------ ipemu/csrc/vbridge_impl.cc | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ipemu/csrc/elf.cc b/ipemu/csrc/elf.cc index c6a1e3f40..8414adb25 100644 --- a/ipemu/csrc/elf.cc +++ b/ipemu/csrc/elf.cc @@ -64,7 +64,7 @@ simple_sim::load_elf_result_t simple_sim::load_elf32_little_endian(const std::st .with("phdr_offset", fmt::format("{:08X}", phdr.p_offset)) .with("paddr_range", fmt::format("{:08X}-{:08X}", phdr.p_paddr, phdr.p_paddr + phdr.p_memsz)) - .warn(); + .info(); } } @@ -86,7 +86,6 @@ simple_sim::load_elf_result_t simple_sim::load_elf32_little_endian(const std::st auto shdr = read_from_fs(fs, shoff + i * shentsize); if (from_le(shdr.sh_type) == SHT_STRTAB && std::string(§ion_string_table[from_le(shdr.sh_name)]) == ".strtab") { - Log("size").with("size", shdr.sh_size).warn(); string_table.resize(from_le(shdr.sh_size)); copy_from_fs(fs, from_le(shdr.sh_offset), from_le(shdr.sh_size), string_table.data()); } diff --git a/ipemu/csrc/spdlog_ext.cc b/ipemu/csrc/spdlog_ext.cc index 21c91ac5c..9259578cb 100644 --- a/ipemu/csrc/spdlog_ext.cc +++ b/ipemu/csrc/spdlog_ext.cc @@ -58,12 +58,15 @@ static std::set get_set_from_env(const char *env_name, const char d ConsoleSink::ConsoleSink() { whitelist = get_set_from_env("EMULATOR_LOG_WHITELIST", ','); - whitelist.insert("DPIInitCosim"); - whitelist.insert("SpikeStep"); - whitelist.insert("FunctionCall"); - whitelist.insert("SimulationExit"); - whitelist.insert("DPIPeekIssue"); - whitelist.insert("DPIPokeInst"); + if (whitelist.empty()) { + // default set of whitelist + whitelist.insert("DPIInitCosim"); + 
whitelist.insert("SpikeStep"); + whitelist.insert("FunctionCall"); + whitelist.insert("SimulationExit"); + whitelist.insert("DPIPeekIssue"); + whitelist.insert("DPIPokeInst"); + } // putting it in JsonLogger::JsonLogger will not work. not knowing why this->set_level(get_level_from_env("EMULATOR_CONSOLE_LOG_LEVEL", spdlog::level::info)); diff --git a/ipemu/csrc/vbridge_impl.cc b/ipemu/csrc/vbridge_impl.cc index 6991f3496..7e7274d39 100644 --- a/ipemu/csrc/vbridge_impl.cc +++ b/ipemu/csrc/vbridge_impl.cc @@ -524,7 +524,7 @@ void VBridgeImpl::receive_tl_req(const VTlInterface &tl) { Log("ReceiveTLReq") .with("addr", fmt::format("{:08X}", addr)) .with("insn", se->jsonify_insn()) - .warn("send falsy data 0xDE for accessing unexpected memory"); + .info("send falsy data 0xDE for accessing unexpected memory"); actual_data[offset] = 0xDE; // falsy data } } From 5222689856a3605aa4492fbf64ce4114a4a15a4f Mon Sep 17 00:00:00 2001 From: SharzyL Date: Tue, 7 May 2024 00:15:19 +0800 Subject: [PATCH 10/10] [cosim] handle non-save-ra call for frame tracing --- ipemu/csrc/vbridge_impl.cc | 46 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/ipemu/csrc/vbridge_impl.cc b/ipemu/csrc/vbridge_impl.cc index 7e7274d39..b2763468e 100644 --- a/ipemu/csrc/vbridge_impl.cc +++ b/ipemu/csrc/vbridge_impl.cc @@ -396,28 +396,56 @@ std::optional VBridgeImpl::spike_step() { .info("spike run scalar insn"); new_pc = fetch.func(&proc, fetch.insn, state->pc); - if (disasm == "ret" && !frames.empty() && new_pc == frames.back().return_addr) { - Log("FunctionCall") - .with("old_pc", fmt::format("{:08X}", old_pc)) - .with("new_pc", fmt::format("{:08X}", new_pc)) - .with("spike_cycles", spike_cycles) - .with("depth", frames.size()) - .info("return"); - frames.pop_back(); + if (disasm == "ret") { + // When a function call is at the end of some parent function, the compiler may omit the save-ra process + // In this case we need to pop more than one frames when the child function returns + // Here we traverse the frames from top to bottom, until find a frame of the corresponding return_address + int layers_to_pop = 1; + for (; layers_to_pop <= frames.size(); layers_to_pop++) { + const auto &frame = frames[frames.size() - layers_to_pop]; + if (frame.return_addr == new_pc) { + Log("FunctionCall") + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .with("depth after return", frames.size() - layers_to_pop) + .info("return"); + break; + } + } + + if (layers_to_pop > frames.size()) { + // sometimes `ret` is used in inner-function jumping, in this case we cannot find corresponding frame + Log("FunctionCall") + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .warn("cannot find the frame to return"); + } else for (int j = 0; j < layers_to_pop; j++) { + frames.pop_back(); + } } } if (new_pc - state->pc != 2 && new_pc - state->pc != 4) { auto sym_find = sim.get_symbol(new_pc); if (sym_find != nullptr) { + reg_t return_addr = state->XPR[1]; + + // handle the case with omitted save-ra, in this case return_addr is set to null since it cannot be returned to + if (return_addr - old_pc != 2 && return_addr - old_pc != 4) { + return_addr = 0; + } Log("FunctionCall") .with("func_name", sym_find) .with("old_pc", fmt::format("{:08X}", old_pc)) .with("new_pc", 
fmt::format("{:08X}", new_pc)) + .with("return_addr", fmt::format("{:08X}", return_addr)) .with("spike_cycles", spike_cycles) .with("depth", frames.size()) .info("call"); - reg_t return_addr = state->XPR[1]; frames.emplace_back(CallFrame{sym_find, new_pc, return_addr, spike_cycles}); } }
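
Taken together, the call-frame patches above (07-10) amount to a small per-instruction heuristic: a non-sequential pc change that lands on an STT_FUNC symbol pushes a frame whose return_addr comes from ra (or 0 when the jump did not save ra, i.e. a tail call), and a `ret` pops frames until it finds one whose return_addr equals the new pc. A condensed sketch of that logic follows; step_trace() and its signature are hypothetical, and this CallFrame drops the spike_cycle field the patch also records.

#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct CallFrame {
  std::string func_name;
  uint64_t func_addr;
  uint64_t return_addr;   // 0 when the caller omitted saving ra
};

// Hypothetical helper, invoked once per retired scalar instruction.
// symtab maps STT_FUNC entry addresses to names (built while loading the ELF).
void step_trace(std::vector<CallFrame> &frames,
                const std::map<uint64_t, std::string> &symtab,
                uint64_t old_pc, uint64_t new_pc, uint64_t ra, bool is_ret) {
  if (is_ret) {
    // Pop up to and including the frame whose saved ra matches new_pc;
    // omitted-ra (tail-call) frames let one `ret` unwind several levels at once.
    for (size_t depth = 1; depth <= frames.size(); ++depth) {
      if (frames[frames.size() - depth].return_addr == new_pc) {
        frames.resize(frames.size() - depth);
        return;
      }
    }
    return;   // `ret` used as an intra-function jump: leave the stack alone
  }
  if (new_pc - old_pc != 2 && new_pc - old_pc != 4) {  // non-sequential: a jump
    auto it = symtab.find(new_pc);
    if (it == symtab.end())
      return;                                // target is not a known function entry
    if (ra - old_pc != 2 && ra - old_pc != 4)
      ra = 0;                                // jump did not save ra (tail call)
    frames.push_back({it->second, new_pc, ra});
  }
}

In the emulator the same frames vector is also walked back-to-front by ConsoleSink, so any message above err level is followed by a "call by ..." backtrace.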