diff --git a/README.md b/README.md index 3f3feeea3..a1c268906 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,7 @@ The `tests/` contains the testcases. There are four types of testcases: - intrinsic - mlir - codegen +- perf To add new testcases for asm/intrinsic/mlir, create a new directory with `default.nix` and source files. Refer to the existing code for more information on how to write the nix file. diff --git a/ipemu/csrc/elf.cc b/ipemu/csrc/elf.cc index 223783cf0..8414adb25 100644 --- a/ipemu/csrc/elf.cc +++ b/ipemu/csrc/elf.cc @@ -2,40 +2,125 @@ #include #include +#include -simple_sim::load_elf_result_t simple_sim::load_elf(const std::string &fname) { +// convert little-endian integral type to host-endian +template +T from_le(T value) { + static_assert(std::is_integral::value, "T must be an integral type"); + + if constexpr (sizeof(T) == 1) { + return value; + } else if constexpr (sizeof(T) == 2) { + return le16toh(value); + } else if constexpr (sizeof(T) == 4) { + return le32toh(value); + } else if constexpr (sizeof(T) == 8) { + return le64toh(value); + } else { + static_assert(sizeof(T) <= 8, "Unsupported type size"); + } +} + +void copy_from_fs(std::ifstream &ifs, std::streamoff offset, std::streamoff size, void *dst) { + ifs.clear(); + ifs.seekg(offset); + ifs.read(reinterpret_cast(dst), size); +} + +template +T read_from_fs(std::ifstream &ifs, std::streamoff offset) { + T t{}; + copy_from_fs(ifs, offset, sizeof(T), &t); + return t; +} + +simple_sim::load_elf_result_t simple_sim::load_elf32_little_endian(const std::string &fname) { try { std::ifstream fs(fname, std::ios::binary); fs.exceptions(std::ios::failbit); - Elf32_Ehdr ehdr; - fs.read(reinterpret_cast(&ehdr), sizeof(ehdr)); - CHECK(ehdr.e_machine == EM_RISCV && ehdr.e_type == ET_EXEC && - ehdr.e_ident[EI_CLASS] == ELFCLASS32, - "ehdr check failed when loading elf"); - CHECK_EQ(ehdr.e_phentsize, sizeof(elf32_phdr), - "ehdr.e_phentsize does not equal to elf32_phdr"); - - for (size_t i = 0; i < ehdr.e_phnum; i++) { - auto phdr_offset = ehdr.e_phoff + i * ehdr.e_phentsize; - Elf32_Phdr phdr; - fs.seekg((long)phdr_offset) - .read(reinterpret_cast(&phdr), sizeof(phdr)); - if (phdr.p_type == PT_LOAD) { - CHECK(phdr.p_paddr + phdr.p_filesz < mem_size, + auto ehdr = read_from_fs(fs, 0); + CHECK(std::memcmp(ehdr.e_ident, ELFMAG, SELFMAG) == 0, "elf magic not match"); + CHECK(ehdr.e_machine == EM_RISCV, "elf not in RISCV"); + CHECK(ehdr.e_type == ET_EXEC, "elf not executable"); + CHECK(ehdr.e_ident[EI_DATA] == ELFDATA2LSB, "elf not little endian"); + CHECK(ehdr.e_ident[EI_CLASS] == ELFCLASS32, "elf not in 32bit"); + + for (size_t i = 0; i < from_le(ehdr.e_phnum); i++) { + auto phdr_offset = from_le(ehdr.e_phoff) + i * from_le(ehdr.e_phentsize); + auto phdr = read_from_fs(fs, (std::streamoff) phdr_offset); + if (from_le(phdr.p_type) == PT_LOAD) { + auto paddr = from_le(phdr.p_paddr); + auto filesz = from_le(phdr.p_filesz); + + CHECK(paddr + filesz < mem_size, "phdr p_paddr + p_filesz check failed"); - fs.seekg((long)phdr.p_offset) - .read(reinterpret_cast(&mem[phdr.p_paddr]), phdr.p_filesz); - Log("LoadElfResult") - .with("segment", i) - .with("phdr_offset", fmt::format("{:08X}", phdr.p_offset)) - .with("paddr_range", fmt::format("{:08X}-{:08X}", phdr.p_paddr, - phdr.p_paddr + phdr.p_memsz)) - .trace(); + fs.clear(); + fs.seekg(from_le(phdr.p_offset)) + .read(reinterpret_cast(&mem[paddr]), filesz); + Log("LoadElf") + .with("segment", i) + .with("phdr_offset", fmt::format("{:08X}", phdr.p_offset)) + .with("paddr_range", fmt::format("{:08X}-{:08X}", phdr.p_paddr, + phdr.p_paddr + phdr.p_memsz)) + .info(); } } + + // read section string section + auto shoff = from_le(ehdr.e_shoff); + auto shentsize = from_le(ehdr.e_shentsize); + auto shstrndx = from_le(ehdr.e_shstrndx); + auto section_string_shdr_offset = shoff + shstrndx * shentsize; + auto section_string_shdr = read_from_fs(fs, section_string_shdr_offset); + std::vector section_string_table(from_le(section_string_shdr.sh_size)); + copy_from_fs(fs, + from_le(section_string_shdr.sh_offset), + from_le(section_string_shdr.sh_size), + section_string_table.data()); + + // iterate over section headers to find the symbol string table + std::vector string_table; + for (int i = 0; i < from_le(ehdr.e_shnum); ++i) { + auto shdr = read_from_fs(fs, shoff + i * shentsize); + if (from_le(shdr.sh_type) == SHT_STRTAB && + std::string(§ion_string_table[from_le(shdr.sh_name)]) == ".strtab") { + string_table.resize(from_le(shdr.sh_size)); + copy_from_fs(fs, from_le(shdr.sh_offset), from_le(shdr.sh_size), string_table.data()); + } + } + + if (string_table.empty()) { + Log("LoadElf").warn("failed to find .strtab"); + } else { + // iterate over section headers to find the symbol table + for (int i = 0; i < from_le(ehdr.e_shnum); ++i) { + auto shdr = read_from_fs(fs, shoff + i * shentsize); + if (from_le(shdr.sh_type) == SHT_SYMTAB && std::string(§ion_string_table[shdr.sh_name]) == ".symtab") { + auto entsize = from_le(shdr.sh_entsize); + unsigned int num_sym = from_le(shdr.sh_size) / entsize; + for (int j = 0; j < num_sym; ++j) { + auto offset = from_le(shdr.sh_offset) + j * entsize; + auto sym = read_from_fs(fs, (std::streamoff) offset); + + if (ELF32_ST_TYPE(from_le(sym.st_info)) == STT_FUNC) { // Only considering function symbols + // read the name from the string table + std::string name(&string_table.at(from_le(sym.st_name))); + function_symtab[from_le(sym.st_value)] = {.name = name, .info = from_le(sym.st_info)}; // Add to map + } + } + break; + } + } + } + return {.entry_addr = ehdr.e_entry}; - } catch (std::ios_base::failure &) { - throw std::system_error{errno, std::generic_category(), fname}; + } catch (std::ios_base::failure &f) { + Log("LoadElf") + .with("errno", errno) + .with("fname", fname) + .with("reason", f.what()) + .fatal(); } } diff --git a/ipemu/csrc/simple_sim.h b/ipemu/csrc/simple_sim.h index d9063b8c6..175350bae 100644 --- a/ipemu/csrc/simple_sim.h +++ b/ipemu/csrc/simple_sim.h @@ -15,6 +15,12 @@ class simple_sim : public simif_t { uartlite uart; reg_t uart_addr = 0x10000000; + struct function_sym { + std::string name; + uint8_t info; + }; + std::map function_symtab; + public: explicit simple_sim(size_t mem_size) : mem_size(mem_size) { mem = new char[mem_size]; @@ -25,7 +31,8 @@ class simple_sim : public simif_t { struct load_elf_result_t { uint32_t entry_addr; }; - load_elf_result_t load_elf(const std::string &fname); + + load_elf_result_t load_elf32_little_endian(const std::string &fname); // should return NULL for MMIO addresses char *addr_to_mem(reg_t addr) override { @@ -71,5 +78,12 @@ class simple_sim : public simif_t { // maybe nothing to do } - const char *get_symbol(uint64_t addr) override { FATAL("Unimplemented"); } + const char *get_symbol(uint64_t addr) override { + auto find = this->function_symtab.find(addr); + if (find != this->function_symtab.end()) { + return find->second.name.c_str(); + } else { + return nullptr; + } + } }; diff --git a/ipemu/csrc/spdlog_ext.cc b/ipemu/csrc/spdlog_ext.cc index 75822d237..9259578cb 100644 --- a/ipemu/csrc/spdlog_ext.cc +++ b/ipemu/csrc/spdlog_ext.cc @@ -58,11 +58,15 @@ static std::set get_set_from_env(const char *env_name, const char d ConsoleSink::ConsoleSink() { whitelist = get_set_from_env("EMULATOR_LOG_WHITELIST", ','); - whitelist.insert("DPIInitCosim"); - whitelist.insert("SpikeStep"); - whitelist.insert("SimulationExit"); - whitelist.insert("DPIPeekIssue"); - whitelist.insert("DPIPokeInst"); + if (whitelist.empty()) { + // default set of whitelist + whitelist.insert("DPIInitCosim"); + whitelist.insert("SpikeStep"); + whitelist.insert("FunctionCall"); + whitelist.insert("SimulationExit"); + whitelist.insert("DPIPeekIssue"); + whitelist.insert("DPIPokeInst"); + } // putting it in JsonLogger::JsonLogger will not work. not knowing why this->set_level(get_level_from_env("EMULATOR_CONSOLE_LOG_LEVEL", spdlog::level::info)); @@ -75,6 +79,10 @@ inline bool ConsoleSink::is_module_enabled(const std::string &module) { void ConsoleSink::sink_it_(const spdlog::details::log_msg &msg) { json payload = json::parse(msg.payload); + if (msg.level < this->level()) { + return; + } + // filter message matching the current level if (msg.level == this->level()) { if (!is_module_enabled(payload["_module"])) return; @@ -82,37 +90,45 @@ void ConsoleSink::sink_it_(const spdlog::details::log_msg &msg) { fmt::text_style level_color; switch (msg.level) { - case spdlog::level::debug: - case spdlog::level::trace: - level_color = fmt::fg(fmt::color::gray); - break; - case spdlog::level::info: - level_color = fmt::fg(fmt::color::white); - break; - case spdlog::level::warn: - level_color = fmt::fg(fmt::color::yellow); - break; - case spdlog::level::err: - level_color = fmt::fg(fmt::color::red); + case spdlog::level::debug: + case spdlog::level::trace: + level_color = fmt::fg(fmt::color::gray); + break; + case spdlog::level::info: + level_color = fmt::fg(fmt::color::white); + break; + case spdlog::level::warn: + level_color = fmt::fg(fmt::color::yellow); + break; + case spdlog::level::err: + level_color = fmt::fg(fmt::color::red); + break; + case spdlog::level::critical: + level_color = fmt::bg(fmt::color::red) | fmt::fg(fmt::color::white); + break; + default: + level_color = fmt::fg(fmt::color::white); break; - case spdlog::level::critical: - level_color = fmt::bg(fmt::color::red) | fmt::fg(fmt::color::white); - break; - default: - level_color = fmt::fg(fmt::color::white); - break; } std::cerr << fmt::format("{} {}", fmt::styled(payload["_cycle"].get(), level_color), fmt::styled(payload["_module"].get(), fmt::fg(fmt::color::violet)) - ); + ); if (payload.contains("_msg")) { std::cerr << fmt::format(" {}", fmt::styled(payload["_msg"].get(), fmt::fg(fmt::color::green))); } if (payload.contains("_with")) { std::cerr << fmt::format(" {}", fmt::styled(payload["_with"].dump(), fmt::fg(fmt::color::gray))); } + if (msg.level > spdlog::level::err) { + std::cerr << "\n"; + const auto frames = vbridge_impl_instance.frames; + for (auto frame = frames.rbegin(); frame != frames.rend(); frame++) { + std::cerr << fmt::format(fmt::fg(fmt::color::gray), " call by {}(...) at {:08X}\n", + frame->func_name, frame->func_addr); + } + } std::cerr << "\n"; } @@ -160,7 +176,7 @@ JsonLogger::JsonLogger(bool no_logging, bool no_file_logging, bool no_console_lo } } -JsonLogger::JsonLogger(): do_logging(false) { } +JsonLogger::JsonLogger() : do_logging(false) {} // We can only implement a class method with template inside the class // declaration diff --git a/ipemu/csrc/vbridge_impl.cc b/ipemu/csrc/vbridge_impl.cc index c4eeb78bd..b2763468e 100644 --- a/ipemu/csrc/vbridge_impl.cc +++ b/ipemu/csrc/vbridge_impl.cc @@ -40,7 +40,7 @@ void VBridgeImpl::dpiInitCosim() { proc.get_state()->sstatus->write(proc.get_state()->sstatus->read() | SSTATUS_VS | SSTATUS_FS); - auto load_result = sim.load_elf(bin); + auto load_result = sim.load_elf32_little_endian(bin); proc.get_state()->pc = load_result.entry_addr; @@ -369,6 +369,7 @@ std::optional VBridgeImpl::spike_step() { clear_state(proc); + reg_t old_pc = state->pc; reg_t new_pc; if (event) { auto &se = event.value(); @@ -386,13 +387,67 @@ std::optional VBridgeImpl::spike_step() { new_pc = fetch.func(&proc, fetch.insn, state->pc); se.log_arch_changes(); } else { + auto disasm = proc.get_disassembler()->disassemble(fetch.insn); Log("SpikeStep") .with("pc", fmt::format("{:08X}", state->pc)) .with("bits", fmt::format("{:08X}", fetch.insn.bits())) - .with("disasm", proc.get_disassembler()->disassemble(fetch.insn)) + .with("disasm", disasm) .with("spike_cycles", spike_cycles) .info("spike run scalar insn"); new_pc = fetch.func(&proc, fetch.insn, state->pc); + + if (disasm == "ret") { + // When a function call is at the end of some parent function, the compiler may omit the save-ra process + // In this case we need to pop more than one frames when the child function returns + // Here we traverse the frames from top to bottom, until find a frame of the corresponding return_address + int layers_to_pop = 1; + for (; layers_to_pop <= frames.size(); layers_to_pop++) { + const auto &frame = frames[frames.size() - layers_to_pop]; + if (frame.return_addr == new_pc) { + Log("FunctionCall") + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .with("depth after return", frames.size() - layers_to_pop) + .info("return"); + break; + } + } + + if (layers_to_pop > frames.size()) { + // sometimes `ret` is used in inner-function jumping, in this case we cannot find corresponding frame + Log("FunctionCall") + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .warn("cannot find the frame to return"); + } else for (int j = 0; j < layers_to_pop; j++) { + frames.pop_back(); + } + } + } + + if (new_pc - state->pc != 2 && new_pc - state->pc != 4) { + auto sym_find = sim.get_symbol(new_pc); + if (sym_find != nullptr) { + reg_t return_addr = state->XPR[1]; + + // handle the case with omitted save-ra, in this case return_addr is set to null since it cannot be returned to + if (return_addr - old_pc != 2 && return_addr - old_pc != 4) { + return_addr = 0; + } + Log("FunctionCall") + .with("func_name", sym_find) + .with("old_pc", fmt::format("{:08X}", old_pc)) + .with("new_pc", fmt::format("{:08X}", new_pc)) + .with("return_addr", fmt::format("{:08X}", return_addr)) + .with("spike_cycles", spike_cycles) + .with("depth", frames.size()) + .info("call"); + frames.emplace_back(CallFrame{sym_find, new_pc, return_addr, spike_cycles}); + } } // Bypass CSR insns commitlog stuff. @@ -497,7 +552,7 @@ void VBridgeImpl::receive_tl_req(const VTlInterface &tl) { Log("ReceiveTLReq") .with("addr", fmt::format("{:08X}", addr)) .with("insn", se->jsonify_insn()) - .warn("send falsy data 0xDE for accessing unexpected memory"); + .info("send falsy data 0xDE for accessing unexpected memory"); actual_data[offset] = 0xDE; // falsy data } } diff --git a/ipemu/csrc/vbridge_impl.h b/ipemu/csrc/vbridge_impl.h index d560e8fe0..64e065204 100644 --- a/ipemu/csrc/vbridge_impl.h +++ b/ipemu/csrc/vbridge_impl.h @@ -265,6 +265,15 @@ class VBridgeImpl { int64_t spike_cycles = 0; + struct CallFrame { + std::string func_name; + reg_t func_addr; + reg_t return_addr; + int64_t spike_cycle; + }; + std::vector frames; + friend class ConsoleSink; + std::optional create_spike_event(insn_fetch_t fetch); std::optional spike_step(); diff --git a/nix/overlay.nix b/nix/overlay.nix index aab82d1ea..bdaf9dff4 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -42,14 +42,18 @@ in let major = final.lib.versions.major rv32_buildPkgs.${llvmForRVV_attrName}.release_version; + # By default, compiler-rt and newlib for rv32 are built with double float point abi by default. + # We need to override it with `-mabi=ilp32f` + # compiler-rt requires the compilation flag -fforce-enable-int128, only clang provides that - compilerrt = rv32_pkgs.${llvmForRVV_attrName}.compiler-rt.override { + compilerrt = (rv32_pkgs.${llvmForRVV_attrName}.compiler-rt.override { stdenv = rv32_pkgs.overrideCC rv32_pkgs.stdenv rv32_buildPkgs.${llvmForRVV_attrName}.clangNoCompilerRt; - }; + }).overrideAttrs (oldAttrs: { + env.NIX_CFLAGS_COMPILE = "-march=rv32gcv -mabi=ilp32f"; + }); - # newlib is built with double float point abi by default, override it newlib = rv32_pkgs.stdenv.cc.libc.overrideAttrs (oldAttrs: { CFLAGS_FOR_TARGET = "-march=rv32gcv -mabi=ilp32f"; }); diff --git a/nix/t1/testcases/make-intrinsic-case.nix b/nix/t1/testcases/make-intrinsic-case.nix index 8ea25d065..099ee955b 100644 --- a/nix/t1/testcases/make-intrinsic-case.nix +++ b/nix/t1/testcases/make-intrinsic-case.nix @@ -2,10 +2,12 @@ { caseName, xLen ? 32, vLen ? 1024, fp ? false, ... }@inputs: -stdenv.mkDerivation (rec { - name = "intrinsic.${caseName}"; +stdenv.mkDerivation (self: rec { + casePrefix = "intrinsic"; + name = "${self.casePrefix}.${caseName}"; unpackPhase = '' + runHook preUnpack if [ -z "''${srcs:-}" ]; then if [ -z "''${src:-}" ]; then echo 'variable $src or $srcs should point to the source' @@ -13,6 +15,7 @@ stdenv.mkDerivation (rec { fi srcs="$src" fi + runHook postUnpack ''; NIX_CFLAGS_COMPILE = [ @@ -51,7 +54,7 @@ stdenv.mkDerivation (rec { jq --null-input \ --arg name ${caseName} \ - --arg type intrinsic \ + --arg type ${self.casePrefix} \ --argjson xLen ${toString xLen} \ --argjson vLen ${toString vLen} \ --argjson fp ${lib.boolToString fp} \ @@ -62,5 +65,7 @@ stdenv.mkDerivation (rec { runHook postInstall ''; + dontFixup = true; + meta.description = "Test case '${caseName}', written in C intrinsic."; } // inputs) diff --git a/script/src/Main.scala b/script/src/Main.scala index 2412f7a70..d31ab9b1a 100644 --- a/script/src/Main.scala +++ b/script/src/Main.scala @@ -177,7 +177,7 @@ object Main: ) noLog: Flag = Flag(false), @arg( name = "no-file-logging", - doc = "prevent emulator print log to console" + doc = "prevent emulator print log to file" ) noFileLog: Flag = Flag(true), @arg( name = "no-console-logging", diff --git a/tests/asm/mmm/mmm.asm b/tests/asm/mmm/mmm.asm index 1343fe4ce..9fc80504a 100644 --- a/tests/asm/mmm/mmm.asm +++ b/tests/asm/mmm/mmm.asm @@ -7,6 +7,7 @@ .text .balign 16 .globl test +.type test, @function # assume VLEN >= 512, BN = 256, SEW = 16 * 2 = 32 # we only support LMUL = 1 for now # P, A, B, AB should have 32 elements diff --git a/tests/default.nix b/tests/default.nix index 8433ec938..e7cd355e7 100644 --- a/tests/default.nix +++ b/tests/default.nix @@ -54,6 +54,7 @@ let mlir = searchAndCallPackage ./mlir; intrinsic = searchAndCallPackage ./intrinsic; asm = searchAndCallPackage ./asm; + perf = searchAndCallPackage ./perf; # nix build .#t1.cases.codegen.vaadd-vv -L # codegen case are using xLen=32,vLen=1024 by default diff --git a/tests/emurt/default.nix b/tests/emurt/default.nix index cbbab1d36..54f523271 100644 --- a/tests/emurt/default.nix +++ b/tests/emurt/default.nix @@ -1,4 +1,4 @@ -{ stdenv, bintools }: +{ lib, stdenv, bintools }: stdenv.mkDerivation { name = "emurt"; @@ -6,9 +6,24 @@ stdenv.mkDerivation { NIX_CFLAGS_COMPILE = "-mabi=ilp32f -march=rv32gcv -fno-PIC"; - buildCommand = '' - mkdir -p $out/lib - ${stdenv.targetPlatform.config}-cc ${./emurt.c} -c -o emurt.o + src = with lib.fileset; toSource { + root = ./.; + fileset = fileFilter (file: file.name != "default.nix") ./.; + }; + + buildPhase = '' + runHook preBuild + ${stdenv.targetPlatform.config}-cc emurt.c -c -o emurt.o + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + mkdir -p $out/{lib,include} + + cp *.h $out/include/ ${stdenv.targetPlatform.config}-ar rcs $out/lib/libemurt.a emurt.o + + runHook postInstall ''; } diff --git a/tests/emurt/emurt.c b/tests/emurt/emurt.c index 8fd5ac84b..f52a57d36 100644 --- a/tests/emurt/emurt.c +++ b/tests/emurt/emurt.c @@ -1,33 +1,74 @@ -#include -#include +#include "emurt.h" -struct uartlite_regs { - volatile unsigned int rx_fifo; - volatile unsigned int tx_fifo; - volatile unsigned int status; - volatile unsigned int control; -}; +/////////////////////// +// uart +/////////////////////// struct uartlite_regs *const ttyUL0 = (struct uartlite_regs *)0x10000000; -void uart_put_c(const char c) { - while (ttyUL0->status & (1<<3) /* transmit FIFO full */); - ttyUL0->tx_fifo = c; +#define SR_TX_FIFO_FULL (1<<3) /* transmit FIFO full */ +#define SR_TX_FIFO_EMPTY (1<<2) /* transmit FIFO empty */ +#define SR_RX_FIFO_VALID_DATA (1<<0) /* data in receive FIFO */ +#define SR_RX_FIFO_FULL (1<<1) /* receive FIFO full */ + +#define ULITE_CONTROL_RST_TX 0x01 +#define ULITE_CONTROL_RST_RX 0x02 + +void uart_put_c(char c) { + while (ttyUL0->status & SR_TX_FIFO_FULL); + ttyUL0->tx_fifo = c; +} + +char uart_check_read() { // 1: data ready, 0: no data + return (ttyUL0->status & SR_RX_FIFO_VALID_DATA) != 0; +} + +char uart_get_c() { + return ttyUL0->rx_fifo; +} + +void get(char *s, int n) { + char debug[20] = "Enter get()"; + print_s(debug); + int i = 0; + while (uart_check_read() != 1); + while (uart_check_read() && i < n - 1) { + print_s(debug); + char c = uart_get_c(); + uart_put_c(c); + if (c == '\r' || c == '\n') { + break; // Break if carriage return or newline is encountered + } + *(s + i) = c; + i++; + } + s[i] = '\0'; } -void uart_put_s(const char *s) { - while (*s) { - uart_put_c(*s++); +int _write(int file, char* ptr, int len) { + int i = 0; + for (; i < len; i++) { + uart_put_c(ptr[i]); } + return i; } -void uart_put_hex_d(uint64_t x) { - for (int i=15;i>=0;i--) { - uint64_t res = (x >> (i * 4)) & 0xf; - uart_put_c(res >= 10 ? 'a' + res - 10 : '0' + res); +void _exit(int code) { + __asm__("csrwi 0x7cc, 0"); + __builtin_unreachable(); +} + +void print_s(const char *c) { + while (*c) { + uart_put_c(*c); + c ++; } } +/////////////////////// +// allocation +/////////////////////// + extern char* __heapbegin; char *heap_top; @@ -42,6 +83,10 @@ char *_sbrk(int nbytes) { return base; } +/////////////////////// +// unimplemented +/////////////////////// + // We don't support FS int _isatty(int file) { return -1; @@ -62,11 +107,6 @@ int _fstat(int file, struct stat* st) { return -1; } -void _exit(int code) { - __asm__("csrwi 0x7cc, 0"); - __builtin_unreachable(); -} - // We don't support close int _close(int file) { return -1; @@ -82,10 +122,3 @@ int _read(int file, char* ptr, int len) { return -1; } -int _write(int file, char* ptr, int len) { - int i = 0; - for (; i < len; i++) { - uart_put_c(ptr[i]); - } - return i; -} diff --git a/tests/emurt/emurt.h b/tests/emurt/emurt.h new file mode 100644 index 000000000..b31c1b8f9 --- /dev/null +++ b/tests/emurt/emurt.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +struct uartlite_regs { + volatile unsigned int rx_fifo; + volatile unsigned int tx_fifo; + volatile unsigned int status; + volatile unsigned int control; +}; + +void get(char *s, int n); +void print_s(const char *c); + diff --git a/tests/intrinsic/uart.h b/tests/intrinsic/uart.h index 60fe5f5df..9ac052cd7 100644 --- a/tests/intrinsic/uart.h +++ b/tests/intrinsic/uart.h @@ -27,4 +27,4 @@ void uart_put_hex_d(uint64_t x) { uint64_t res = (x >> (i * 4)) & 0xf; uart_put_c(res >= 10 ? 'a' + res - 10 : '0' + res); } -} \ No newline at end of file +} diff --git a/tests/intrinsic/uarttest.c b/tests/intrinsic/uarttest.c index 8146924ed..493dfcb89 100644 --- a/tests/intrinsic/uarttest.c +++ b/tests/intrinsic/uarttest.c @@ -1,8 +1,8 @@ -#include "uart.h" +#include void test() { uart_put_s("Test Begin from UART!\n"); uart_put_hex_d(0xdeadbeef); uart_put_c('\n'); uart_put_s("Test End from UART!\n"); -} \ No newline at end of file +} diff --git a/tests/perf/llama/default.nix b/tests/perf/llama/default.nix new file mode 100644 index 000000000..f7f8ed533 --- /dev/null +++ b/tests/perf/llama/default.nix @@ -0,0 +1,47 @@ +{ lib +, emurt +, fetchurl +, _caseBuilders +}: + +let + checkpoint_bin = fetchurl { + url = "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin"; + sha256 = "sha256-zVkGRNljhnorbloRB/UfrWY8QdecFJ++y7sflfqB9Jo="; + }; + + tokenizer_bin = fetchurl { + url = "https://github.com/karpathy/llama2.c/raw/b3c4b6c3c4bbff42e5211293280307019368ccb5/tokenizer.bin"; + sha256 = "sha256-UKUu+CLunoPeXOnQvgoCWnc9AZQ39Ytf+dyvsGPs42E="; + }; +in + +_caseBuilders.mkIntrinsicCase { + casePrefix = "perf"; + caseName = "llama"; + + buildInputs = [ emurt ]; + + src = with lib.fileset; toSource { + root = ./.; + fileset = fileFilter (file: file.name != "default.nix") ./.; + }; + + unpackPhase = '' + cp $src -rT . + chmod -R +w . + ''; + + postPatch = '' + substituteInPlace extern_data.S \ + --replace-fail '{{checkpoint_bin}}' ${checkpoint_bin} \ + --replace-fail '{{tokenizer_bin}}' ${tokenizer_bin} + ''; + + srcs = [ + "run.c" + "trap.c" + "extern_data.S" + ../../t1_main.S + ]; +} diff --git a/tests/perf/llama/extern_data.S b/tests/perf/llama/extern_data.S new file mode 100644 index 000000000..72dfa7ef1 --- /dev/null +++ b/tests/perf/llama/extern_data.S @@ -0,0 +1,25 @@ + .section .rodata + .global checkpoint_data + .type checkpoint_data, @object + .align 4 +checkpoint_data: + .incbin "{{checkpoint_bin}}" # will be replaced on nix build +checkpoint_end: + .global checkpoint_size + .type checkpoint_size, @object + .align 4 +checkpoint_size: + .int checkpoint_end - checkpoint_data + + .section .rodata + .global tokenizer_data + .type tokenizer_data, @object + .align 4 +tokenizer_data: + .incbin "{{tokenizer_bin}}" # will be replaced on nix build +tokenizer_end: + .global tokenizer_size + .type tokenizer_size, @object + .align 4 +tokenizer_size: + .int tokenizer_end - tokenizer_data diff --git a/tests/perf/llama/extern_data.h b/tests/perf/llama/extern_data.h new file mode 100644 index 000000000..472de5b2b --- /dev/null +++ b/tests/perf/llama/extern_data.h @@ -0,0 +1,5 @@ +extern int checkpoint_size; +extern float checkpoint_data[]; + +extern int tokenizer_size; +extern float tokenizer_data[]; diff --git a/tests/perf/llama/run.c b/tests/perf/llama/run.c new file mode 100644 index 000000000..d0926a090 --- /dev/null +++ b/tests/perf/llama/run.c @@ -0,0 +1,936 @@ +/* Inference for Llama-2 Transformer model in pure C */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "trap.h" +#include "utils.h" + +#include "extern_data.h" + +// ---------------------------------------------------------------------------- +// Transformer model + + +typedef struct { + int dim; // transformer dimension + int hidden_dim; // for ffn layers + int n_layers; // number of layers + int n_heads; // number of query heads + int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) + int vocab_size; // vocabulary size, usually 256 (byte-level) + int seq_len; // max sequence length +} Config; + +typedef struct { + // token embedding table + float* token_embedding_table; // (vocab_size, dim) + // weights for rmsnorms + float* rms_att_weight; // (layer, dim) rmsnorm weights + float* rms_ffn_weight; // (layer, dim) + // weights for matmuls. note dim == n_heads * head_size + float* wq; // (layer, dim, n_heads * head_size) + float* wk; // (layer, dim, n_kv_heads * head_size) + float* wv; // (layer, dim, n_kv_heads * head_size) + float* wo; // (layer, n_heads * head_size, dim) + // weights for ffn + float* w1; // (layer, hidden_dim, dim) + float* w2; // (layer, dim, hidden_dim) + float* w3; // (layer, hidden_dim, dim) + // final rmsnorm + float* rms_final_weight; // (dim,) + // (optional) classifier weights for the logits, on the last layer + float* wcls; +} TransformerWeights; + +typedef struct { + // current wave of activations + float *x; // activation at current time stamp (dim,) + float *xb; // same, but inside a residual branch (dim,) + float *xb2; // an additional buffer just for convenience (dim,) + float *hb; // buffer for hidden dimension in the ffn (hidden_dim,) + float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,) + float *q; // query (dim,) + float *k; // key (dim,) + float *v; // value (dim,) + float *att; // buffer for scores/attention values (n_heads, seq_len) + float *logits; // output logits + // kv cache + float* key_cache; // (layer, seq_len, dim) + float* value_cache; // (layer, seq_len, dim) +} RunState; + +typedef struct { + Config config; // the hyperparameters of the architecture (the blueprint) + TransformerWeights weights; // the weights of the model + RunState state; // buffers for the "wave" of activations in the forward pass + // some more state needed to properly clean up the memory mapping (sigh) + int fd; // file descriptor for memory mapping + float* data; // memory mapped data pointer + ssize_t file_size; // size of the checkpoint file in bytes +} Transformer; + +void malloc_run_state(RunState* s, Config* p) { + // we calloc instead of malloc to keep valgrind happy + int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads; + s->x = calloc(p->dim, sizeof(float)); + s->xb = calloc(p->dim, sizeof(float)); + s->xb2 = calloc(p->dim, sizeof(float)); + s->hb = calloc(p->hidden_dim, sizeof(float)); + s->hb2 = calloc(p->hidden_dim, sizeof(float)); + s->q = calloc(p->dim, sizeof(float)); + s->key_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float)); + s->value_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float)); + s->att = calloc(p->n_heads * p->seq_len, sizeof(float)); + s->logits = calloc(p->vocab_size, sizeof(float)); + // ensure all mallocs went fine + if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q + || !s->key_cache || !s->value_cache || !s->att || !s->logits) { + fprintf(stderr, "malloc failed!\n"); + exit(EXIT_FAILURE); + } +} + +void free_run_state(RunState* s) { + free(s->x); + free(s->xb); + free(s->xb2); + free(s->hb); + free(s->hb2); + free(s->q); + free(s->att); + free(s->logits); + free(s->key_cache); + free(s->value_cache); +} + +void memory_map_weights(TransformerWeights *w, Config* p, float* ptr, int shared_weights) { + int head_size = p->dim / p->n_heads; + // make sure the multiplications below are done in 64bit to fit the parameter counts of 13B+ models + unsigned long long n_layers = p->n_layers; + w->token_embedding_table = ptr; + ptr += p->vocab_size * p->dim; + w->rms_att_weight = ptr; + ptr += n_layers * p->dim; + w->wq = ptr; + ptr += n_layers * p->dim * (p->n_heads * head_size); + w->wk = ptr; + ptr += n_layers * p->dim * (p->n_kv_heads * head_size); + w->wv = ptr; + ptr += n_layers * p->dim * (p->n_kv_heads * head_size); + w->wo = ptr; + ptr += n_layers * (p->n_heads * head_size) * p->dim; + w->rms_ffn_weight = ptr; + ptr += n_layers * p->dim; + w->w1 = ptr; + ptr += n_layers * p->dim * p->hidden_dim; + w->w2 = ptr; + ptr += n_layers * p->hidden_dim * p->dim; + w->w3 = ptr; + ptr += n_layers * p->dim * p->hidden_dim; + w->rms_final_weight = ptr; + ptr += p->dim; + ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_real (for RoPE) + ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_imag (for RoPE) + w->wcls = shared_weights ? w->token_embedding_table : ptr; +} + +void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights, + int* fd, float** data, ssize_t* file_size) { + BIN *file = bopen((char *) checkpoint_data, checkpoint_size); + // read in the config header + bread(config, sizeof(Config), 1, file); + // negative vocab size is hacky way of signaling unshared weights. bit yikes. + int shared_weights = config->vocab_size > 0 ? 1 : 0; + config->vocab_size = abs(config->vocab_size); + // figure out the file size + bclose(file); + + float* weights_ptr = checkpoint_data + sizeof(Config)/sizeof(float); + + memory_map_weights(weights, config, weights_ptr, shared_weights); +} + +void build_transformer(Transformer *t, char* checkpoint_path) { + // read in the Config and the Weights from the checkpoint + read_checkpoint(checkpoint_path, &t->config, &t->weights, &t->fd, &t->data, &t->file_size); + // allocate the RunState buffers + malloc_run_state(&t->state, &t->config); +} + +void free_transformer(Transformer* t) { + // free the RunState buffers + free_run_state(&t->state); +} + +// ---------------------------------------------------------------------------- +// neural net blocks; the dynamics of the Transformer + +void rmsnorm(float* o, float* x, float* weight, int size) { + // calculate sum of squares + float ss = 0.0f; + for (int j = 0; j < size; j++) { + ss += x[j] * x[j]; + } + ss /= size; + ss += 1e-5f; + ss = 1.0f / sqrtf(ss); + // normalize and scale + for (int j = 0; j < size; j++) { + o[j] = weight[j] * (ss * x[j]); + } +} + +void softmax(float* x, int size) { + // find max value (for numerical stability) + float max_val = x[0]; + for (int i = 1; i < size; i++) { + if (x[i] > max_val) { + max_val = x[i]; + } + } + // exp and sum + float sum = 0.0f; + for (int i = 0; i < size; i++) { + x[i] = expf(x[i] - max_val); + sum += x[i]; + } + // normalize + for (int i = 0; i < size; i++) { + x[i] /= sum; + } +} + +void matmul(float* xout, float* x, float* w, int n, int d) { + // W (d,n) @ x (n,) -> xout (d,) + // by far the most amount of time is spent inside this little function + int i; + #pragma omp parallel for private(i) + for (i = 0; i < d; i++) { + float val = 0.0f; + for (int j = 0; j < n; j++) { + val += w[i * n + j] * x[j]; + } + xout[i] = val; + } +} + +float* forward(Transformer* transformer, int token, int pos) { + + // a few convenience variables + Config* p = &transformer->config; + TransformerWeights* w = &transformer->weights; + RunState* s = &transformer->state; + float *x = s->x; + int dim = p->dim; + int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads; + int kv_mul = p->n_heads / p->n_kv_heads; // integer multiplier of the kv sharing in multiquery + int hidden_dim = p->hidden_dim; + int head_size = dim / p->n_heads; + + // copy the token embedding into x + float* content_row = w->token_embedding_table + token * dim; + memcpy(x, content_row, dim*sizeof(*x)); + + // forward all the layers + for(unsigned long long l = 0; l < p->n_layers; l++) { + + // attention rmsnorm + rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim); + + // key and value point to the kv cache + int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience + s->k = s->key_cache + loff + pos * kv_dim; + s->v = s->value_cache + loff + pos * kv_dim; + + // qkv matmuls for this position + matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim); + matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim); + matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim); + + // RoPE relative positional encoding: complex-valued rotate q and k in each head + for (int i = 0; i < dim; i+=2) { + int head_dim = i % head_size; + float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size); + float val = pos * freq; + float fcr = cosf(val); + float fci = sinf(val); + int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only + for (int v = 0; v < rotn; v++) { + float* vec = v == 0 ? s->q : s->k; // the vector to rotate (query or key) + float v0 = vec[i]; + float v1 = vec[i+1]; + vec[i] = v0 * fcr - v1 * fci; + vec[i+1] = v0 * fci + v1 * fcr; + } + } + + // multihead attention. iterate over all heads + int h; + #pragma omp parallel for private(h) + for (h = 0; h < p->n_heads; h++) { + // get the query vector for this head + float* q = s->q + h * head_size; + // attention scores for this head + float* att = s->att + h * p->seq_len; + // iterate over all timesteps, including the current one + for (int t = 0; t <= pos; t++) { + // get the key vector for this head and at this timestep + float* k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size; + // calculate the attention score as the dot product of q and k + float score = 0.0f; + for (int i = 0; i < head_size; i++) { + score += q[i] * k[i]; + } + score /= sqrtf(head_size); + // save the score to the attention buffer + att[t] = score; + } + + // softmax the scores to get attention weights, from 0..pos inclusively + softmax(att, pos + 1); + + // weighted sum of the values, store back into xb + float* xb = s->xb + h * head_size; + memset(xb, 0, head_size * sizeof(float)); + for (int t = 0; t <= pos; t++) { + // get the value vector for this head and at this timestep + float* v = s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size; + // get the attention weight for this timestep + float a = att[t]; + // accumulate the weighted value into xb + for (int i = 0; i < head_size; i++) { + xb[i] += a * v[i]; + } + } + } + + // final matmul to get the output of the attention + matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim); + + // residual connection back into x + for (int i = 0; i < dim; i++) { + x[i] += s->xb2[i]; + } + + // ffn rmsnorm + rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim); + + // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x)) + // first calculate self.w1(x) and self.w3(x) + matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim); + matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim); + + // SwiGLU non-linearity + for (int i = 0; i < hidden_dim; i++) { + float val = s->hb[i]; + // silu(x)=x*σ(x), where σ(x) is the logistic sigmoid + val *= (1.0f / (1.0f + expf(-val))); + // elementwise multiply with w3(x) + val *= s->hb2[i]; + s->hb[i] = val; + } + + // final matmul to get the output of the ffn + matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim); + + // residual connection + for (int i = 0; i < dim; i++) { + x[i] += s->xb[i]; + } + } + + // final rmsnorm + rmsnorm(x, x, w->rms_final_weight, dim); + + // classifier into logits + matmul(s->logits, x, w->wcls, p->dim, p->vocab_size); + return s->logits; +} + +// ---------------------------------------------------------------------------- +// The Byte Pair Encoding (BPE) Tokenizer that translates strings <-> tokens + +typedef struct { + char *str; + int id; +} TokenIndex; + +typedef struct { + char** vocab; + float* vocab_scores; + TokenIndex *sorted_vocab; + int vocab_size; + unsigned int max_token_length; + unsigned char byte_pieces[512]; // stores all single-byte strings +} Tokenizer; + +int compare_tokens(const void *a, const void *b) { + return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str); +} + +void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) { + // i should have written the vocab_size into the tokenizer file... sigh + t->vocab_size = vocab_size; + // malloc space to hold the scores and the strings + t->vocab = (char**)malloc(vocab_size * sizeof(char*)); + t->vocab_scores = (float*)malloc(vocab_size * sizeof(float)); + t->sorted_vocab = NULL; // initialized lazily + for (int i = 0; i < 256; i++) { + t->byte_pieces[i * 2] = (unsigned char)i; + t->byte_pieces[i * 2 + 1] = '\0'; + } + // read in the file + BIN *file = bopen((char *) tokenizer_data, tokenizer_size); + bread(&t->max_token_length, sizeof(int), 1, file); + int len; + for (int i = 0; i < vocab_size; i++) { + bread(t->vocab_scores + i, sizeof(float), 1, file); + bread(&len, sizeof(int), 1, file); + t->vocab[i] = (char *)malloc(len + 1); + bread(t->vocab[i], len, 1, file); + t->vocab[i][len] = '\0'; // add the string terminating token + } + bclose(file); +} + +void free_tokenizer(Tokenizer* t) { + for (int i = 0; i < t->vocab_size; i++) { free(t->vocab[i]); } + free(t->vocab); + free(t->vocab_scores); + free(t->sorted_vocab); +} + +char* decode(Tokenizer* t, int prev_token, int token) { + char *piece = t->vocab[token]; + // following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89) + if (prev_token == 1 && piece[0] == ' ') { piece++; } + // careful, some tokens designate raw bytes, and look like e.g. '<0x01>' + // parse this and convert and return the actual byte + unsigned char byte_val; + if (sscanf(piece, "<0x%02hhX>", &byte_val) == 1) { + piece = (char*)t->byte_pieces + byte_val * 2; + } + return piece; +} + +void safe_printf(char *piece) { + // piece might be a raw byte token, and we only want to print printable chars or whitespace + // because some of the other bytes can be various control codes, backspace, etc. + if (piece == NULL) { return; } + if (piece[0] == '\0') { return; } + if (piece[1] == '\0') { + unsigned char byte_val = piece[0]; + if (!(isprint(byte_val) || isspace(byte_val))) { + return; // bad byte, don't print it + } + } +} + +int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) { + // efficiently find the perfect match for str in vocab, return its index or -1 if not found + TokenIndex tok = { .str = str }; // acts as the key to search for + TokenIndex *res = bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens); + return res != NULL ? res->id : -1; +} + +void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) { + // encode the string text (input) into an upper-bound preallocated tokens[] array + // bos != 0 means prepend the BOS token (=1), eos != 0 means append the EOS token (=2) + if (text == NULL) { fprintf(stderr, "cannot encode NULL text\n"); exit(EXIT_FAILURE); } + + if (t->sorted_vocab == NULL) { + // lazily malloc and sort the vocabulary + t->sorted_vocab = malloc(t->vocab_size * sizeof(TokenIndex)); + for (int i = 0; i < t->vocab_size; i++) { + t->sorted_vocab[i].str = t->vocab[i]; + t->sorted_vocab[i].id = i; + } + qsort(t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens); + } + + // create a temporary buffer that will store merge candidates of always two consecutive tokens + // *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_length is 1) + char* str_buffer = malloc((t->max_token_length*2 +1 +2) * sizeof(char)); + size_t str_len = 0; + + // start at 0 tokens + *n_tokens = 0; + + // add optional BOS (=1) token, if desired + if (bos) tokens[(*n_tokens)++] = 1; + + // add_dummy_prefix is true by default + // so prepend a dummy prefix token to the input string, but only if text != "" + // TODO: pretty sure this isn't correct in the general case but I don't have the + // energy to read more of the sentencepiece code to figure out what it's doing + if (text[0] != '\0') { + int dummy_prefix = str_lookup(" ", t->sorted_vocab, t->vocab_size); + tokens[(*n_tokens)++] = dummy_prefix; + } + + // Okay UTF-8 time. This will get messy. Here is the reference from Wikipedia: + // Code point ↔ UTF-8 conversion + // First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4 + // U+0000 U+007F 0xxxxxxx + // U+0080 U+07FF 110xxxxx 10xxxxxx + // U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx + // U+10000 U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + + // process the raw (UTF-8) byte sequence of the input string + for (char *c = text; *c != '\0'; c++) { + + // reset buffer if the current byte is ASCII or a leading byte + // 0xC0 is 11000000, so (*c & 0xC0) keeps the first 2 bits and zeros the rest + // 0x80 is 10000000 + // in UTF-8, all continuation bytes start with "10" in first two bits + // so in English this is: "if this byte is not a continuation byte" + if ((*c & 0xC0) != 0x80) { + // this byte must be either a leading byte (11...) or an ASCII char (0x...) + // => reset our location, as we're starting a new UTF-8 codepoint + str_len = 0; + } + + // append the current byte to the buffer + str_buffer[str_len++] = *c; // ++ is post-increment, incremented after this line + str_buffer[str_len] = '\0'; + + // while the next character is a continuation byte, continue appending + // but if there are too many of them, just stop to avoid overruning str_buffer size. + if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) { + continue; + } + + // ok c+1 is not a continuation byte, so we've read in a full codepoint + int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size); + + if (id != -1) { + // we found this codepoint in vocab, add it as a token + tokens[(*n_tokens)++] = id; + } else { + // byte_fallback encoding: just encode each byte as a token + // +3 is here because the first 3 vocab elements are , , + // so the individual bytes only start at index 3 + for (int i=0; i < str_len; i++) { + tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3; + } + } + str_len = 0; // protect against a sequence of stray UTF8 continuation bytes + } + + // merge the best consecutive pair each iteration, according the scores in vocab_scores + while (1) { + float best_score = -1e10; + int best_id = -1; + int best_idx = -1; + + for (int i=0; i < (*n_tokens-1); i++) { + // check if we can merge the pair (tokens[i], tokens[i+1]) + sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]); + int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size); + if (id != -1 && t->vocab_scores[id] > best_score) { + // this merge pair exists in vocab! record its score and position + best_score = t->vocab_scores[id]; + best_id = id; + best_idx = i; + } + } + + if (best_idx == -1) { + break; // we couldn't find any more pairs to merge, so we're done + } + + // merge the consecutive pair (best_idx, best_idx+1) into new token best_id + tokens[best_idx] = best_id; + // delete token at position best_idx+1, shift the entire sequence back 1 + for (int i = best_idx+1; i < (*n_tokens-1); i++) { + tokens[i] = tokens[i+1]; + } + (*n_tokens)--; // token length decreased + } + + // add optional EOS (=2) token, if desired + if (eos) tokens[(*n_tokens)++] = 2; + + free(str_buffer); +} + +// ---------------------------------------------------------------------------- +// The Sampler, which takes logits and returns a sampled token +// sampling can be done in a few ways: greedy argmax, sampling, top-p sampling + +typedef struct { + float prob; + int index; +} ProbIndex; // struct used when sorting probabilities during top-p sampling + +typedef struct { + int vocab_size; + ProbIndex* probindex; // buffer used in top-p sampling + float temperature; + float topp; + unsigned long long rng_state; +} Sampler; + +int sample_argmax(float* probabilities, int n) { + // return the index that has the highest probability + int max_i = 0; + float max_p = probabilities[0]; + for (int i = 1; i < n; i++) { + if (probabilities[i] > max_p) { + max_i = i; + max_p = probabilities[i]; + } + } + return max_i; +} + +int sample_mult(float* probabilities, int n, float coin) { + // sample index from probabilities (they must sum to 1!) + // coin is a random number in [0, 1), usually from random_f32() + float cdf = 0.0f; + for (int i = 0; i < n; i++) { + cdf += probabilities[i]; + if (coin < cdf) { + return i; + } + } + return n - 1; // in case of rounding errors +} + +int compare(const void* a, const void* b) { + ProbIndex* a_ = (ProbIndex*) a; + ProbIndex* b_ = (ProbIndex*) b; + if (a_->prob > b_->prob) return -1; + if (a_->prob < b_->prob) return 1; + return 0; +} + +int sample_topp(float* probabilities, int n, float topp, ProbIndex* probindex, float coin) { + // top-p sampling (or "nucleus sampling") samples from the smallest set of + // tokens that exceed probability topp. This way we never sample tokens that + // have very low probabilities and are less likely to go "off the rails". + // coin is a random number in [0, 1), usually from random_f32() + + int n0 = 0; + // quicksort indices in descending order of probabilities + // values smaller than (1 - topp) / (n - 1) cannot be part of the result + // so for efficiency we crop these out as candidates before sorting + const float cutoff = (1.0f - topp) / (n - 1); + for (int i = 0; i < n; i++) { + if (probabilities[i] >= cutoff) { + probindex[n0].index = i; + probindex[n0].prob = probabilities[i]; + n0++; + } + } + qsort(probindex, n0, sizeof(ProbIndex), compare); + + // truncate the list where cumulative probability exceeds topp + float cumulative_prob = 0.0f; + int last_idx = n0 - 1; // in case of rounding errors consider all elements + for (int i = 0; i < n0; i++) { + cumulative_prob += probindex[i].prob; + if (cumulative_prob > topp) { + last_idx = i; + break; // we've exceeded topp by including last_idx + } + } + + // sample from the truncated list + float r = coin * cumulative_prob; + float cdf = 0.0f; + for (int i = 0; i <= last_idx; i++) { + cdf += probindex[i].prob; + if (r < cdf) { + return probindex[i].index; + } + } + return probindex[last_idx].index; // in case of rounding errors +} + +void build_sampler(Sampler* sampler, int vocab_size, float temperature, float topp, unsigned long long rng_seed) { + sampler->vocab_size = vocab_size; + sampler->temperature = temperature; + sampler->topp = topp; + sampler->rng_state = rng_seed; + // buffer only used with nucleus sampling; may not need but it's ~small + sampler->probindex = malloc(sampler->vocab_size * sizeof(ProbIndex)); +} + +void free_sampler(Sampler* sampler) { + free(sampler->probindex); +} + +unsigned int random_u32(unsigned long long *state) { + // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A + *state ^= *state >> 12; + *state ^= *state << 25; + *state ^= *state >> 27; + return (*state * 0x2545F4914F6CDD1Dull) >> 32; +} +float random_f32(unsigned long long *state) { // random float32 in [0,1) + return (random_u32(state) >> 8) / 16777216.0f; +} + +int sample(Sampler* sampler, float* logits) { + // sample the token given the logits and some hyperparameters + int next; + if (sampler->temperature == 0.0f) { + // greedy argmax sampling: take the token with the highest probability + next = sample_argmax(logits, sampler->vocab_size); + } else { + // apply the temperature to the logits + for (int q=0; qvocab_size; q++) { logits[q] /= sampler->temperature; } + // apply softmax to the logits to get the probabilities for next token + softmax(logits, sampler->vocab_size); + // flip a (float) coin (this is our source of entropy for sampling) + float coin = random_f32(&sampler->rng_state); + // we sample from this distribution to get the next token + if (sampler->topp <= 0 || sampler->topp >= 1) { + // simply sample from the predicted probability distribution + next = sample_mult(logits, sampler->vocab_size, coin); + } else { + // top-p (nucleus) sampling, clamping the least likely tokens to zero + next = sample_topp(logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); + } + } + return next; +} + +// ---------------------------------------------------------------------------- +// utilities: time + +long time_in_ms() { + return 0; +} + +// ---------------------------------------------------------------------------- +// generation loop + +void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, char *prompt, int steps) { + char *empty_prompt = ""; + if (prompt == NULL) { prompt = empty_prompt; } + + // encode the (string) prompt into tokens sequence + int num_prompt_tokens = 0; + int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int)); // +3 for '\0', ?BOS, ?EOS + encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens); + if (num_prompt_tokens < 1) { + fprintf(stderr, "something is wrong, expected at least 1 prompt token\n"); + exit(EXIT_FAILURE); + } + + // start the main loop + long start = 0; // used to time our code, only initialized after first iteration + int next; // will store the next token in the sequence + int token = prompt_tokens[0]; // kick off with the first token in the prompt + int pos = 0; // position in the sequence + while (pos < steps) { + + // forward the transformer to get logits for the next token + float* logits = forward(transformer, token, pos); + + // advance the state machine + if (pos < num_prompt_tokens - 1) { + // if we are still processing the input prompt, force the next prompt token + next = prompt_tokens[pos + 1]; + } else { + // otherwise sample the next token from the logits + next = sample(sampler, logits); + } + pos++; + + // data-dependent terminating condition: the BOS (=1) token delimits sequences + if (next == 1) { break; } + + // print the token as string, decode it with the Tokenizer object + char* piece = decode(tokenizer, token, next); + safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes + token = next; + + // init the timer here because the first iteration can be slower + if (start == 0) { start = time_in_ms(); } + } + + // report achieved tok/s (pos-1 because the timer starts after first iteration) + if (pos > 1) { + long end = time_in_ms(); + fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000); + } + + free(prompt_tokens); +} + +void read_stdin(const char* guide, char* buffer, size_t bufsize) { + // read a line from stdin, up to but not including \n + printf("%s", guide); + if (fgets(buffer, bufsize, stdin) != NULL) { + size_t len = strlen(buffer); + if (len > 0 && buffer[len - 1] == '\n') { + buffer[len - 1] = '\0'; // strip newline + } + } +} + +// ---------------------------------------------------------------------------- +// chat loop +// I manually inspected the tokens for a few chat conversations compared to +// python reference and that seemed ok, but this was not thoroughly tested and +// is not safely implemented, it's more a proof of concept atm. + +void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + char *cli_user_prompt, char *cli_system_prompt, int steps) { + + // buffers for reading the system prompt and user prompt from stdin + // you'll notice they are soomewhat haphazardly and unsafely set atm + char system_prompt[512]; + char user_prompt[512]; + char rendered_prompt[1152]; + int num_prompt_tokens = 0; + int* prompt_tokens = (int*)malloc(1152 * sizeof(int)); + int user_idx; + + // start the main loop + int8_t user_turn = 1; // user starts + int next; // will store the next token in the sequence + int token; // stores the current token to feed into the transformer + int prev_token; + int pos = 0; // position in the sequence + while (pos < steps) { + + // when it is the user's turn to contribute tokens to the dialog... + if (user_turn) { + // get the (optional) system prompt at position 0 + if (pos == 0) { + // at position 0, the user can also contribute a system prompt + if (cli_system_prompt == NULL) { + // system prompt was not passed in, attempt to get it from stdin + read_stdin("Enter system prompt (optional): ", system_prompt, sizeof(system_prompt)); + } else { + // system prompt was passed in, use it + strcpy(system_prompt, cli_system_prompt); + } + } + // get the user prompt + if (pos == 0 && cli_user_prompt != NULL) { + // user prompt for position 0 was passed in, use it + strcpy(user_prompt, cli_user_prompt); + } else { + // otherwise get user prompt from stdin + read_stdin("User: ", user_prompt, sizeof(user_prompt)); + } + // render user/system prompts into the Llama 2 Chat schema + if (pos == 0 && system_prompt[0] != '\0') { + char system_template[] = "[INST] <>\n%s\n<>\n\n%s [/INST]"; + sprintf(rendered_prompt, system_template, system_prompt, user_prompt); + } else { + char user_template[] = "[INST] %s [/INST]"; + sprintf(rendered_prompt, user_template, user_prompt); + } + // encode the rendered prompt into tokens + encode(tokenizer, rendered_prompt, 1, 0, prompt_tokens, &num_prompt_tokens); + user_idx = 0; // reset the user index + user_turn = 0; + printf("Assistant: "); + } + + // determine the token to pass into the transformer next + if (user_idx < num_prompt_tokens) { + // if we are still processing the input prompt, force the next prompt token + token = prompt_tokens[user_idx++]; + } else { + // otherwise use the next token sampled from previous turn + token = next; + } + // EOS (=2) token ends the Assistant turn + if (token == 2) { user_turn = 1; } + + // forward the transformer to get logits for the next token + float* logits = forward(transformer, token, pos); + next = sample(sampler, logits); + pos++; + + if (user_idx >= num_prompt_tokens && next != 2) { + // the Assistant is responding, so print its output + char* piece = decode(tokenizer, token, next); + safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes + fflush(stdout); + } + if (next == 2) { printf("\n"); } + } + printf("\n"); + free(prompt_tokens); +} + + +// ---------------------------------------------------------------------------- +// CLI, include only if not testing +#ifndef TESTING + +void error_usage() { + fprintf(stderr, "Usage: run [options]\n"); + fprintf(stderr, "Example: run model.bin -n 256 -i \"Once upon a time\"\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); + fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1] default 0.9\n"); + fprintf(stderr, " -s random seed, default time(NULL)\n"); + fprintf(stderr, " -n number of steps to run for, default 256. 0 = max_seq_len\n"); + fprintf(stderr, " -i input prompt\n"); + fprintf(stderr, " -z optional path to custom tokenizer\n"); + fprintf(stderr, " -m mode: generate|chat, default: generate\n"); + fprintf(stderr, " -y (optional) system prompt in chat mode\n"); + exit(EXIT_FAILURE); +} + +int test(int argc, char *argv[]) { + setup_mtvec(); + // default parameters + char checkpoint_path[64] = "./model/stories15M.bin"; // e.g. out/model.bin + char *tokenizer_path = "tokenizer.bin"; + float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher + float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 0.9 works well, but slower + int steps = 256; // number of steps to run for + char *prompt = NULL; // prompt string + unsigned long long rng_seed = 0; // seed rng with time by default + char *mode = "generate"; // generate|chat + char *system_prompt = NULL; // the (optional) system prompt to use in chat mode + + // build the Transformer via the model .bin file + Transformer transformer; + build_transformer(&transformer, checkpoint_path); + if (steps == 0 || steps > transformer.config.seq_len) steps = transformer.config.seq_len; // override to ~max length + + + // build the Tokenizer via the tokenizer .bin file + Tokenizer tokenizer; + build_tokenizer(&tokenizer, tokenizer_path, transformer.config.vocab_size); + + + // build the Sampler + Sampler sampler; + build_sampler(&sampler, transformer.config.vocab_size, temperature, topp, rng_seed); + // run! + if (strcmp(mode, "generate") == 0) { + generate(&transformer, &tokenizer, &sampler, prompt, steps); + } else if (strcmp(mode, "chat") == 0) { + chat(&transformer, &tokenizer, &sampler, prompt, system_prompt, steps); + } else { + fprintf(stderr, "unknown mode: %s\n", mode); + error_usage(); + } + + // memory and file handles cleanup + free_sampler(&sampler); + free_tokenizer(&tokenizer); + free_transformer(&transformer); + return 0; +} +#endif diff --git a/tests/perf/llama/trap.c b/tests/perf/llama/trap.c new file mode 100644 index 000000000..1cede6e87 --- /dev/null +++ b/tests/perf/llama/trap.c @@ -0,0 +1,39 @@ +#include +#include + +#include "trap.h" + +void __attribute__((aligned(4))) trap_handler() { + unsigned long mcause, mtval, mepc; + asm volatile( + "csrr %0, mcause\n" + "csrr %1, mtval\n" + "csrr %2, mepc\n" + : "=r" (mcause), "=r" (mtval), "=r" (mepc) + ); + printf("Exception: mcause=%08lx, mtval=%08lx, mepc=%08lx", mcause, mtval, mepc); + while(1); +} + +void setup_mtvec() { + void* ptr = &trap_handler; + asm volatile( + "csrw mtvec, %0" + : + : "r" (ptr) + ); +} + +void enter_smode() { + asm volatile( + "csrc mstatus, %0\n" + "csrs mstatus, %1\n" + ".option arch, -c\n" + "auipc %2, 0\n" + "addi %2, %2, 16\n" + "csrw mepc, %2\n" + "mret\n" + : + : "r" (0x1800), "r" (0x0800), "r" (0x10) + ); +} diff --git a/tests/perf/llama/trap.h b/tests/perf/llama/trap.h new file mode 100644 index 000000000..7165e52c8 --- /dev/null +++ b/tests/perf/llama/trap.h @@ -0,0 +1,5 @@ +#include + +void setup_mtvec(); +void trap_handler(); +void enter_smode(); diff --git a/tests/perf/llama/utils.h b/tests/perf/llama/utils.h new file mode 100644 index 000000000..3cacdc726 --- /dev/null +++ b/tests/perf/llama/utils.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include + +// provide FILE-like operations for on-memory data + +typedef struct { + const char *data_begin; + size_t size; + const char *cursor; +} BIN; + +inline BIN *bopen(const char *data_begin, size_t size) { + BIN *bin = (BIN *) malloc(sizeof(BIN)); + bin->cursor = bin->data_begin = data_begin; + bin->size = size; + return bin; +} + +inline void bclose(BIN *bin) { + free(bin); +} + +inline size_t bread(void *buffer, size_t size, size_t count, BIN *stream) { + memcpy(buffer, (void *) stream->cursor, size * count); + stream->cursor += size * count; + return count; +} + +inline size_t bseek(BIN *bin, long offset, int origin) { + switch (origin) { + case SEEK_SET: + bin->cursor = bin->data_begin + offset; break; + case SEEK_CUR: + bin->cursor += offset; break; + case SEEK_END: + bin->cursor = bin->data_begin + bin->size - offset; break; + default: + return 1; + } + return 0; +} + diff --git a/tests/t1.ld b/tests/t1.ld index d94135b6e..1a8248c01 100644 --- a/tests/t1.ld +++ b/tests/t1.ld @@ -31,5 +31,5 @@ SECTIONS { .vbss (TYPE = SHT_NOBITS) : { *(.vbss .vbss.*) } >SRAM __stacktop = ORIGIN(SCALAR) + LENGTH(SCALAR); /* put stack on the top of SCALAR */ - __heapbegin = ORIGIN(SCALAR); /* put heap on the begin of DDR */ + __heapbegin = ORIGIN(DDR); /* put heap on the begin of DDR */ }