diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 252b475a0..6cff82ebe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -103,6 +103,7 @@ jobs: uarch-pristine-ram.c uarch-pristine-hash.c machine-c-version.h + interpret-jump-table.h cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_amd64.deb cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_arm64.deb @@ -726,6 +727,7 @@ jobs: if: ${{ startsWith(github.ref, 'refs/tags/v') }} run: | mv artifacts/machine-c-version.h src + mv artifacts/interpret-jump-table.h src mv artifacts/uarch-pristine-ram.c uarch mv artifacts/uarch-pristine-hash.c uarch make create-generated-files-patch diff --git a/Makefile b/Makefile index 1d692987a..9640ae3eb 100644 --- a/Makefile +++ b/Makefile @@ -154,7 +154,7 @@ export CXX=g++ endif -GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h +GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h src/interpret-jump-table.h ADD_GENERATED_FILES_DIFF= add-generated-files.diff all: source-default @@ -244,12 +244,15 @@ lint-% check-format-% format-% check-format-lua-% check-lua-% format-lua-%: source-default: @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) -uarch: $(SRCDIR)/machine-c-version.h +uarch: $(SRCDIR)/machine-c-version.h $(SRCDIR)/interpret-jump-table.h @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C uarch $(SRCDIR)/machine-c-version.h: @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) machine-c-version.h +$(SRCDIR)/interpret-jump-table.h: + @eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) interpret-jump-table.h + build-emulator-builder-image: docker build $(DOCKER_PLATFORM) --build-arg DEBUG=$(debug) --build-arg COVERAGE=$(coverage) --build-arg SANITIZE=$(sanitize) --target builder -t cartesi/machine-emulator:builder -f Dockerfile . 
@@ -282,6 +285,7 @@ copy: docker create --name uarch-ram-bin $(DOCKER_PLATFORM) $(DEBIAN_IMG) docker cp uarch-ram-bin:/usr/src/emulator/$(DEB_FILENAME) . docker cp uarch-ram-bin:/usr/src/emulator/src/machine-c-version.h . + docker cp uarch-ram-bin:/usr/src/emulator/src/interpret-jump-table.h . docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-ram.bin . docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-ram.c . docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-hash.c . @@ -399,4 +403,3 @@ $(ADD_GENERATED_FILES_DIFF): $(GENERATED_FILES) .PHONY: help all submodules doc clean distclean src luacartesi hash uarch \ create-generated-files-patch $(SUBDIRS) $(SUBCLEAN) - diff --git a/src/.gitignore b/src/.gitignore index ecdc63b96..b59133762 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -8,3 +8,4 @@ compile_flags.txt coverage* jsonrpc-discover.cpp machine-c-version.h +interpret-jump-table.h diff --git a/src/Makefile b/src/Makefile index 5f66b62d7..3450a747b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -214,18 +214,27 @@ SHA3_CFLAGS=-O3 # Optimization flags for the interpreter ifneq (,$(filter yes,$(relwithdebinfo) $(release))) -ifneq (,$(filter gcc,$(CC))) -# The following flag helps GCC to eliminate more redundant computations in the interpret loop, -# saving some host instructions and improving performance. -# This flag is usually enabled by default at -O3, -# but we don't use -O3 because it enables some other flags that are not worth for the interpreter. 
-INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ftree-partial-pre +ifneq (,$(findstring gcc,$(CC))) +# The following improves computed goto dispatch as stated in the GCC manual +INTERPRET_CXXFLAGS+=-fno-gcse +# The following removes extra jumps in the computed goto dispatch +INTERPRET_CXXFLAGS+=-fno-crossjumping +# The following removes extra NOPs before jumping back to the interpret hot loop +INTERPRET_CXXFLAGS+=-fno-align-loops +# The interpreter dispatch loop performs better as a big inlined function +INTERPRET_CXXFLAGS+=-finline-limit=1024 +# The interpreter hot loop is big and puts pressure on register allocation; this improves register use +INTERPRET_CXXFLAGS+=-frename-registers -fweb +# The interpreter instruction dispatch is big; the following reduces its size, minimizing CPU cache pressure +INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple +# Some distributions enable the stack protector by default; make sure it's disabled +INTERPRET_CXXFLAGS+=-fno-stack-protector endif -# Disable jump tables, because it degrades the instruction decoding performance in the interpret loop, -# since it generates a memory indirection that has a high cost in opcode switches. 
-INTERPRET_CXXFLAGS+=-fno-jump-tables endif +# Make testing new optimization options easier +INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS) + # Link time optimizations ifeq ($(lto),yes) OPTFLAGS+=-flto=auto @@ -262,7 +271,7 @@ PGO_WORKLOAD=\ whetstone 25000 LINTER_IGNORE_SOURCES= -LINTER_IGNORE_HEADERS= +LINTER_IGNORE_HEADERS=interpret-jump-table.h LINTER_SOURCES=$(filter-out $(LINTER_IGNORE_SOURCES),$(strip $(wildcard *.cpp) $(wildcard *.c))) LINTER_HEADERS=$(filter-out $(LINTER_IGNORE_HEADERS),$(strip $(wildcard *.hpp) $(wildcard *.h))) @@ -273,7 +282,7 @@ CLANG_FORMAT=clang-format CLANG_FORMAT_UARCH_FILES:=$(wildcard ../uarch/*.cpp) CLANG_FORMAT_UARCH_FILES:=$(filter-out %uarch-printf%,$(strip $(CLANG_FORMAT_UARCH_FILES))) CLANG_FORMAT_FILES:=$(wildcard *.cpp) $(wildcard *.c) $(wildcard *.h) $(wildcard *.hpp) $(CLANG_FORMAT_UARCH_FILES) -CLANG_FORMAT_IGNORE_FILES:= +CLANG_FORMAT_IGNORE_FILES:=interpret-jump-table.h CLANG_FORMAT_FILES:=$(strip $(CLANG_FORMAT_FILES)) CLANG_FORMAT_FILES:=$(filter-out $(CLANG_FORMAT_IGNORE_FILES),$(strip $(CLANG_FORMAT_FILES))) @@ -542,12 +551,12 @@ jsonrpc-discover.cpp: jsonrpc-discover.json echo '} // namespace cartesi' >> jsonrpc-discover.cpp %.clang-tidy: %.cpp machine-c-version.h - @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) 2>/dev/null + @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) -DCLANG_TIDY_LINT 2>/dev/null @$(CXX) $(CXXFLAGS) $(LUA_INC) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1 @touch $@ %.clang-tidy: %.c - @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) 2>/dev/null + @$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) -DCLANG_TIDY_LINT 2>/dev/null @$(CC) $(CFLAGS) $< -MM -MT $@ -MF $@.d > /dev/null 2>&1 @touch $@ @@ -560,7 +569,10 @@ uarch-pristine-ram.o: $(UARCH_PRISTINE_RAM_C) uarch-pristine-hash.o: 
$(UARCH_PRISTINE_HASH_C) $(CC) $(CFLAGS) -c -o $@ $< -interpret.o: interpret.cpp machine-c-version.h +interpret-jump-table.h: ../tools/gen-interpret-jump-table.lua + $< > $@ || { rm -f $@; exit 1; } + +interpret.o: interpret.cpp interpret-jump-table.h machine-c-version.h $(CXX) $(CXXFLAGS) $(INTERPRET_CXXFLAGS) -c -o $@ $< %.o: %.cpp machine-c-version.h @@ -571,7 +583,7 @@ interpret.o: interpret.cpp machine-c-version.h ../uarch/uarch-pristine-ram.c ../uarch/uarch-pristine-hash.c: generate-uarch-pristine -generate-uarch-pristine: +generate-uarch-pristine: machine-c-version.h interpret-jump-table.h ifeq (,$(wildcard ../uarch/uarch-pristine-hash.c)) @if [ "$(DEV_ENV_HAS_TOOLCHAIN)" = "yes" ]; then \ $(MAKE) -C .. uarch; \ @@ -583,7 +595,7 @@ endif clean: clean-auto-generated clean-coverage clean-profile clean-tidy clean-libcartesi clean-executables clean-auto-generated: - @rm -f jsonrpc-discover.cpp machine-c-version.h + @rm -f jsonrpc-discover.cpp machine-c-version.h interpret-jump-table.h clean-tidy: @rm -f *.clang-tidy diff --git a/src/compiler-defines.h b/src/compiler-defines.h index 0fb7eb004..b2e331db9 100644 --- a/src/compiler-defines.h +++ b/src/compiler-defines.h @@ -40,4 +40,11 @@ #define PACKED __attribute__((packed)) +// Clang also defines __GNUC__ but does not support the optimize attribute +#if defined(__GNUC__) && !defined(__clang__) +#define FORCE_OPTIMIZE_O3 __attribute__((optimize("-O3"))) +#else +#define FORCE_OPTIMIZE_O3 +#endif + #endif diff --git a/src/device-state-access.h b/src/device-state-access.h index 51d5633a9..ef471818d 100644 --- a/src/device-state-access.h +++ b/src/device-state-access.h @@ -36,7 +36,7 @@ namespace cartesi { template class device_state_access : public i_device_state_access { public: - explicit device_state_access(STATE_ACCESS &a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) { + explicit device_state_access(STATE_ACCESS a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) { static_assert(is_an_i_state_access::value, "not an i_state_access"); } @@ -52,7 +52,7 @@ class device_state_access : public i_device_state_access { ~device_state_access() 
override = default; private: - STATE_ACCESS &m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + STATE_ACCESS m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) uint64_t m_mcycle; void do_set_mip(uint64_t mask) override { diff --git a/src/i-state-access.h b/src/i-state-access.h index d692b32c2..d4b126ea6 100644 --- a/src/i-state-access.h +++ b/src/i-state-access.h @@ -24,6 +24,7 @@ #include #include +#include "compiler-defines.h" #include "meta.h" #include "shadow-tlb.h" @@ -773,7 +774,7 @@ class i_state_access { // CRTP } /// \brief Invalidates all TLB entries of all types. - void flush_all_tlb() { + NO_INLINE void flush_all_tlb() { derived().template flush_tlb_type(); derived().template flush_tlb_type(); derived().template flush_tlb_type(); @@ -781,7 +782,7 @@ class i_state_access { // CRTP /// \brief Invalidates TLB entries for a specific virtual address. /// \param vaddr Target virtual address. - void flush_tlb_vaddr(uint64_t vaddr) { + NO_INLINE void flush_tlb_vaddr(uint64_t vaddr) { return derived().do_flush_tlb_vaddr(vaddr); } diff --git a/src/interpret.cpp b/src/interpret.cpp index 572d317e3..35769e9fd 100644 --- a/src/interpret.cpp +++ b/src/interpret.cpp @@ -113,6 +113,11 @@ namespace cartesi { +enum class rd_kind { + x0, // rd = 0 + xN, // rd is a positive natural number (1, 2, 3 ... 31) +}; + #ifdef DUMP_REGS static const std::array reg_name{"zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", @@ -296,6 +301,12 @@ static void dump_regs(const STATE &s) { } #endif +/// \brief Checks if an instruction is uncompressed. +/// \param insn Instruction. +static FORCE_INLINE bool insn_is_uncompressed(uint32_t insn) { + return (insn & 3) == 3; +} + /// \brief Checks if CSR is read-only. /// \param csraddr Address of CSR in file. /// \returns true if read-only, false otherwise. 
@@ -318,7 +329,7 @@ static inline uint32_t csr_priv(CSR_address csr) { /// \param new_prv New privilege level. -/// \details This function is outlined to minimize host CPU code cache pressure. +/// \details This function is forced to be inline. template -static NO_INLINE void set_priv(STATE_ACCESS &a, int new_prv) { +static FORCE_INLINE void set_priv(STATE_ACCESS a, int new_prv) { INC_COUNTER(a.get_statistics(), priv_level[new_prv]); a.write_iflags_PRV(new_prv); // Invalidate all TLB entries @@ -339,7 +350,13 @@ static NO_INLINE void set_priv(STATE_ACCESS &a, int new_prv) { /// \returns The new program counter, pointing to the raised exception trap handler. /// \details This function is outlined to minimize host CPU code cache pressure. template -static NO_INLINE uint64_t raise_exception(STATE_ACCESS &a, uint64_t pc, uint64_t cause, uint64_t tval) { +static NO_INLINE uint64_t raise_exception(STATE_ACCESS a, uint64_t pc, uint64_t cause, uint64_t tval) { + if (cause == MCAUSE_ILLEGAL_INSN && !insn_is_uncompressed(static_cast(tval))) { + // Discard the high bits of compressed instructions here; + // as an optimization, this is not done in the instruction hot loop. + tval = static_cast(tval); + } + #if defined(DUMP_EXCEPTIONS) || defined(DUMP_MMU_EXCEPTIONS) || defined(DUMP_INTERRUPTS) || \ defined(DUMP_ILLEGAL_INSN_EXCEPTIONS) { @@ -442,7 +459,7 @@ static NO_INLINE uint64_t raise_exception(STATE_ACCESS &a, uint64_t pc, uint64_t /// \param a Machine state accessor object. /// \returns The mask. template -static inline uint32_t get_pending_irq_mask(STATE_ACCESS &a) { +static inline uint32_t get_pending_irq_mask(STATE_ACCESS a) { const uint64_t mip = a.read_mip(); const uint64_t mie = a.read_mie(); @@ -522,7 +539,7 @@ static inline uint32_t get_highest_priority_irq_num(uint32_t v) { /// \param a Machine state accessor object. /// \param pc Machine current program counter. 
template -static inline uint64_t raise_interrupt_if_any(STATE_ACCESS &a, uint64_t pc) { +static inline uint64_t raise_interrupt_if_any(STATE_ACCESS a, uint64_t pc) { const uint32_t mask = get_pending_irq_mask(a); if (unlikely(mask != 0)) { const uint64_t irq_num = get_highest_priority_irq_num(mask); @@ -535,7 +552,7 @@ static inline uint64_t raise_interrupt_if_any(STATE_ACCESS &a, uint64_t pc) { /// \param a Machine state accessor object. /// \param mcycle Machine current cycle. template -static inline void set_rtc_interrupt(STATE_ACCESS &a, uint64_t mcycle) { +static inline void set_rtc_interrupt(STATE_ACCESS a, uint64_t mcycle) { const uint64_t timecmp_cycle = rtc_time_to_cycle(a.read_clint_mtimecmp()); if (timecmp_cycle <= mcycle && timecmp_cycle != 0) { const uint64_t mip = a.read_mip(); @@ -543,10 +560,10 @@ static inline void set_rtc_interrupt(STATE_ACCESS &a, uint64_t mcycle) { } } -/// \brief Obtains the funct3 and opcode fields an instruction. +/// \brief Obtains the id fields (the 16 low-order bits) of an instruction. /// \param insn Instruction. -static inline uint32_t insn_get_funct3_00000_opcode(uint32_t insn) { - return insn & 0b111000001111111; +static FORCE_INLINE uint32_t insn_get_id(uint32_t insn) { + return insn & 0b1111'11111'1111111; } /// \brief Obtains the funct3 and trailing 0 bits from an instruction. @@ -668,24 +685,6 @@ static inline uint32_t insn_get_rs3(uint32_t insn) { return (insn >> 27); } -/// \brief Obtains the compressed instruction funct3 and opcode fields an instruction. -/// \param insn Instruction. -static inline uint32_t insn_get_c_funct3(uint32_t insn) { - return insn & 0b1110000000000011; -} - -/// \brief Obtains the compressed instruction funct6, funct2 and opcode fields an instruction. -/// \param insn Instruction. -static inline uint32_t insn_get_CA_funct6_funct2(uint32_t insn) { - return insn & 0b1111110001100011; -} - -/// \brief Obtains the compressed instruction funct2 and opcode fields an instruction. -/// \param insn Instruction. 
-static inline uint32_t insn_get_CB_funct2(uint32_t insn) { - return insn & 0b1110110000000011; -} - /// \brief Obtains the RD field from a compressed instructions that uses the CIW /// or CL format and RS2 field from CS or CA. /// \param insn Instruction. @@ -709,19 +708,29 @@ static inline uint32_t insn_get_CR_CSS_rs2(uint32_t insn) { /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_C_J_imm(uint32_t insn) { - auto imm = static_cast(((insn >> (12 - 11)) & 0x800) | ((insn >> (11 - 4)) & 0x10) | - ((insn >> (9 - 8)) & 0x300) | ((insn << (10 - 8)) & 0x400) | ((insn >> (7 - 6)) & 0x40) | - ((insn << (7 - 6)) & 0x80) | ((insn >> (3 - 1)) & 0xe) | ((insn << (5 - 2)) & 0x20)); - return (imm << 20) >> 20; + return static_cast( + (static_cast(static_cast(insn << 19) >> 20) & ~0b11111111111) | // imm[11] + ((insn >> (11 - 4)) & 0b10000) | // imm[4] + ((insn >> (9 - 8)) & 0b1100000000) | // imm[9:8] + ((insn << (10 - 8)) & 0b10000000000) | // imm[10] + ((insn >> (7 - 6)) & 0b1000000) | // imm[6] + ((insn << (7 - 6)) & 0b10000000) | // imm[7] + ((insn >> (3 - 1)) & 0b1110) | // imm[3:1] + ((insn << (5 - 2)) & 0b100000) // imm[5] + ); } /// \brief Obtains the immediate value from a C_BEQZ and C_BNEZ instruction. /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. 
static FORCE_INLINE int32_t insn_get_C_BEQZ_BNEZ_imm(uint32_t insn) { - auto imm = static_cast(((insn >> (12 - 8)) & 0x100) | ((insn >> (10 - 3)) & 0x18) | - ((insn << (6 - 5)) & 0xc0) | ((insn >> (3 - 1)) & 0x6) | ((insn << (5 - 2)) & 0x20)); - return (imm << 23) >> 23; + return static_cast( + (static_cast(static_cast(insn << 19) >> 23) & ~0b11111111) | // imm[8] + ((insn >> 7) & 0b11000) | // imm[4:3] + ((insn << 1) & 0b11000000) | // imm[7:6] + ((insn >> 2) & 0b110) | // imm[2:1] + ((insn << 3) & 0b100000) // imm[5] + ); } /// \brief Obtains the immediate value from a CL/CS-type instruction. @@ -742,7 +751,9 @@ static FORCE_INLINE uint32_t insn_get_CI_CB_imm(uint32_t insn) { /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_CI_CB_imm_se(uint32_t insn) { - return static_cast(insn_get_CI_CB_imm(insn) << 26) >> 26; + return static_cast((static_cast(static_cast(insn << 19) >> 26) & ~0b11111) | // imm[5] + ((insn >> 2) & 0b11111) // imm[4:0] + ); } /// \brief Obtains the immediate value from a C.LW and C.SW instructions. @@ -764,17 +775,23 @@ static FORCE_INLINE uint32_t insn_get_CIW_imm(uint32_t insn) { /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_C_ADDI16SP_imm(uint32_t insn) { - auto imm = static_cast(((insn >> (12 - 9)) & 0x200) | ((insn >> (6 - 4)) & 0x10) | - ((insn << (6 - 5)) & 0x40) | ((insn << (7 - 3)) & 0x180) | ((insn << (5 - 2)) & 0x20)); - return (imm << 22) >> 22; + return static_cast( + (static_cast(static_cast(insn << 19) >> 22) & ~0b111111111) | // imm[9] + ((insn >> 2) & 0b10000) | // imm[4] + ((insn << 1) & 0b1000000) | // imm[6] + ((insn << 4) & 0b110000000) | // imm[8:7] + ((insn << 3) & 0b100000) // imm[5] + ); } /// \brief Obtains the immediate value from a C.LUI instruction. /// \param insn Instruction. 
/// \details This function is forced to be inline because GCC may not always inline it. static FORCE_INLINE int32_t insn_get_C_LUI_imm(uint32_t insn) { - auto imm = static_cast(((insn << (17 - 12)) & 0x20000) | ((insn << (12 - 2)) & 0x1F000)); - return (imm << 14) >> 14; + return static_cast( + (static_cast(static_cast(insn << 19) >> 14) & ~0b11111111111111111) | // imm[17] + ((insn << 10) & 0b11111000000000000) // imm[16:12] + ); } /// \brief Obtains the immediate value from a C.FLDSP and C.LDSP instructions. @@ -821,7 +838,7 @@ static FORCE_INLINE int32_t insn_get_C_SWSP_imm(uint32_t insn) { /// is outlined, and taking PC by reference would cause the compiler to store it in a stack variable /// instead of always storing it in register (this is an optimization). template -static NO_INLINE std::pair read_virtual_memory_slow(STATE_ACCESS &a, uint64_t pc, uint64_t mcycle, +static NO_INLINE std::pair read_virtual_memory_slow(STATE_ACCESS a, uint64_t pc, uint64_t mcycle, uint64_t vaddr, T *pval) { using U = std::make_unsigned_t; // No support for misaligned accesses: They are handled by a trap in BBL @@ -870,13 +887,15 @@ static NO_INLINE std::pair read_virtual_memory_slow(STATE_ACCESS /// \param pval Pointer to word receiving value. /// \returns True if succeeded, false otherwise. 
template -static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, T *pval) { +static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, T *pval) { // Try hitting the TLB if (unlikely(!(a.template read_memory_word_via_tlb(vaddr, pval)))) { // Outline the slow path into a function call to minimize host CPU code cache pressure INC_COUNTER(a.get_statistics(), tlb_rmiss); + T val = 0; // Don't pass pval reference directly so the compiler can store it in a register auto [status, new_pc] = - read_virtual_memory_slow(a, pc, mcycle, vaddr, pval); + read_virtual_memory_slow(a, pc, mcycle, vaddr, &val); + *pval = val; pc = new_pc; return status; } @@ -899,7 +918,7 @@ static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS &a, uint64_t &pc, uint /// is outlined, and taking PC by reference would cause the compiler to store it in a stack variable /// instead of always storing it in register (this is an optimization). template -static NO_INLINE std::pair write_virtual_memory_slow(STATE_ACCESS &a, uint64_t pc, +static NO_INLINE std::pair write_virtual_memory_slow(STATE_ACCESS a, uint64_t pc, uint64_t mcycle, uint64_t vaddr, uint64_t val64) { using U = std::make_unsigned_t; // No support for misaligned accesses: They are handled by a trap in BBL @@ -944,7 +963,7 @@ static NO_INLINE std::pair write_virtual_memory_slow(S /// \param val64 Value to write. /// \returns True if succeeded, false if exception raised. 
template -static FORCE_INLINE execute_status write_virtual_memory(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, +static FORCE_INLINE execute_status write_virtual_memory(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint64_t vaddr, uint64_t val64) { // Try hitting the TLB if (unlikely((!a.template write_memory_word_via_tlb(vaddr, static_cast(val64))))) { @@ -959,7 +978,7 @@ static FORCE_INLINE execute_status write_virtual_memory(STATE_ACCESS &a, uint64_ } template -static void dump_insn([[maybe_unused]] STATE_ACCESS &a, [[maybe_unused]] uint64_t pc, [[maybe_unused]] uint32_t insn, +static void dump_insn([[maybe_unused]] STATE_ACCESS a, [[maybe_unused]] uint64_t pc, [[maybe_unused]] uint32_t insn, [[maybe_unused]] const char *name) { #ifdef DUMP_HIST a.get_naked_state().insn_hist[name]++; @@ -993,7 +1012,7 @@ static void dump_insn([[maybe_unused]] STATE_ACCESS &a, [[maybe_unused]] uint64_ /// \details This function is tail-called whenever the caller decoded enough of the instruction to identify it as /// illegal. template -static FORCE_INLINE execute_status raise_illegal_insn_exception(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status raise_illegal_insn_exception(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { pc = raise_exception(a, pc, MCAUSE_ILLEGAL_INSN, insn); return execute_status::failure; } @@ -1005,7 +1024,7 @@ static FORCE_INLINE execute_status raise_illegal_insn_exception(STATE_ACCESS &a, /// \return execute_status::failure /// \details This function is tail-called whenever the caller identified that the next value of pc is misaligned. 
template -static FORCE_INLINE execute_status raise_misaligned_fetch_exception(STATE_ACCESS &a, uint64_t &pc, uint64_t new_pc) { +static FORCE_INLINE execute_status raise_misaligned_fetch_exception(STATE_ACCESS a, uint64_t &pc, uint64_t new_pc) { pc = raise_exception(a, pc, MCAUSE_INSN_ADDRESS_MISALIGNED, new_pc); return execute_status::failure; } @@ -1017,7 +1036,7 @@ static FORCE_INLINE execute_status raise_misaligned_fetch_exception(STATE_ACCESS /// \return execute_status::failure /// \details This function is tail-called whenever the caller identified a raised exception. template -static FORCE_INLINE execute_status advance_to_raised_exception(STATE_ACCESS & /*a*/, uint64_t & /*pc*/) { +static FORCE_INLINE execute_status advance_to_raised_exception(STATE_ACCESS /*a*/, uint64_t & /*pc*/) { return execute_status::failure; } @@ -1030,7 +1049,7 @@ static FORCE_INLINE execute_status advance_to_raised_exception(STATE_ACCESS & /* /// \return status /// \details This function is tail-called whenever the caller wants move to the next instruction. template -static FORCE_INLINE execute_status advance_to_next_insn(STATE_ACCESS & /*a*/, uint64_t &pc, +static FORCE_INLINE execute_status advance_to_next_insn(STATE_ACCESS /*a*/, uint64_t &pc, execute_status status = execute_status::success) { pc += static_cast(size); return status; @@ -1043,7 +1062,7 @@ static FORCE_INLINE execute_status advance_to_next_insn(STATE_ACCESS & /*a*/, ui /// \return execute_status::success /// \details This function is tail-called whenever the caller wants to jump. template -static FORCE_INLINE execute_status execute_jump(STATE_ACCESS & /*a*/, uint64_t &pc, uint64_t new_pc) { +static FORCE_INLINE execute_status execute_jump(STATE_ACCESS /*a*/, uint64_t &pc, uint64_t new_pc) { pc = new_pc; return execute_status::success; } @@ -1054,7 +1073,7 @@ static FORCE_INLINE execute_status execute_jump(STATE_ACCESS & /*a*/, uint64_t & /// \param pc Interpreter loop program counter (will be overwritten). 
/// \param insn Instruction. template -static FORCE_INLINE execute_status execute_LR(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_LR(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); T val = 0; if (unlikely(!read_virtual_memory(a, pc, mcycle, vaddr, &val))) { @@ -1075,7 +1094,7 @@ static FORCE_INLINE execute_status execute_LR(STATE_ACCESS &a, uint64_t &pc, uin /// \param pc Interpreter loop program counter (will be overwritten). /// \param insn Instruction. template -static FORCE_INLINE execute_status execute_SC(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SC(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { uint64_t val = 0; const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); execute_status status = execute_status::success; @@ -1098,7 +1117,7 @@ static FORCE_INLINE execute_status execute_SC(STATE_ACCESS &a, uint64_t &pc, uin /// \brief Implementation of the LR.W instruction. template -static FORCE_INLINE execute_status execute_LR_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_LR_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { if (unlikely((insn & 0b00000001111100000000000000000000) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } @@ -1108,13 +1127,13 @@ static FORCE_INLINE execute_status execute_LR_W(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the SC.W instruction. 
template -static FORCE_INLINE execute_status execute_SC_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SC_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sc.w"); return execute_SC(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_AMO(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_AMO(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const F &f) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); T valm = 0; @@ -1139,14 +1158,14 @@ static FORCE_INLINE execute_status execute_AMO(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the AMOSWAP.W instruction. template -static FORCE_INLINE execute_status execute_AMOSWAP_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOSWAP_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoswap.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t /*valm*/, int32_t valr) -> int32_t { return valr; }); } /// \brief Implementation of the AMOADD.W instruction. 
template -static FORCE_INLINE execute_status execute_AMOADD_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOADD_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoadd.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { int32_t val = 0; @@ -1156,28 +1175,28 @@ static FORCE_INLINE execute_status execute_AMOADD_W(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_AMOXOR_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOXOR_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoxor.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { return valm ^ valr; }); } /// \brief Implementation of the AMOAND.W instruction. template -static FORCE_INLINE execute_status execute_AMOAND_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOAND_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoand.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { return valm & valr; }); } /// \brief Implementation of the AMOOR.W instruction. template -static FORCE_INLINE execute_status execute_AMOOR_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOOR_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoor.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { return valm | valr; }); } /// \brief Implementation of the AMOMIN.W instruction. 
template -static FORCE_INLINE execute_status execute_AMOMIN_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMIN_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomin.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (valm < valr) { @@ -1189,7 +1208,7 @@ static FORCE_INLINE execute_status execute_AMOMIN_W(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMAX.W instruction. template -static FORCE_INLINE execute_status execute_AMOMAX_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAX_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomax.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (valm > valr) { @@ -1201,7 +1220,7 @@ static FORCE_INLINE execute_status execute_AMOMAX_W(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMINU.W instruction. template -static FORCE_INLINE execute_status execute_AMOMINU_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMINU_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amominu.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (static_cast(valm) < static_cast(valr)) { @@ -1213,7 +1232,7 @@ static FORCE_INLINE execute_status execute_AMOMINU_W(STATE_ACCESS &a, uint64_t & /// \brief Implementation of the AMOMAXU.W instruction. 
template -static FORCE_INLINE execute_status execute_AMOMAXU_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAXU_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomaxu.w"); return execute_AMO(a, pc, mcycle, insn, [](int32_t valm, int32_t valr) -> int32_t { if (static_cast(valm) > static_cast(valr)) { @@ -1225,7 +1244,7 @@ static FORCE_INLINE execute_status execute_AMOMAXU_W(STATE_ACCESS &a, uint64_t & /// \brief Implementation of the LR.D instruction. template -static FORCE_INLINE execute_status execute_LR_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_LR_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { if (unlikely((insn & 0b00000001111100000000000000000000) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } @@ -1235,21 +1254,21 @@ static FORCE_INLINE execute_status execute_LR_D(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the SC.D instruction. template -static FORCE_INLINE execute_status execute_SC_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SC_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sc.d"); return execute_SC(a, pc, mcycle, insn); } /// \brief Implementation of the AMOSWAP.D instruction. template -static FORCE_INLINE execute_status execute_AMOSWAP_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOSWAP_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoswap.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t /*valm*/, int64_t valr) -> int64_t { return valr; }); } /// \brief Implementation of the AMOADD.D instruction. 
template -static FORCE_INLINE execute_status execute_AMOADD_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOADD_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoadd.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { int64_t val = 0; @@ -1259,28 +1278,28 @@ static FORCE_INLINE execute_status execute_AMOADD_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_AMOXOR_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOXOR_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoxor.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { return valm ^ valr; }); } /// \brief Implementation of the AMOAND.D instruction. template -static FORCE_INLINE execute_status execute_AMOAND_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOAND_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoand.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { return valm & valr; }); } /// \brief Implementation of the AMOOR.D instruction. template -static FORCE_INLINE execute_status execute_AMOOR_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOOR_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amoor.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { return valm | valr; }); } /// \brief Implementation of the AMOMIN.D instruction. 
template -static FORCE_INLINE execute_status execute_AMOMIN_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMIN_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomin.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { if (valm < valr) { @@ -1292,7 +1311,7 @@ static FORCE_INLINE execute_status execute_AMOMIN_D(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMAX.D instruction. template -static FORCE_INLINE execute_status execute_AMOMAX_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAX_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomax.d"); return execute_AMO(a, pc, mcycle, insn, [](int64_t valm, int64_t valr) -> int64_t { if (valm > valr) { @@ -1304,7 +1323,7 @@ static FORCE_INLINE execute_status execute_AMOMAX_D(STATE_ACCESS &a, uint64_t &p /// \brief Implementation of the AMOMINU.D instruction. template -static FORCE_INLINE execute_status execute_AMOMINU_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMINU_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amominu.d"); return execute_AMO(a, pc, mcycle, insn, [](uint64_t valm, uint64_t valr) -> uint64_t { if (valm < valr) { @@ -1316,7 +1335,7 @@ static FORCE_INLINE execute_status execute_AMOMINU_D(STATE_ACCESS &a, uint64_t & /// \brief Implementation of the AMOMAXU.D instruction. 
template -static FORCE_INLINE execute_status execute_AMOMAXU_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMOMAXU_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "amomaxu.d"); return execute_AMO(a, pc, mcycle, insn, [](uint64_t valm, uint64_t valr) -> uint64_t { if (valm > valr) { @@ -1327,9 +1346,12 @@ static FORCE_INLINE execute_status execute_AMOMAXU_D(STATE_ACCESS &a, uint64_t & } /// \brief Implementation of the ADDW instruction. -template -static FORCE_INLINE execute_status execute_ADDW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADDW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "addw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { // Discard upper 32 bits auto rs1w = static_cast(rs1); @@ -1341,9 +1363,12 @@ static FORCE_INLINE execute_status execute_ADDW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SUBW instruction. -template -static FORCE_INLINE execute_status execute_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SUBW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "subw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { // Convert 64-bit to 32-bit auto rs1w = static_cast(rs1); @@ -1355,12 +1380,15 @@ static FORCE_INLINE execute_status execute_SUBW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SLLW instruction. 
-template -static FORCE_INLINE execute_status execute_SLLW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLLW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000000000000000001000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "sllw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { const auto rs1w = static_cast(static_cast(rs1) << (rs2 & 31)); return static_cast(rs1w); @@ -1368,9 +1396,12 @@ static FORCE_INLINE execute_status execute_SLLW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SRLW instruction. -template -static FORCE_INLINE execute_status execute_SRLW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRLW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srlw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(static_cast(rs1) >> (rs2 & 31)); return static_cast(rs1w); @@ -1378,9 +1409,12 @@ static FORCE_INLINE execute_status execute_SRLW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SRAW instruction. 
-template -static FORCE_INLINE execute_status execute_SRAW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRAW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sraw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { const int32_t rs1w = static_cast(rs1) >> (rs2 & 31); return static_cast(rs1w); @@ -1388,9 +1422,12 @@ static FORCE_INLINE execute_status execute_SRAW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the MULW instruction. -template -static FORCE_INLINE execute_status execute_MULW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1401,12 +1438,15 @@ static FORCE_INLINE execute_status execute_MULW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the DIVW instruction. 
-template -static FORCE_INLINE execute_status execute_DIVW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIVW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000010000000000100000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "divw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1421,9 +1461,12 @@ static FORCE_INLINE execute_status execute_DIVW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the DIVUW instruction. -template -static FORCE_INLINE execute_status execute_DIVUW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIVUW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "divuw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1435,12 +1478,15 @@ static FORCE_INLINE execute_status execute_DIVUW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the REMW instruction. 
-template -static FORCE_INLINE execute_status execute_REMW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REMW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000010000000000110000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "remw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1455,12 +1501,15 @@ static FORCE_INLINE execute_status execute_REMW(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the REMUW instruction. -template -static FORCE_INLINE execute_status execute_REMUW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REMUW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & 0b11111110000000000111000001111111) != 0b00000010000000000111000000111011)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "remuw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto rs1w = static_cast(rs1); auto rs2w = static_cast(rs2); @@ -1482,7 +1531,7 @@ static inline uint64_t read_csr_success(uint64_t val, bool *status) { } template -static inline bool rdcounteren(STATE_ACCESS &a, uint64_t mask) { +static inline bool rdcounteren(STATE_ACCESS a, uint64_t mask) { uint64_t counteren = MCOUNTEREN_R_MASK; auto priv = a.read_iflags_PRV(); if (priv <= PRV_S) { @@ -1495,7 +1544,7 @@ static inline bool rdcounteren(STATE_ACCESS &a, uint64_t mask) { } template -static inline uint64_t read_csr_cycle(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_cycle(STATE_ACCESS a, uint64_t mcycle, 
bool *status) { if (rdcounteren(a, MCOUNTEREN_CY_MASK)) { return read_csr_success(mcycle, status); } @@ -1503,7 +1552,7 @@ static inline uint64_t read_csr_cycle(STATE_ACCESS &a, uint64_t mcycle, bool *st } template -static inline uint64_t read_csr_instret(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_instret(STATE_ACCESS a, uint64_t mcycle, bool *status) { if (unlikely(!rdcounteren(a, MCOUNTEREN_IR_MASK))) { return read_csr_fail(status); } @@ -1513,7 +1562,7 @@ static inline uint64_t read_csr_instret(STATE_ACCESS &a, uint64_t mcycle, bool * } template -static inline uint64_t read_csr_time(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_time(STATE_ACCESS a, uint64_t mcycle, bool *status) { if (unlikely(!rdcounteren(a, MCOUNTEREN_TM_MASK))) { return read_csr_fail(status); } @@ -1522,54 +1571,54 @@ static inline uint64_t read_csr_time(STATE_ACCESS &a, uint64_t mcycle, bool *sta } template -static inline uint64_t read_csr_sstatus(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sstatus(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mstatus() & SSTATUS_R_MASK, status); } template -static inline uint64_t read_csr_senvcfg(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_senvcfg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_senvcfg() & SENVCFG_R_MASK, status); } template -static inline uint64_t read_csr_sie(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sie(STATE_ACCESS a, bool *status) { const uint64_t mie = a.read_mie(); const uint64_t mideleg = a.read_mideleg(); return read_csr_success(mie & mideleg, status); } template -static inline uint64_t read_csr_stvec(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_stvec(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_stvec(), status); } template -static inline uint64_t read_csr_scounteren(STATE_ACCESS &a, bool *status) { +static inline 
uint64_t read_csr_scounteren(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_scounteren(), status); } template -static inline uint64_t read_csr_sscratch(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sscratch(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_sscratch(), status); } template -static inline uint64_t read_csr_sepc(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sepc(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_sepc(), status); } template -static inline uint64_t read_csr_scause(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_scause(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_scause(), status); } template -static inline uint64_t read_csr_stval(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_stval(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_stval(), status); } template -static inline uint64_t read_csr_sip(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_sip(STATE_ACCESS a, bool *status) { // Ensure values are are loaded in order: do not nest with operator const uint64_t mip = a.read_mip(); const uint64_t mideleg = a.read_mideleg(); @@ -1577,7 +1626,7 @@ static inline uint64_t read_csr_sip(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_satp(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_satp(STATE_ACCESS a, bool *status) { const uint64_t mstatus = a.read_mstatus(); auto priv = a.read_iflags_PRV(); // When TVM=1, attempts to read or write the satp CSR @@ -1589,67 +1638,67 @@ static inline uint64_t read_csr_satp(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_mstatus(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mstatus(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mstatus() & MSTATUS_R_MASK, status); } template -static inline uint64_t read_csr_menvcfg(STATE_ACCESS &a, bool 
*status) { +static inline uint64_t read_csr_menvcfg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_menvcfg() & MENVCFG_R_MASK, status); } template -static inline uint64_t read_csr_misa(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_misa(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_misa(), status); } template -static inline uint64_t read_csr_medeleg(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_medeleg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_medeleg(), status); } template -static inline uint64_t read_csr_mideleg(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mideleg(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mideleg(), status); } template -static inline uint64_t read_csr_mie(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mie(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mie(), status); } template -static inline uint64_t read_csr_mtvec(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mtvec(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mtvec(), status); } template -static inline uint64_t read_csr_mcounteren(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mcounteren(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mcounteren(), status); } template -static inline uint64_t read_csr_mscratch(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mscratch(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mscratch(), status); } template -static inline uint64_t read_csr_mepc(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mepc(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mepc(), status); } template -static inline uint64_t read_csr_mcause(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mcause(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mcause(), 
status); } template -static inline uint64_t read_csr_mtval(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mtval(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mtval(), status); } template -static inline uint64_t read_csr_mip(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mip(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mip(), status); } @@ -1658,29 +1707,29 @@ static inline uint64_t read_csr_mcycle(uint64_t mcycle, bool *status) { } template -static inline uint64_t read_csr_minstret(STATE_ACCESS &a, uint64_t mcycle, bool *status) { +static inline uint64_t read_csr_minstret(STATE_ACCESS a, uint64_t mcycle, bool *status) { const uint64_t icycleinstret = a.read_icycleinstret(); const uint64_t minstret = mcycle - icycleinstret; return read_csr_success(minstret, status); } template -static inline uint64_t read_csr_mvendorid(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mvendorid(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mvendorid(), status); } template -static inline uint64_t read_csr_marchid(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_marchid(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_marchid(), status); } template -static inline uint64_t read_csr_mimpid(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_mimpid(STATE_ACCESS a, bool *status) { return read_csr_success(a.read_mimpid(), status); } template -static inline uint64_t read_csr_fflags(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_fflags(STATE_ACCESS a, bool *status) { // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { return read_csr_fail(status); @@ -1690,7 +1739,7 @@ static inline uint64_t read_csr_fflags(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_frm(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_frm(STATE_ACCESS a, bool *status) { // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { return read_csr_fail(status); @@ -1700,7 +1749,7 @@ static inline uint64_t read_csr_frm(STATE_ACCESS &a, bool *status) { } template -static inline uint64_t read_csr_fcsr(STATE_ACCESS &a, bool *status) { +static inline uint64_t read_csr_fcsr(STATE_ACCESS a, bool *status) { // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { return read_csr_fail(status); @@ -1715,7 +1764,7 @@ static inline uint64_t read_csr_fcsr(STATE_ACCESS &a, bool *status) { /// \returns Register value. /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static NO_INLINE uint64_t read_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_address csraddr, bool *status) { +static NO_INLINE uint64_t read_csr(STATE_ACCESS a, uint64_t mcycle, CSR_address csraddr, bool *status) { if (unlikely(csr_priv(csraddr) > a.read_iflags_PRV())) { return read_csr_fail(status); } @@ -1876,20 +1925,20 @@ static NO_INLINE uint64_t read_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_address } template -static execute_status write_csr_sstatus(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sstatus(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); return write_csr_mstatus(a, (mstatus & ~SSTATUS_W_MASK) | (val & SSTATUS_W_MASK)); } template -static execute_status write_csr_senvcfg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_senvcfg(STATE_ACCESS a, uint64_t val) { const uint64_t senvcfg = a.read_senvcfg(); a.write_senvcfg((senvcfg & ~SENVCFG_W_MASK) | (val & SENVCFG_W_MASK)); return execute_status::success; } template -static execute_status write_csr_sie(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sie(STATE_ACCESS a, uint64_t val) { uint64_t mie = a.read_mie(); const uint64_t mask = a.read_mideleg(); mie = (mie & ~mask) | (val & mask); @@ -1898,43 +1947,43 @@ static execute_status write_csr_sie(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_stvec(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_stvec(STATE_ACCESS a, uint64_t val) { a.write_stvec(val & ~1); return execute_status::success; } template -static execute_status write_csr_scounteren(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_scounteren(STATE_ACCESS a, uint64_t val) { a.write_scounteren(val & SCOUNTEREN_RW_MASK); return execute_status::success; } template -static execute_status write_csr_sscratch(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sscratch(STATE_ACCESS a, uint64_t val) { a.write_sscratch(val); return 
execute_status::success; } template -static execute_status write_csr_sepc(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sepc(STATE_ACCESS a, uint64_t val) { a.write_sepc(val & ~1); return execute_status::success; } template -static execute_status write_csr_scause(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_scause(STATE_ACCESS a, uint64_t val) { a.write_scause(val); return execute_status::success; } template -static execute_status write_csr_stval(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_stval(STATE_ACCESS a, uint64_t val) { a.write_stval(val); return execute_status::success; } template -static execute_status write_csr_sip(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_sip(STATE_ACCESS a, uint64_t val) { const uint64_t mask = a.read_mideleg(); uint64_t mip = a.read_mip(); mip = (mip & ~mask) | (val & mask); @@ -1943,7 +1992,7 @@ static execute_status write_csr_sip(STATE_ACCESS &a, uint64_t val) { } template -static NO_INLINE execute_status write_csr_satp(STATE_ACCESS &a, uint64_t val) { +static NO_INLINE execute_status write_csr_satp(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); auto priv = a.read_iflags_PRV(); @@ -1996,7 +2045,7 @@ static NO_INLINE execute_status write_csr_satp(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mstatus(STATE_ACCESS &a, uint64_t val) { +static NO_INLINE execute_status write_csr_mstatus(STATE_ACCESS a, uint64_t val) { const uint64_t old_mstatus = a.read_mstatus() & MSTATUS_R_MASK; // M-mode software can determine whether a privilege mode is implemented @@ -2069,7 +2118,7 @@ static execute_status write_csr_mstatus(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_menvcfg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_menvcfg(STATE_ACCESS a, uint64_t val) { uint64_t menvcfg = a.read_menvcfg() & MENVCFG_R_MASK; // Modify only bits that can be written to 
@@ -2080,7 +2129,7 @@ static execute_status write_csr_menvcfg(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_medeleg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_medeleg(STATE_ACCESS a, uint64_t val) { // For exceptions that cannot occur in less privileged modes, // the corresponding medeleg bits should be read-only zero a.write_medeleg((a.read_medeleg() & ~MEDELEG_W_MASK) | (val & MEDELEG_W_MASK)); @@ -2088,7 +2137,7 @@ static execute_status write_csr_medeleg(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mideleg(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mideleg(STATE_ACCESS a, uint64_t val) { const uint64_t mask = MIP_SSIP_MASK | MIP_STIP_MASK | MIP_SEIP_MASK; uint64_t mideleg = a.read_mideleg(); mideleg = (mideleg & ~mask) | (val & mask); @@ -2097,7 +2146,7 @@ static execute_status write_csr_mideleg(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mie(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mie(STATE_ACCESS a, uint64_t val) { const uint64_t mask = MIP_MSIP_MASK | MIP_MTIP_MASK | MIP_MEIP_MASK | MIP_SSIP_MASK | MIP_STIP_MASK | MIP_SEIP_MASK; uint64_t mie = a.read_mie(); mie = (mie & ~mask) | (val & mask); @@ -2106,19 +2155,19 @@ static execute_status write_csr_mie(STATE_ACCESS &a, uint64_t val) { } template -static execute_status write_csr_mtvec(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mtvec(STATE_ACCESS a, uint64_t val) { a.write_mtvec(val & ~1); return execute_status::success; } template -static execute_status write_csr_mcounteren(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mcounteren(STATE_ACCESS a, uint64_t val) { a.write_mcounteren(val & MCOUNTEREN_RW_MASK); return execute_status::success; } template -static execute_status write_csr_minstret(STATE_ACCESS &a, uint64_t mcycle, uint64_t val) { +static execute_status write_csr_minstret(STATE_ACCESS a, uint64_t 
mcycle, uint64_t val) { // Note that mcycle will only be incremented after the instruction is executed, // but we have to compute this in advance const uint64_t icycleinstret = (mcycle + 1) - val; @@ -2127,7 +2176,7 @@ static execute_status write_csr_minstret(STATE_ACCESS &a, uint64_t mcycle, uint6 } template -static execute_status write_csr_mcycle(STATE_ACCESS & /*a*/, uint64_t /*val*/) { +static execute_status write_csr_mcycle(STATE_ACCESS /*a*/, uint64_t /*val*/) { // We can't allow writes to mcycle because we use it to measure the progress in machine execution. // The specs say it is an MRW CSR, read-writeable in M-mode. // BBL enables all counters in both M- and S-modes. @@ -2138,31 +2187,31 @@ static execute_status write_csr_mcycle(STATE_ACCESS & /*a*/, uint64_t /*val*/) { } template -static execute_status write_csr_mscratch(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mscratch(STATE_ACCESS a, uint64_t val) { a.write_mscratch(val); return execute_status::success; } template -static execute_status write_csr_mepc(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mepc(STATE_ACCESS a, uint64_t val) { a.write_mepc(val & ~1); return execute_status::success; } template -static execute_status write_csr_mcause(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mcause(STATE_ACCESS a, uint64_t val) { a.write_mcause(val); return execute_status::success; } template -static execute_status write_csr_mtval(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mtval(STATE_ACCESS a, uint64_t val) { a.write_mtval(val); return execute_status::success; } template -static execute_status write_csr_mip(STATE_ACCESS &a, uint64_t val) { +static execute_status write_csr_mip(STATE_ACCESS a, uint64_t val) { const uint64_t mask = MIP_SSIP_MASK | MIP_STIP_MASK | MIP_SEIP_MASK; auto mip = a.read_mip(); mip = (mip & ~mask) | (val & mask); @@ -2171,7 +2220,7 @@ static execute_status write_csr_mip(STATE_ACCESS &a, uint64_t val) { 
} template -static inline execute_status write_csr_fflags(STATE_ACCESS &a, uint64_t val) { +static inline execute_status write_csr_fflags(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((mstatus & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { @@ -2183,7 +2232,7 @@ static inline execute_status write_csr_fflags(STATE_ACCESS &a, uint64_t val) { } template -static inline execute_status write_csr_frm(STATE_ACCESS &a, uint64_t val) { +static inline execute_status write_csr_frm(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((mstatus & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { @@ -2195,7 +2244,7 @@ static inline execute_status write_csr_frm(STATE_ACCESS &a, uint64_t val) { } template -static inline execute_status write_csr_fcsr(STATE_ACCESS &a, uint64_t val) { +static inline execute_status write_csr_fcsr(STATE_ACCESS a, uint64_t val) { const uint64_t mstatus = a.read_mstatus(); // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. if (unlikely((mstatus & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { @@ -2213,7 +2262,7 @@ static inline execute_status write_csr_fcsr(STATE_ACCESS &a, uint64_t val) { /// \returns The status of the operation (true for success, false otherwise). /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static NO_INLINE execute_status write_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_address csraddr, uint64_t val) { +static NO_INLINE execute_status write_csr(STATE_ACCESS a, uint64_t mcycle, CSR_address csraddr, uint64_t val) { #if defined(DUMP_CSR) fprintf(stderr, "csr_write: csr=0x%03x val=0x", static_cast(csraddr)); print_uint64_t(val); @@ -2367,7 +2416,7 @@ static NO_INLINE execute_status write_csr(STATE_ACCESS &a, uint64_t mcycle, CSR_ } template -static FORCE_INLINE execute_status execute_csr_RW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_csr_RW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const RS1VAL &rs1val) { auto csraddr = static_cast(insn_I_get_uimm(insn)); // Try to read old CSR value @@ -2400,22 +2449,22 @@ static FORCE_INLINE execute_status execute_csr_RW(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the CSRRW instruction. template -static FORCE_INLINE execute_status execute_CSRRW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrw"); return execute_csr_RW(a, pc, mcycle, insn, - [](STATE_ACCESS &a, uint32_t insn) -> uint64_t { return a.read_x(insn_get_rs1(insn)); }); + [](STATE_ACCESS a, uint32_t insn) -> uint64_t { return a.read_x(insn_get_rs1(insn)); }); } /// \brief Implementation of the CSRRWI instruction. 
template -static FORCE_INLINE execute_status execute_CSRRWI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRWI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrwi"); return execute_csr_RW(a, pc, mcycle, insn, - [](STATE_ACCESS &, uint32_t insn) -> uint64_t { return static_cast(insn_get_rs1(insn)); }); + [](STATE_ACCESS, uint32_t insn) -> uint64_t { return static_cast(insn_get_rs1(insn)); }); } template -static FORCE_INLINE execute_status execute_csr_SC(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_csr_SC(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const F &f) { auto csraddr = static_cast(insn_I_get_uimm(insn)); // Try to read old CSR value @@ -2450,20 +2499,20 @@ static FORCE_INLINE execute_status execute_csr_SC(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the CSRRS instruction. template -static FORCE_INLINE execute_status execute_CSRRS(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrs"); return execute_csr_SC(a, pc, mcycle, insn, [](uint64_t csr, uint64_t rs1) -> uint64_t { return csr | rs1; }); } /// \brief Implementation of the CSRRC instruction. 
template -static FORCE_INLINE execute_status execute_CSRRC(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRC(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrc"); return execute_csr_SC(a, pc, mcycle, insn, [](uint64_t csr, uint64_t rs1) -> uint64_t { return csr & ~rs1; }); } template -static FORCE_INLINE execute_status execute_csr_SCI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn, +static FORCE_INLINE execute_status execute_csr_SCI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn, const F &f) { auto csraddr = static_cast(insn_I_get_uimm(insn)); // Try to read old CSR value @@ -2495,21 +2544,21 @@ static FORCE_INLINE execute_status execute_csr_SCI(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the CSRRSI instruction. template -static FORCE_INLINE execute_status execute_CSRRSI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRSI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrsi"); return execute_csr_SCI(a, pc, mcycle, insn, [](uint64_t csr, uint32_t rs1) -> uint64_t { return csr | rs1; }); } /// \brief Implementation of the CSRRCI instruction. template -static FORCE_INLINE execute_status execute_CSRRCI(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_CSRRCI(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "csrrci"); return execute_csr_SCI(a, pc, mcycle, insn, [](uint64_t csr, uint32_t rs1) -> uint64_t { return csr & ~rs1; }); } /// \brief Implementation of the ECALL instruction. 
template -static FORCE_INLINE execute_status execute_ECALL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_ECALL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "ecall"); auto priv = a.read_iflags_PRV(); pc = raise_exception(a, pc, MCAUSE_ECALL_BASE + priv, 0); @@ -2518,7 +2567,7 @@ static FORCE_INLINE execute_status execute_ECALL(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the EBREAK instruction. template -static FORCE_INLINE execute_status execute_EBREAK(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_EBREAK(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "ebreak"); pc = raise_exception(a, pc, MCAUSE_BREAKPOINT, pc); return execute_status::failure; @@ -2526,7 +2575,7 @@ static FORCE_INLINE execute_status execute_EBREAK(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the SRET instruction. template -static FORCE_INLINE execute_status execute_SRET(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_SRET(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sret"); auto priv = a.read_iflags_PRV(); uint64_t mstatus = a.read_mstatus(); @@ -2556,7 +2605,7 @@ static FORCE_INLINE execute_status execute_SRET(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the MRET instruction. template -static FORCE_INLINE execute_status execute_MRET(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_MRET(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mret"); auto priv = a.read_iflags_PRV(); if (unlikely(priv < PRV_M)) { @@ -2588,7 +2637,7 @@ static FORCE_INLINE execute_status execute_MRET(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the WFI instruction. /// \details This function is outlined to minimize host CPU code cache pressure. 
template -static FORCE_INLINE execute_status execute_WFI(STATE_ACCESS &a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_WFI(STATE_ACCESS a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { dump_insn(a, pc, insn, "wfi"); // Check privileges and do nothing else auto priv = a.read_iflags_PRV(); @@ -2614,7 +2663,7 @@ static FORCE_INLINE execute_status execute_WFI(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the FENCE instruction. template -static FORCE_INLINE execute_status execute_FENCE(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FENCE(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { INC_COUNTER(a.get_statistics(), fence); dump_insn(a, pc, insn, "fence"); // Really do nothing @@ -2623,7 +2672,7 @@ static FORCE_INLINE execute_status execute_FENCE(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the FENCE.I instruction. template -static FORCE_INLINE execute_status execute_FENCE_I(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FENCE_I(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { INC_COUNTER(a.get_statistics(), fence_i); dump_insn(a, pc, insn, "fence.i"); // Really do nothing @@ -2631,11 +2680,8 @@ static FORCE_INLINE execute_status execute_FENCE_I(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_arithmetic(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_arithmetic(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return advance_to_next_insn(a, pc); - } // Ensure rs1 and rs2 are loaded in order: do not nest with call to f() as // the order of evaluation of arguments in a function call is undefined. 
const uint64_t rs1 = a.read_x(insn_get_rs1(insn)); @@ -2646,9 +2692,12 @@ static FORCE_INLINE execute_status execute_arithmetic(STATE_ACCESS &a, uint64_t } /// \brief Implementation of the ADD instruction. -template -static FORCE_INLINE execute_status execute_ADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "add"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { uint64_t val = 0; __builtin_add_overflow(rs1, rs2, &val); @@ -2657,9 +2706,12 @@ static FORCE_INLINE execute_status execute_ADD(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the SUB instruction. -template -static FORCE_INLINE execute_status execute_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sub"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { uint64_t val = 0; __builtin_sub_overflow(rs1, rs2, &val); @@ -2668,70 +2720,97 @@ static FORCE_INLINE execute_status execute_SUB(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the SLL instruction. -template -static FORCE_INLINE execute_status execute_SLL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sll"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 << (rs2 & (XLEN - 1)); }); } /// \brief Implementation of the SLT instruction. 
-template -static FORCE_INLINE execute_status execute_SLT(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLT(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "slt"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return static_cast(rs1) < static_cast(rs2); }); } /// \brief Implementation of the SLTU instruction. -template -static FORCE_INLINE execute_status execute_SLTU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLTU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sltu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 < rs2; }); } /// \brief Implementation of the XOR instruction. -template -static FORCE_INLINE execute_status execute_XOR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_XOR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "xor"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 ^ rs2; }); } /// \brief Implementation of the SRL instruction. -template -static FORCE_INLINE execute_status execute_SRL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srl"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 >> (rs2 & (XLEN - 1)); }); } /// \brief Implementation of the SRA instruction. 
-template -static FORCE_INLINE execute_status execute_SRA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRA(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sra"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return static_cast(static_cast(rs1) >> (rs2 & (XLEN - 1))); }); } /// \brief Implementation of the OR instruction. -template -static FORCE_INLINE execute_status execute_OR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_OR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "or"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 | rs2; }); } /// \brief Implementation of the AND instruction. -template -static FORCE_INLINE execute_status execute_AND(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_AND(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "and"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return rs1 & rs2; }); } /// \brief Implementation of the MUL instruction. 
-template -static FORCE_INLINE execute_status execute_MUL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MUL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mul"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2742,9 +2821,12 @@ static FORCE_INLINE execute_status execute_MUL(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the MULH instruction. -template -static FORCE_INLINE execute_status execute_MULH(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULH(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulh"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2753,9 +2835,12 @@ static FORCE_INLINE execute_status execute_MULH(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the MULHSU instruction. -template -static FORCE_INLINE execute_status execute_MULHSU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULHSU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulhsu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); return static_cast( @@ -2764,18 +2849,24 @@ static FORCE_INLINE execute_status execute_MULHSU(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the MULHU instruction. 
-template -static FORCE_INLINE execute_status execute_MULHU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_MULHU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "mulhu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { return static_cast((static_cast(rs1) * static_cast(rs2)) >> 64); }); } /// \brief Implementation of the DIV instruction. -template -static FORCE_INLINE execute_status execute_DIV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIV(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "div"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2790,9 +2881,12 @@ static FORCE_INLINE execute_status execute_DIV(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the DIVU instruction. -template -static FORCE_INLINE execute_status execute_DIVU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_DIVU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "divu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { if (unlikely(rs2 == 0)) { return static_cast(-1); @@ -2802,9 +2896,12 @@ static FORCE_INLINE execute_status execute_DIVU(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the REM instruction. 
-template -static FORCE_INLINE execute_status execute_REM(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REM(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "rem"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { auto srs1 = static_cast(rs1); auto srs2 = static_cast(rs2); @@ -2819,9 +2916,12 @@ static FORCE_INLINE execute_status execute_REM(STATE_ACCESS &a, uint64_t &pc, ui } /// \brief Implementation of the REMU instruction. -template -static FORCE_INLINE execute_status execute_REMU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_REMU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "remu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> uint64_t { if (unlikely(rs2 == 0)) { return rs1; @@ -2831,12 +2931,9 @@ static FORCE_INLINE execute_status execute_REMU(STATE_ACCESS &a, uint64_t &pc, u } template -static FORCE_INLINE execute_status execute_arithmetic_immediate(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, +static FORCE_INLINE execute_status execute_arithmetic_immediate(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return advance_to_next_insn(a, pc); - } const uint64_t rs1 = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_I_get_imm(insn); a.write_x(rd, f(rs1, imm)); @@ -2844,26 +2941,35 @@ static FORCE_INLINE execute_status execute_arithmetic_immediate(STATE_ACCESS &a, } /// \brief Implementation of the SRLI instruction. 
-template -static FORCE_INLINE execute_status execute_SRLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srli"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 >> (imm & (XLEN - 1)); }); } /// \brief Implementation of the SRAI instruction. -template -static FORCE_INLINE execute_status execute_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRAI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srai"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return static_cast(static_cast(rs1) >> (imm & (XLEN - 1))); }); } /// \brief Implementation of the ADDI instruction. -template -static FORCE_INLINE execute_status execute_ADDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "addi"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { int64_t val = 0; __builtin_add_overflow(static_cast(rs1), static_cast(imm), &val); @@ -2872,49 +2978,67 @@ static FORCE_INLINE execute_status execute_ADDI(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the SLTI instruction. 
-template -static FORCE_INLINE execute_status execute_SLTI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLTI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "slti"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return static_cast(rs1) < static_cast(imm); }); } /// \brief Implementation of the SLTIU instruction. -template -static FORCE_INLINE execute_status execute_SLTIU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLTIU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sltiu"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 < static_cast(imm); }); } /// \brief Implementation of the XORI instruction. -template -static FORCE_INLINE execute_status execute_XORI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_XORI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "xori"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 ^ imm; }); } /// \brief Implementation of the ORI instruction. 
-template -static FORCE_INLINE execute_status execute_ORI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ORI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "ori"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 | imm; }); } /// \brief Implementation of the ANDI instruction. -template -static FORCE_INLINE execute_status execute_ANDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ANDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "andi"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { return rs1 & imm; }); } /// \brief Implementation of the SLLI instruction. -template -static FORCE_INLINE execute_status execute_SLLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely((insn & (0b111111 << 26)) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "slli"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { // No need to mask lower 6 bits in imm because of the if condition a above // We do it anyway here to prevent problems if this code is moved @@ -2923,9 +3047,12 @@ static FORCE_INLINE execute_status execute_SLLI(STATE_ACCESS &a, uint64_t &pc, u } /// \brief Implementation of the ADDIW instruction. 
-template -static FORCE_INLINE execute_status execute_ADDIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_ADDIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "addiw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { int32_t val = 0; __builtin_add_overflow(static_cast(rs1), imm, &val); @@ -2934,12 +3061,15 @@ static FORCE_INLINE execute_status execute_ADDIW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the SLLIW instruction. -template -static FORCE_INLINE execute_status execute_SLLIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SLLIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { if (unlikely(insn_get_funct7(insn) != 0)) { return raise_illegal_insn_exception(a, pc, insn); } dump_insn(a, pc, insn, "slliw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { // No need to mask lower 5 bits in imm because of the if condition a above // We do it anyway here to prevent problems if this code is moved @@ -2949,9 +3079,12 @@ static FORCE_INLINE execute_status execute_SLLIW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the SRLIW instruction. 
-template -static FORCE_INLINE execute_status execute_SRLIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRLIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "srliw"); + if constexpr (rd_kind == rd_kind::x0) { + return advance_to_next_insn(a, pc); + } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { // No need to mask lower 5 bits in imm because of funct7 test in caller // We do it anyway here to prevent problems if this code is moved @@ -2961,13 +3094,16 @@ static FORCE_INLINE execute_status execute_SRLIW(STATE_ACCESS &a, uint64_t &pc, } /// \brief Implementation of the SRAIW instruction. -template -static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "sraiw"); - // When rd=0 the instruction is a HINT, and we consider it as a soft yield when rs1 == 31 - if (unlikely(insn_get_rd(insn) == 0 && insn_get_rs1(insn) == 31 && a.get_soft_yield())) { - // Force the main interpreter loop to break - return advance_to_next_insn(a, pc, execute_status::success_and_yield); + if constexpr (rd_kind == rd_kind::x0) { + // When rd=0 the instruction is a HINT, and we consider it as a soft yield when rs1 == 31 + if (unlikely(insn_get_rs1(insn) == 31 && a.get_soft_yield())) { + // Force the main interpreter loop to break + return advance_to_next_insn(a, pc, execute_status::success_and_yield); + } + return advance_to_next_insn(a, pc); } return execute_arithmetic_immediate(a, pc, insn, [](uint64_t rs1, int32_t imm) -> uint64_t { const int32_t rs1w = static_cast(rs1) >> (imm & 0b11111); @@ -2976,47 +3112,50 @@ static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_S(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, 
uint32_t insn) { +static FORCE_INLINE execute_status execute_S(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_S_get_imm(insn); const uint64_t val = a.read_x(insn_get_rs2(insn)); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn(a, pc, status); } - return advance_to_next_insn(a, pc, status); + return advance_to_next_insn(a, pc); } /// \brief Implementation of the SB instruction. template -static FORCE_INLINE execute_status execute_SB(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SB(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sb"); return execute_S(a, pc, mcycle, insn); } /// \brief Implementation of the SH instruction. template -static FORCE_INLINE execute_status execute_SH(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SH(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sh"); return execute_S(a, pc, mcycle, insn); } /// \brief Implementation of the SW instruction. template -static FORCE_INLINE execute_status execute_SW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sw"); return execute_S(a, pc, mcycle, insn); } /// \brief Implementation of the SD instruction. 
template -static FORCE_INLINE execute_status execute_SD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_SD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "sd"); return execute_S(a, pc, mcycle, insn); } -template -static FORCE_INLINE execute_status execute_L(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_L(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_I_get_imm(insn); T val = 0; @@ -3025,7 +3164,7 @@ static FORCE_INLINE execute_status execute_L(STATE_ACCESS &a, uint64_t &pc, uint } const uint32_t rd = insn_get_rd(insn); // don't write x0 - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return advance_to_next_insn(a, pc); } // This static branch is eliminated by the compiler @@ -3038,56 +3177,56 @@ static FORCE_INLINE execute_status execute_L(STATE_ACCESS &a, uint64_t &pc, uint } /// \brief Implementation of the LB instruction. -template -static FORCE_INLINE execute_status execute_LB(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LB(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lb"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LH instruction. -template -static FORCE_INLINE execute_status execute_LH(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LH(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lh"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LW instruction. 
-template -static FORCE_INLINE execute_status execute_LW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lw"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LD instruction. -template -static FORCE_INLINE execute_status execute_LD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "ld"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LBU instruction. -template -static FORCE_INLINE execute_status execute_LBU(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LBU(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lbu"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LHU instruction. -template -static FORCE_INLINE execute_status execute_LHU(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LHU(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lhu"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } /// \brief Implementation of the LWU instruction. 
-template -static FORCE_INLINE execute_status execute_LWU(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LWU(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "lwu"); - return execute_L(a, pc, mcycle, insn); + return execute_L(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_branch(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_branch(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t rs1 = a.read_x(insn_get_rs1(insn)); const uint64_t rs2 = a.read_x(insn_get_rs2(insn)); if (f(rs1, rs2)) { @@ -3099,21 +3238,21 @@ static FORCE_INLINE execute_status execute_branch(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the BEQ instruction. template -static FORCE_INLINE execute_status execute_BEQ(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BEQ(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "beq"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 == rs2; }); } /// \brief Implementation of the BNE instruction. template -static FORCE_INLINE execute_status execute_BNE(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BNE(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bne"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 != rs2; }); } /// \brief Implementation of the BLT instruction. 
template -static FORCE_INLINE execute_status execute_BLT(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BLT(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "blt"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return static_cast(rs1) < static_cast(rs2); }); @@ -3121,7 +3260,7 @@ static FORCE_INLINE execute_status execute_BLT(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the BGE instruction. template -static FORCE_INLINE execute_status execute_BGE(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BGE(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bge"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return static_cast(rs1) >= static_cast(rs2); }); @@ -3129,64 +3268,64 @@ static FORCE_INLINE execute_status execute_BGE(STATE_ACCESS &a, uint64_t &pc, ui /// \brief Implementation of the BLTU instruction. template -static FORCE_INLINE execute_status execute_BLTU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BLTU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bltu"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 < rs2; }); } /// \brief Implementation of the BGEU instruction. template -static FORCE_INLINE execute_status execute_BGEU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_BGEU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "bgeu"); return execute_branch(a, pc, insn, [](uint64_t rs1, uint64_t rs2) -> bool { return rs1 >= rs2; }); } /// \brief Implementation of the LUI instruction. 
-template -static FORCE_INLINE execute_status execute_LUI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_LUI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "lui"); - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return advance_to_next_insn(a, pc); } + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, insn_U_get_imm(insn)); return advance_to_next_insn(a, pc); } /// \brief Implementation of the AUIPC instruction. -template -static FORCE_INLINE execute_status execute_AUIPC(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_AUIPC(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "auipc"); - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return advance_to_next_insn(a, pc); } + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, pc + insn_U_get_imm(insn)); return advance_to_next_insn(a, pc); } /// \brief Implementation of the JAL instruction. -template -static FORCE_INLINE execute_status execute_JAL(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_JAL(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "jal"); const uint64_t new_pc = pc + insn_J_get_imm(insn); - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { + if constexpr (rd_kind == rd_kind::x0) { return execute_jump(a, pc, new_pc); } + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, pc + 4); return execute_jump(a, pc, new_pc); } /// \brief Implementation of the JALR instruction. 
-template -static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +template +static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "jalr"); const uint64_t val = pc + 4; const uint64_t new_pc = static_cast(a.read_x(insn_get_rs1(insn)) + insn_I_get_imm(insn)) & ~static_cast(1); const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd != 0)) { + if constexpr (rd_kind != rd_kind::x0) { a.write_x(rd, val); return execute_jump(a, pc, new_pc); } @@ -3196,7 +3335,7 @@ static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the SFENCE.VMA instruction. /// \details This function is outlined to minimize host CPU code cache pressure. template -static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { // rs1 and rs2 are arbitrary, rest is set if (unlikely((insn & 0b11111110000000000111111111111111) != 0b00010010000000000000000001110011)) { return raise_illegal_insn_exception(a, pc, insn); @@ -3248,32 +3387,34 @@ static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS &a, uint64_t return advance_to_next_insn(a, pc, execute_status::success_and_flush_fetch); } -template -static FORCE_INLINE execute_status execute_SRLI_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7_sr1(insn))) { - case insn_SRLI_SRAI_funct7_sr1::SRLI: - return execute_SRLI(a, pc, insn); - case insn_SRLI_SRAI_funct7_sr1::SRAI: - return execute_SRAI(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRLI_SRAI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7_sr1 = 
static_cast(insn_get_funct7_sr1(insn)); + if (funct7_sr1 == insn_SRLI_SRAI_funct7_sr1::SRLI) { + return execute_SRLI(a, pc, insn); + } + if (funct7_sr1 == insn_SRLI_SRAI_funct7_sr1::SRAI) { + return execute_SRAI(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SRLIW_SRAIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRLIW_SRAIW_funct7::SRLIW: - return execute_SRLIW(a, pc, insn); - case insn_SRLIW_SRAIW_funct7::SRAIW: - return execute_SRAIW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRLIW_SRAIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRLIW_SRAIW_funct7::SRLIW) { + return execute_SRLIW(a, pc, insn); + } + if (funct7 == insn_SRLIW_SRAIW_funct7::SRAIW) { + return execute_SRAIW(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template -static FORCE_INLINE execute_status execute_AMO_W(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMO_W(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { switch (static_cast(insn_get_funct7_sr2(insn))) { case insn_AMO_funct7_sr2::AMOADD: return execute_AMOADD_W(a, pc, mcycle, insn); @@ -3303,7 +3444,7 @@ static FORCE_INLINE execute_status execute_AMO_W(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { switch (static_cast(insn_get_funct7_sr2(insn))) { case insn_AMO_funct7_sr2::AMOADD: return execute_AMOADD_D(a, pc, mcycle, insn); 
@@ -3332,136 +3473,150 @@ static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS &a, uint64_t &pc, } } -template -static FORCE_INLINE execute_status execute_ADD_MUL_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_ADD_MUL_SUB_funct7::ADD: - return execute_ADD(a, pc, insn); - case insn_ADD_MUL_SUB_funct7::MUL: - return execute_MUL(a, pc, insn); - case insn_ADD_MUL_SUB_funct7::SUB: - return execute_SUB(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_ADD_MUL_SUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_ADD_MUL_SUB_funct7::ADD) { + return execute_ADD(a, pc, insn); + } + if (funct7 == insn_ADD_MUL_SUB_funct7::MUL) { + return execute_MUL(a, pc, insn); + } + if (funct7 == insn_ADD_MUL_SUB_funct7::SUB) { + return execute_SUB(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SLL_MULH(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLL_MULH_funct7::SLL: - return execute_SLL(a, pc, insn); - case insn_SLL_MULH_funct7::MULH: - return execute_MULH(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SLL_MULH(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLL_MULH_funct7::SLL) { + return execute_SLL(a, pc, insn); + } + if (funct7 == insn_SLL_MULH_funct7::MULH) { + return execute_MULH(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static 
FORCE_INLINE execute_status execute_SLT_MULHSU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLT_MULHSU_funct7::SLT: - return execute_SLT(a, pc, insn); - case insn_SLT_MULHSU_funct7::MULHSU: - return execute_MULHSU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SLT_MULHSU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLT_MULHSU_funct7::SLT) { + return execute_SLT(a, pc, insn); } + if (funct7 == insn_SLT_MULHSU_funct7::MULHSU) { + return execute_MULHSU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SLTU_MULHU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLTU_MULHU_funct7::SLTU: - return execute_SLTU(a, pc, insn); - case insn_SLTU_MULHU_funct7::MULHU: - return execute_MULHU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SLTU_MULHU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLTU_MULHU_funct7::SLTU) { + return execute_SLTU(a, pc, insn); + } + if (funct7 == insn_SLTU_MULHU_funct7::MULHU) { + return execute_MULHU(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_XOR_DIV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_XOR_DIV_funct7::XOR: - return execute_XOR(a, pc, insn); - case insn_XOR_DIV_funct7::DIV: - return execute_DIV(a, 
pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_XOR_DIV(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_XOR_DIV_funct7::XOR) { + return execute_XOR(a, pc, insn); } + if (funct7 == insn_XOR_DIV_funct7::DIV) { + return execute_DIV(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SRL_DIVU_SRA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRL_DIVU_SRA_funct7::SRL: - return execute_SRL(a, pc, insn); - case insn_SRL_DIVU_SRA_funct7::DIVU: - return execute_DIVU(a, pc, insn); - case insn_SRL_DIVU_SRA_funct7::SRA: - return execute_SRA(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRL_DIVU_SRA(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRL_DIVU_SRA_funct7::SRL) { + return execute_SRL(a, pc, insn); + } + if (funct7 == insn_SRL_DIVU_SRA_funct7::SRA) { + return execute_SRA(a, pc, insn); } + if (funct7 == insn_SRL_DIVU_SRA_funct7::DIVU) { + return execute_DIVU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_OR_REM(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_OR_REM_funct7::OR: - return execute_OR(a, pc, insn); - case insn_OR_REM_funct7::REM: - return execute_REM(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status 
execute_OR_REM(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_OR_REM_funct7::OR) { + return execute_OR(a, pc, insn); } + if (funct7 == insn_OR_REM_funct7::REM) { + return execute_REM(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_AND_REMU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_AND_REMU_funct7::AND: - return execute_AND(a, pc, insn); - case insn_AND_REMU_funct7::REMU: - return execute_REMU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_AND_REMU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_AND_REMU_funct7::AND) { + return execute_AND(a, pc, insn); } + if (funct7 == insn_AND_REMU_funct7::REMU) { + return execute_REMU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_ADDW_MULW_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_ADDW_MULW_SUBW_funct7::ADDW: - return execute_ADDW(a, pc, insn); - case insn_ADDW_MULW_SUBW_funct7::MULW: - return execute_MULW(a, pc, insn); - case insn_ADDW_MULW_SUBW_funct7::SUBW: - return execute_SUBW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_ADDW_MULW_SUBW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = 
static_cast(insn_get_funct7(insn)); + if (funct7 == insn_ADDW_MULW_SUBW_funct7::ADDW) { + return execute_ADDW(a, pc, insn); + } + if (funct7 == insn_ADDW_MULW_SUBW_funct7::MULW) { + return execute_MULW(a, pc, insn); } + if (funct7 == insn_ADDW_MULW_SUBW_funct7::SUBW) { + return execute_SUBW(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } -template -static FORCE_INLINE execute_status execute_SRLW_DIVUW_SRAW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRLW_DIVUW_SRAW_funct7::SRLW: - return execute_SRLW(a, pc, insn); - case insn_SRLW_DIVUW_SRAW_funct7::DIVUW: - return execute_DIVUW(a, pc, insn); - case insn_SRLW_DIVUW_SRAW_funct7::SRAW: - return execute_SRAW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); +template +static FORCE_INLINE execute_status execute_SRLW_DIVUW_SRAW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::SRLW) { + return execute_SRLW(a, pc, insn); + } + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::DIVUW) { + return execute_DIVUW(a, pc, insn); + } + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::SRAW) { + return execute_SRAW(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template -static FORCE_INLINE execute_status execute_privileged(STATE_ACCESS &a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_privileged(STATE_ACCESS a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { switch (static_cast(insn)) { case insn_privileged::ECALL: return execute_ECALL(a, pc, insn); @@ -3521,7 +3676,7 @@ static inline T float_unbox(uint64_t val) { } template -static FORCE_INLINE execute_status execute_float_ternary_op_rm(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, +static FORCE_INLINE execute_status 
execute_float_ternary_op_rm(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn @@ -3543,8 +3698,7 @@ static FORCE_INLINE execute_status execute_float_ternary_op_rm(STATE_ACCESS &a, } template -static FORCE_INLINE execute_status execute_float_binary_op_rm(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, - const F &f) { +static FORCE_INLINE execute_status execute_float_binary_op_rm(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -3564,7 +3718,7 @@ static FORCE_INLINE execute_status execute_float_binary_op_rm(STATE_ACCESS &a, u } template -static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // Unary operation should have rs2 set to 0 if (unlikely(insn_get_rs2(insn) != 0)) { @@ -3587,33 +3741,44 @@ static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS &a, ui } template -static FORCE_INLINE execute_status execute_FS(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_S_get_imm(insn); // A narrower n-bit transfer out of the floating-point // registers will transfer the lower n bits of the register ignoring the upper FLEN−n bits. 
T val = static_cast(a.read_f(insn_get_rs2(insn))); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn(a, pc, status); } - return advance_to_next_insn(a, pc, status); + return advance_to_next_insn(a, pc); } template -static FORCE_INLINE execute_status execute_FSW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "fsw"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FS(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FSD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "fsd"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FS(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FL(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FL(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { // Loads the float value from virtual memory const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); const int32_t imm = insn_I_get_imm(insn); @@ -3629,19 +3794,27 @@ static FORCE_INLINE execute_status execute_FL(STATE_ACCESS &a, uint64_t &pc, uin } template -static FORCE_INLINE execute_status execute_FLW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "flw"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FL(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FLD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { dump_insn(a, pc, insn, "fld"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } return execute_FL(a, pc, mcycle, insn); } template -static FORCE_INLINE execute_status execute_FMADD_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMADD_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmadd.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3650,7 +3823,7 @@ static FORCE_INLINE execute_status execute_FMADD_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMADD_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMADD_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmadd.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3659,7 +3832,11 @@ static FORCE_INLINE execute_status execute_FMADD_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FMADD_S(a, pc, insn); @@ -3671,7 +3848,7 @@ static FORCE_INLINE execute_status execute_FMADD(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMSUB_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMSUB_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmsub.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3680,7 +3857,7 @@ static FORCE_INLINE execute_status execute_FMSUB_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMSUB_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMSUB_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmsub.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3689,7 +3866,11 @@ static FORCE_INLINE execute_status execute_FMSUB_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMSUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMSUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FMSUB_S(a, pc, insn); @@ -3701,7 +3882,7 @@ static FORCE_INLINE execute_status execute_FMSUB(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FNMADD_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMADD_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmadd.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3711,7 +3892,7 @@ static FORCE_INLINE execute_status execute_FNMADD_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMADD_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMADD_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmadd.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3721,7 +3902,11 @@ static FORCE_INLINE execute_status execute_FNMADD_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FNMADD_S(a, pc, insn); @@ -3733,7 +3918,7 @@ static FORCE_INLINE execute_status execute_FNMADD(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FNMSUB_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMSUB_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmsub.s"); return execute_float_ternary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t s3, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3742,7 +3927,7 @@ static FORCE_INLINE execute_status execute_FNMSUB_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMSUB_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMSUB_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fnmsub.d"); return execute_float_ternary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint64_t s3, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3751,7 +3936,11 @@ static FORCE_INLINE execute_status execute_FNMSUB_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FNMSUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FNMSUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct2_0000000000000000000000000(insn))) { case insn_FM_funct2_0000000000000000000000000::S: return execute_FNMSUB_S(a, pc, insn); @@ -3763,7 +3952,7 @@ static FORCE_INLINE execute_status execute_FNMSUB(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FADD_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FADD_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fadd.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3772,7 +3961,7 @@ static FORCE_INLINE execute_status execute_FADD_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FADD_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FADD_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fadd.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3781,7 +3970,7 @@ static FORCE_INLINE execute_status execute_FADD_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FSUB_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSUB_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsub.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3790,7 +3979,7 @@ static FORCE_INLINE execute_status execute_FSUB_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FSUB_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSUB_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { 
dump_insn(a, pc, insn, "fsub.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3799,7 +3988,7 @@ static FORCE_INLINE execute_status execute_FSUB_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMUL_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMUL_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmul.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3808,7 +3997,7 @@ static FORCE_INLINE execute_status execute_FMUL_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMUL_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMUL_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmul.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t *fflags) -> uint64_t { @@ -3817,7 +4006,7 @@ static FORCE_INLINE execute_status execute_FMUL_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FDIV_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FDIV_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fdiv.s"); return execute_float_binary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -3826,7 +4015,7 @@ static FORCE_INLINE execute_status execute_FDIV_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FDIV_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FDIV_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fdiv.d"); return execute_float_binary_op_rm(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t rm, uint32_t 
*fflags) -> uint64_t { @@ -3835,7 +4024,7 @@ static FORCE_INLINE execute_status execute_FDIV_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCLASS(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCLASS(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint32_t rd = insn_get_rd(insn); if (unlikely(rd == 0)) { return advance_to_next_insn(a, pc); @@ -3847,7 +4036,7 @@ static FORCE_INLINE execute_status execute_FCLASS(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_float_binary_op(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_float_binary_op(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // We must always check if input operands are properly NaN-boxed. T s1 = float_unbox(a.read_f(insn_get_rs1(insn))); @@ -3861,7 +4050,7 @@ static FORCE_INLINE execute_status execute_float_binary_op(STATE_ACCESS &a, uint } template -static FORCE_INLINE execute_status execute_float_cmp_op(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_float_cmp_op(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // We must always check if input operands are properly NaN-boxed. 
T s1 = float_unbox(a.read_f(insn_get_rs1(insn))); @@ -3879,7 +4068,7 @@ static FORCE_INLINE execute_status execute_float_cmp_op(STATE_ACCESS &a, uint64_ } template -static FORCE_INLINE execute_status execute_FSGNJ_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJ_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnj.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, const uint32_t * /*fflags*/) -> uint32_t { @@ -3888,7 +4077,7 @@ static FORCE_INLINE execute_status execute_FSGNJ_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FSGNJN_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJN_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjn.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, const uint32_t * /*fflags*/) -> uint32_t { @@ -3897,7 +4086,7 @@ static FORCE_INLINE execute_status execute_FSGNJN_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGNJX_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJX_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjx.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, const uint32_t * /*fflags*/) -> uint32_t { @@ -3906,7 +4095,7 @@ static FORCE_INLINE execute_status execute_FSGNJX_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGN_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGN_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FSGN_funct3_000000000000::J: return execute_FSGNJ_S(a, pc, insn); @@ -3920,7 +4109,7 @@ static FORCE_INLINE execute_status execute_FSGN_S(STATE_ACCESS &a, uint64_t &pc, } 
template -static FORCE_INLINE execute_status execute_FSGNJ_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJ_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnj.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, const uint32_t * /*fflags*/) -> uint64_t { @@ -3929,7 +4118,7 @@ static FORCE_INLINE execute_status execute_FSGNJ_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FSGNJN_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJN_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjn.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, const uint32_t * /*fflags*/) -> uint64_t { @@ -3938,7 +4127,7 @@ static FORCE_INLINE execute_status execute_FSGNJN_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGNJX_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGNJX_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsgnjx.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, const uint32_t * /*fflags*/) -> uint64_t { @@ -3947,7 +4136,7 @@ static FORCE_INLINE execute_status execute_FSGNJX_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSGN_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSGN_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FSGN_funct3_000000000000::J: return execute_FSGNJ_D(a, pc, insn); @@ -3961,21 +4150,21 @@ static FORCE_INLINE execute_status execute_FSGN_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FMIN_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status 
execute_FMIN_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmin.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint32_t { return i_sfloat32::min(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMAX_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMAX_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmax.s"); return execute_float_binary_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint32_t { return i_sfloat32::max(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMINMAX_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMINMAX_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMIN_FMAX_funct3_000000000000::MIN: return execute_FMIN_S(a, pc, insn); @@ -3987,21 +4176,21 @@ static FORCE_INLINE execute_status execute_FMINMAX_S(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FMIN_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMIN_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmin.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return i_sfloat64::min(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMAX_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMAX_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmax.d"); return execute_float_binary_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return i_sfloat64::max(s1, s2, fflags); }); } template -static FORCE_INLINE execute_status execute_FMINMAX_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static 
FORCE_INLINE execute_status execute_FMINMAX_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMIN_FMAX_funct3_000000000000::MIN: return execute_FMIN_D(a, pc, insn); @@ -4013,7 +4202,7 @@ static FORCE_INLINE execute_status execute_FMINMAX_D(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_F_F(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCVT_F_F(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -4033,7 +4222,7 @@ static FORCE_INLINE execute_status execute_FCVT_F_F(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_X_F(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCVT_X_F(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -4055,7 +4244,7 @@ static FORCE_INLINE execute_status execute_FCVT_X_F(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_F_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_FCVT_F_X(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { const uint64_t fcsr = a.read_fcsr(); // The rounding mode comes from the insn const uint32_t rm = insn_get_rm(insn, fcsr); @@ -4074,7 +4263,7 @@ static FORCE_INLINE execute_status execute_FCVT_F_X(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_S_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.d"); return execute_FCVT_F_F(a, 
pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { @@ -4083,7 +4272,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_D_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.s"); return execute_FCVT_F_F(a, pc, insn, [](uint32_t s1, uint32_t /*rm*/, uint32_t *fflags) -> uint64_t { @@ -4093,7 +4282,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FSQRT_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSQRT_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsqrt.s"); return execute_float_unary_op_rm(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::sqrt(s1, static_cast(rm), fflags); @@ -4101,7 +4290,7 @@ static FORCE_INLINE execute_status execute_FSQRT_S(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FSQRT_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FSQRT_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fsqrt.d"); return execute_float_unary_op_rm(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::sqrt(s1, static_cast(rm), fflags); @@ -4109,7 +4298,7 @@ static FORCE_INLINE execute_status execute_FSQRT_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FLE_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLE_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fle.s"); return execute_float_cmp_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint64_t { return 
static_cast(i_sfloat32::le(s1, s2, fflags)); @@ -4117,7 +4306,7 @@ static FORCE_INLINE execute_status execute_FLE_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FLT_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLT_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "flt.s"); return execute_float_cmp_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat32::lt(s1, s2, fflags)); @@ -4125,7 +4314,7 @@ static FORCE_INLINE execute_status execute_FLT_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FEQ_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FEQ_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "feq.s"); return execute_float_cmp_op(a, pc, insn, [](uint32_t s1, uint32_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat32::eq(s1, s2, fflags)); @@ -4133,7 +4322,7 @@ static FORCE_INLINE execute_status execute_FEQ_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCMP_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCMP_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FCMP_funct3_000000000000::LT: return execute_FLT_S(a, pc, insn); @@ -4147,7 +4336,7 @@ static FORCE_INLINE execute_status execute_FCMP_S(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FLE_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLE_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fle.d"); return execute_float_cmp_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat64::le(s1, s2, fflags)); @@ -4155,7 +4344,7 @@ 
static FORCE_INLINE execute_status execute_FLE_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FLT_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FLT_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "flt.d"); return execute_float_cmp_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat64::lt(s1, s2, fflags)); @@ -4163,7 +4352,7 @@ static FORCE_INLINE execute_status execute_FLT_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FEQ_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FEQ_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "feq.d"); return execute_float_cmp_op(a, pc, insn, [](uint64_t s1, uint64_t s2, uint32_t *fflags) -> uint64_t { return static_cast(i_sfloat64::eq(s1, s2, fflags)); @@ -4171,7 +4360,7 @@ static FORCE_INLINE execute_status execute_FEQ_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCMP_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCMP_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FCMP_funct3_000000000000::LT: return execute_FLT_D(a, pc, insn); @@ -4185,7 +4374,7 @@ static FORCE_INLINE execute_status execute_FCMP_D(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_FCVT_W_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_W_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.w.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4195,7 +4384,7 @@ static FORCE_INLINE execute_status 
execute_FCVT_W_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_WU_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_WU_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.wu.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4205,7 +4394,7 @@ static FORCE_INLINE execute_status execute_FCVT_WU_S(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_L_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_L_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.l.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4214,7 +4403,7 @@ static FORCE_INLINE execute_status execute_FCVT_L_S(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_LU_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_LU_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.lu.s"); return execute_FCVT_X_F(a, pc, insn, [](uint32_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat32::cvt_f_i(s1, static_cast(rm), fflags); @@ -4222,7 +4411,7 @@ static FORCE_INLINE execute_status execute_FCVT_LU_S(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_W_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_W_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.w.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat64::cvt_f_i(s1, static_cast(rm), 
fflags); @@ -4232,7 +4421,7 @@ static FORCE_INLINE execute_status execute_FCVT_W_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_WU_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_WU_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.wu.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat64::cvt_f_i(s1, static_cast(rm), fflags); @@ -4242,7 +4431,7 @@ static FORCE_INLINE execute_status execute_FCVT_WU_D(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_L_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_L_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.l.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { const auto val = i_sfloat64::cvt_f_i(s1, static_cast(rm), fflags); @@ -4251,7 +4440,7 @@ static FORCE_INLINE execute_status execute_FCVT_L_D(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_LU_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_LU_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.lu.d"); return execute_FCVT_X_F(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_f_i(s1, static_cast(rm), fflags); @@ -4259,7 +4448,7 @@ static FORCE_INLINE execute_status execute_FCVT_LU_D(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_S_W(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_W(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.w"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t 
{ return i_sfloat32::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4267,7 +4456,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_W(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_S_WU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_WU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.wu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4275,7 +4464,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_WU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_S_L(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_L(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.l"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4283,7 +4472,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_L(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_S_LU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_S_LU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.s.lu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint32_t { return i_sfloat32::cvt_i_f(s1, static_cast(rm), fflags); @@ -4291,7 +4480,7 @@ static FORCE_INLINE execute_status execute_FCVT_S_LU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_D_W(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_W(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.w"); return execute_FCVT_F_X(a, 
pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4299,7 +4488,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_W(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_D_WU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_WU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.wu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4307,7 +4496,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_WU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FCVT_D_L(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_L(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.l"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(static_cast(s1), static_cast(rm), fflags); @@ -4315,7 +4504,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_L(STATE_ACCESS &a, uint64_t &p } template -static FORCE_INLINE execute_status execute_FCVT_D_LU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_D_LU(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fcvt.d.lu"); return execute_FCVT_F_X(a, pc, insn, [](uint64_t s1, uint32_t rm, uint32_t *fflags) -> uint64_t { return i_sfloat64::cvt_i_f(s1, static_cast(rm), fflags); @@ -4323,7 +4512,7 @@ static FORCE_INLINE execute_status execute_FCVT_D_LU(STATE_ACCESS &a, uint64_t & } template -static FORCE_INLINE execute_status execute_FMV_F_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_F_X(STATE_ACCESS a, uint64_t &pc, uint32_t 
insn) { // Should have funct3 set to 0 if (unlikely(insn_get_funct3(insn) != 0)) { return raise_illegal_insn_exception(a, pc, insn); @@ -4336,25 +4525,25 @@ static FORCE_INLINE execute_status execute_FMV_F_X(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMV_W_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_W_X(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.w.x"); return execute_FMV_F_X(a, pc, insn); } template -static FORCE_INLINE execute_status execute_FMV_D_X(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_D_X(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.d.x"); return execute_FMV_F_X(a, pc, insn); } template -static FORCE_INLINE execute_status execute_FCLASS_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCLASS_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fclass.s"); return execute_FCLASS(a, pc, insn, [](uint32_t s1) -> uint64_t { return i_sfloat32::fclass(s1); }); } template -static FORCE_INLINE execute_status execute_FMV_X_W(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_X_W(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.x.w"); const uint32_t rd = insn_get_rd(insn); if (unlikely(rd == 0)) { @@ -4369,7 +4558,7 @@ static FORCE_INLINE execute_status execute_FMV_X_W(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMV_FCLASS_S(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_FCLASS_S(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMV_FCLASS_funct3_000000000000::FMV: return execute_FMV_X_W(a, pc, insn); @@ -4381,13 +4570,13 @@ static FORCE_INLINE execute_status 
execute_FMV_FCLASS_S(STATE_ACCESS &a, uint64_ } template -static FORCE_INLINE execute_status execute_FCLASS_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCLASS_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fclass.d"); return execute_FCLASS(a, pc, insn, [](uint64_t s1) -> uint64_t { return i_sfloat64::fclass(s1); }); } template -static FORCE_INLINE execute_status execute_FMV_X_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_X_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "fmv.x.d"); const uint32_t rd = insn_get_rd(insn); if (unlikely(rd == 0)) { @@ -4399,7 +4588,7 @@ static FORCE_INLINE execute_status execute_FMV_X_D(STATE_ACCESS &a, uint64_t &pc } template -static FORCE_INLINE execute_status execute_FMV_FCLASS_D(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FMV_FCLASS_D(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct3_000000000000(insn))) { case insn_FMV_FCLASS_funct3_000000000000::FMV: return execute_FMV_X_D(a, pc, insn); @@ -4411,7 +4600,7 @@ static FORCE_INLINE execute_status execute_FMV_FCLASS_D(STATE_ACCESS &a, uint64_ } template -static FORCE_INLINE execute_status execute_FCVT_FMV_FCLASS(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FCVT_FMV_FCLASS(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { switch (static_cast(insn_get_funct7_rs2(insn))) { case insn_FD_funct7_rs2::FCVT_W_S: return execute_FCVT_W_S(a, pc, insn); @@ -4463,7 +4652,11 @@ static FORCE_INLINE execute_status execute_FCVT_FMV_FCLASS(STATE_ACCESS &a, uint } template -static FORCE_INLINE execute_status execute_FD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { +static FORCE_INLINE execute_status execute_FD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + // If FS is OFF, attempts to read or write the float state will cause an 
illegal instruction exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } switch (static_cast(insn_get_funct7(insn))) { case insn_FD_funct7::FADD_S: return execute_FADD_S(a, pc, insn); @@ -4503,8 +4696,8 @@ static FORCE_INLINE execute_status execute_FD(STATE_ACCESS &a, uint64_t &pc, uin } template -static FORCE_INLINE execute_status execute_C_L(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rd, - uint32_t rs1, int32_t imm) { +static FORCE_INLINE execute_status execute_C_L(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rd, uint32_t rs1, + int32_t imm) { const uint64_t vaddr = a.read_x(rs1); T val = 0; if (unlikely(!read_virtual_memory(a, pc, mcycle, vaddr + imm, &val))) { @@ -4520,19 +4713,22 @@ static FORCE_INLINE execute_status execute_C_L(STATE_ACCESS &a, uint64_t &pc, ui } template -static FORCE_INLINE execute_status execute_C_S(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, +static FORCE_INLINE execute_status execute_C_S(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, uint32_t rs1, int32_t imm) { const uint64_t vaddr = a.read_x(rs1); const uint64_t val = a.read_x(rs2); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn<2>(a, pc, status); } - return advance_to_next_insn<2>(a, pc, status); + return advance_to_next_insn<2>(a, pc); } template -static FORCE_INLINE execute_status execute_C_FL(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rd, +static FORCE_INLINE execute_status execute_C_FL(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rd, uint32_t rs1, int32_t imm) { // Loads the float value from virtual memory 
const uint64_t vaddr = a.read_x(rs1); @@ -4547,33 +4743,30 @@ static FORCE_INLINE execute_status execute_C_FL(STATE_ACCESS &a, uint64_t &pc, u } template -static FORCE_INLINE execute_status execute_C_FS(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, +static FORCE_INLINE execute_status execute_C_FS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t rs2, uint32_t rs1, int32_t imm) { const uint64_t vaddr = a.read_x(rs1); // A narrower n-bit transfer out of the floating-point // registers will transfer the lower n bits of the register ignoring the upper FLEN−n bits. T val = static_cast(a.read_f(rs2)); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); - if (unlikely(status == execute_status::failure)) { - return advance_to_raised_exception(a, pc); + if (unlikely(status != execute_status::success)) { + if (status == execute_status::failure) { + return advance_to_raised_exception(a, pc); + } + return advance_to_next_insn<2>(a, pc, status); } - return advance_to_next_insn<2>(a, pc, status); + return advance_to_next_insn<2>(a, pc); } /// \brief Implementation of the C.ADDI4SPN instruction. template -static FORCE_INLINE execute_status execute_C_ADDI4SPN(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - // "A 16-bit instruction with all bits zero is permanently reserved as an illegal instruction." 
- if (unlikely(insn == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } - dump_insn(a, pc, insn, "c.addi4spn"); - // rd cannot be zero +static FORCE_INLINE execute_status execute_C_ADDI4SPN(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addi4spn"); + // rd cannot be zero (guaranteed by RISC-V spec design) const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); + // imm cannot be zero (guaranteed by the jump table) const uint32_t imm = insn_get_CIW_imm(insn); - if (unlikely(imm == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const uint64_t rs1 = a.read_x(2); int64_t val = 0; __builtin_add_overflow(static_cast(rs1), static_cast(imm), &val); @@ -4583,8 +4776,13 @@ static FORCE_INLINE execute_status execute_C_ADDI4SPN(STATE_ACCESS &a, uint64_t /// \brief Implementation of the C.FLD instruction. template -static FORCE_INLINE execute_status execute_C_FLD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fld"); +static FORCE_INLINE execute_status execute_C_FLD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fld"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4593,8 +4791,8 @@ static FORCE_INLINE execute_status execute_C_FLD(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.LW instruction. 
template -static FORCE_INLINE execute_status execute_C_LW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.lw"); +static FORCE_INLINE execute_status execute_C_LW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.lw"); const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_C_LW_C_SW_imm(insn); @@ -4603,8 +4801,8 @@ static FORCE_INLINE execute_status execute_C_LW(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.LD instruction. template -static FORCE_INLINE execute_status execute_C_LD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.ld"); +static FORCE_INLINE execute_status execute_C_LD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.ld"); const uint32_t rd = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4613,8 +4811,13 @@ static FORCE_INLINE execute_status execute_C_LD(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.FSD instruction. template -static FORCE_INLINE execute_status execute_C_FSD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fsd"); +static FORCE_INLINE execute_status execute_C_FSD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fsd"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const uint32_t rs2 = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4623,8 +4826,8 @@ static FORCE_INLINE execute_status execute_C_FSD(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SW instruction. template -static FORCE_INLINE execute_status execute_C_SW(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.sw"); +static FORCE_INLINE execute_status execute_C_SW(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sw"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const uint32_t rs2 = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const int32_t imm = insn_get_C_LW_C_SW_imm(insn); @@ -4633,8 +4836,8 @@ static FORCE_INLINE execute_status execute_C_SW(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.SD instruction. template -static FORCE_INLINE execute_status execute_C_SD(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.sd"); +static FORCE_INLINE execute_status execute_C_SD(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sd"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const uint32_t rs2 = insn_get_CIW_CL_rd_CS_CA_rs2(insn); const int32_t imm = insn_get_CL_CS_imm(insn); @@ -4643,22 +4846,20 @@ static FORCE_INLINE execute_status execute_C_SD(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.NOP instruction. 
template -static FORCE_INLINE execute_status execute_C_NOP(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.nop"); - // C.NOP with imm != 0 is just a HINT that must execute as no-op (see RISC-V spec) +static FORCE_INLINE execute_status execute_C_NOP(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.nop"); // Really do nothing return advance_to_next_insn<2>(a, pc); } /// \brief Implementation of the C.ADDI instruction. template -static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd) { - dump_insn(a, pc, insn, "c.addi"); +static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addi"); + // rd cannot be zero (guaranteed by jump table) + const uint32_t rd = insn_get_rd(insn); const int32_t imm = insn_get_CI_CB_imm_se(insn); - // C.ADDI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } + // imm cannot be zero (guaranteed by jump table) const uint64_t rd_value = a.read_x(rd); int64_t val = 0; __builtin_add_overflow(static_cast(rd_value), static_cast(imm), &val); @@ -4666,23 +4867,12 @@ static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS &a, uint64_t &pc, return advance_to_next_insn<2>(a, pc); } -template -static FORCE_INLINE execute_status execute_C_Q1_SET0(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return execute_C_NOP(a, pc, insn); - } - return execute_C_ADDI(a, pc, insn, rd); -} - /// \brief Implementation of the C.addiw instruction. 
template -static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.addiw"); +static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addiw"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const uint64_t rd_value = a.read_x(rd); const int32_t imm = insn_get_CI_CB_imm_se(insn); int32_t val = 0; @@ -4693,13 +4883,10 @@ static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the C.LI instruction. template -static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.li"); +static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.li"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - // C.LI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } const int32_t imm = insn_get_CI_CB_imm_se(insn); a.write_x(rd, static_cast(imm)); return advance_to_next_insn<2>(a, pc); @@ -4707,12 +4894,10 @@ static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.ADDI16SP instruction. 
template -static FORCE_INLINE execute_status execute_C_ADDI16SP(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.addi16sp"); +static FORCE_INLINE execute_status execute_C_ADDI16SP(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addi16sp"); + // imm cannot be zero (guaranteed by the jump table) const int32_t imm = insn_get_C_ADDI16SP_imm(insn); - if (unlikely(imm == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const uint64_t rs1_value = a.read_x(2); int64_t val = 0; __builtin_add_overflow(static_cast(rs1_value), static_cast(imm), &val); @@ -4722,39 +4907,23 @@ static FORCE_INLINE execute_status execute_C_ADDI16SP(STATE_ACCESS &a, uint64_t /// \brief Implementation of the C.LUI instruction. template -static FORCE_INLINE execute_status execute_C_LUI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd) { - dump_insn(a, pc, insn, "c.lui"); +static FORCE_INLINE execute_status execute_C_LUI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.lui"); + // imm cannot be zero (guaranteed by the jump table) const int32_t imm = insn_get_C_LUI_imm(insn); - if (unlikely(imm == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } - // C.LUI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } + // rd cannot be zero (guaranteed by the jump table) + const uint32_t rd = insn_get_rd(insn); a.write_x(rd, static_cast(imm)); return advance_to_next_insn<2>(a, pc); } -template -static FORCE_INLINE execute_status execute_C_Q1_SET1(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - const uint32_t rd = insn_get_rd(insn); - if (rd == 2) { - return execute_C_ADDI16SP(a, pc, insn); - } - return execute_C_LUI(a, pc, insn, rd); -} - /// \brief Implementation of the C.SRLI instruction. 
template -static FORCE_INLINE execute_status execute_C_SRLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.srli"); +static FORCE_INLINE execute_status execute_C_SRLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.srli"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); + // imm cannot be zero (guaranteed by the jump table) const uint32_t imm = insn_get_CI_CB_imm(insn); - // C.SRLI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } const uint64_t rs1_value = a.read_x(rs1); a.write_x(rs1, rs1_value >> imm); return advance_to_next_insn<2>(a, pc); @@ -4762,14 +4931,11 @@ static FORCE_INLINE execute_status execute_C_SRLI(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SRAI instruction. template -static FORCE_INLINE execute_status execute_C_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.srai"); +static FORCE_INLINE execute_status execute_C_SRAI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.srai"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); + // imm cannot be zero (guaranteed by the jump table) const uint32_t imm = insn_get_CI_CB_imm(insn); - // C.SRAI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } const auto rs1_value = static_cast(a.read_x(rs1)); a.write_x(rs1, static_cast(rs1_value >> imm)); return advance_to_next_insn<2>(a, pc); @@ -4777,8 +4943,8 @@ static FORCE_INLINE execute_status execute_C_SRAI(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.ANDI instruction. 
template -static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.andi"); +static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.andi"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); const int32_t imm = insn_get_CI_CB_imm_se(insn); const uint64_t rs1_value = a.read_x(rs1); @@ -4787,7 +4953,7 @@ static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS &a, uint64_t &pc, } template -static FORCE_INLINE execute_status execute_C_arithmetic(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, const F &f) { +static FORCE_INLINE execute_status execute_C_arithmetic(STATE_ACCESS a, uint64_t &pc, uint32_t insn, const F &f) { // Ensure rs1 and rs2 are loaded in order: do not nest with call to f() as // the order of evaluation of arguments in a function call is undefined. const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); @@ -4800,8 +4966,8 @@ static FORCE_INLINE execute_status execute_C_arithmetic(STATE_ACCESS &a, uint64_ /// \brief Implementation of the C.SUB instruction. template -static FORCE_INLINE execute_status execute_C_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.sub"); +static FORCE_INLINE execute_status execute_C_SUB(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sub"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { uint64_t val = 0; __builtin_sub_overflow(rs1_value, rs2_value, &val); @@ -4811,32 +4977,32 @@ static FORCE_INLINE execute_status execute_C_SUB(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.XOR instruction. 
template -static FORCE_INLINE execute_status execute_C_XOR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.xor"); +static FORCE_INLINE execute_status execute_C_XOR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.xor"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { return rs1_value ^ rs2_value; }); } /// \brief Implementation of the C.OR instruction. template -static FORCE_INLINE execute_status execute_C_OR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.or"); +static FORCE_INLINE execute_status execute_C_OR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.or"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { return rs1_value | rs2_value; }); } /// \brief Implementation of the C.AND instruction. template -static FORCE_INLINE execute_status execute_C_AND(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.and"); +static FORCE_INLINE execute_status execute_C_AND(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.and"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { return rs1_value & rs2_value; }); } /// \brief Implementation of the C.SUBW instruction. 
template -static FORCE_INLINE execute_status execute_C_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.subw"); +static FORCE_INLINE execute_status execute_C_SUBW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.subw"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { // Convert 64-bit to 32-bit auto rs1w = static_cast(rs1_value); @@ -4849,8 +5015,8 @@ static FORCE_INLINE execute_status execute_C_SUBW(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.ADDW instruction. template -static FORCE_INLINE execute_status execute_C_ADDW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.addw"); +static FORCE_INLINE execute_status execute_C_ADDW(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.addw"); return execute_C_arithmetic(a, pc, insn, [](uint64_t rs1_value, uint64_t rs2_value) -> uint64_t { // Discard upper 32 bits auto rs1w = static_cast(rs1_value); @@ -4861,54 +5027,18 @@ static FORCE_INLINE execute_status execute_C_ADDW(STATE_ACCESS &a, uint64_t &pc, }); } -template -static FORCE_INLINE execute_status execute_CB_funct2(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - auto cb_funct2 = static_cast(insn_get_CB_funct2(insn)); - switch (cb_funct2) { - case insn_CB_funct2::C_SRLI: - return execute_C_SRLI(a, pc, insn); - case insn_CB_funct2::C_SRAI: - return execute_C_SRAI(a, pc, insn); - case insn_CB_funct2::C_ANDI: - return execute_C_ANDI(a, pc, insn); - } - return raise_illegal_insn_exception(a, pc, insn); -} - -template -static FORCE_INLINE execute_status execute_C_Q1_SET2(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - auto ca_funct6_funct2 = static_cast(insn_get_CA_funct6_funct2(insn)); - switch (ca_funct6_funct2) { - case insn_CA_funct6_funct2::C_SUB: - return execute_C_SUB(a, pc, insn); - case insn_CA_funct6_funct2::C_XOR: - return execute_C_XOR(a, pc, insn); - case 
insn_CA_funct6_funct2::C_OR: - return execute_C_OR(a, pc, insn); - case insn_CA_funct6_funct2::C_AND: - return execute_C_AND(a, pc, insn); - case insn_CA_funct6_funct2::C_SUBW: - return execute_C_SUBW(a, pc, insn); - case insn_CA_funct6_funct2::C_ADDW: - return execute_C_ADDW(a, pc, insn); - default: - return execute_CB_funct2(a, pc, insn); - } - return raise_illegal_insn_exception(a, pc, insn); -} - /// \brief Implementation of the C_J instruction. template -static FORCE_INLINE execute_status execute_C_J(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.j"); +static FORCE_INLINE execute_status execute_C_J(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.j"); const uint64_t new_pc = pc + static_cast(insn_get_C_J_imm(insn)); return execute_jump(a, pc, new_pc); } /// \brief Implementation of the C.BEQZ instruction. template -static FORCE_INLINE execute_status execute_C_BEQZ(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.beqz"); +static FORCE_INLINE execute_status execute_C_BEQZ(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.beqz"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); if (a.read_x(rs1) == 0) { const int32_t imm = insn_get_C_BEQZ_BNEZ_imm(insn); @@ -4920,8 +5050,8 @@ static FORCE_INLINE execute_status execute_C_BEQZ(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.BNEZ instruction. 
template -static FORCE_INLINE execute_status execute_C_BNEZ(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.bnez"); +static FORCE_INLINE execute_status execute_C_BNEZ(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.bnez"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); if (a.read_x(rs1) != 0) { const int32_t imm = insn_get_C_BEQZ_BNEZ_imm(insn); @@ -4933,18 +5063,12 @@ static FORCE_INLINE execute_status execute_C_BNEZ(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SLLI instruction. template -static FORCE_INLINE execute_status execute_C_SLLI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.slli"); +static FORCE_INLINE execute_status execute_C_SLLI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.slli"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - // C.SLLI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } + // imm cannot be zero (guaranteed by jump table) const uint32_t imm = insn_get_CI_CB_imm(insn); - // C.SLLI with imm == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(imm == 0)) { - return advance_to_next_insn<2>(a, pc); - } const uint64_t rs1_value = a.read_x(rd); a.write_x(rd, rs1_value << imm); return advance_to_next_insn<2>(a, pc); @@ -4952,8 +5076,13 @@ static FORCE_INLINE execute_status execute_C_SLLI(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.FLDSP instruction. 
template -static FORCE_INLINE execute_status execute_C_FLDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fldsp"); +static FORCE_INLINE execute_status execute_C_FLDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fldsp"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. + if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rd = insn_get_rd(insn); const int32_t imm = insn_get_C_FLDSP_LDSP_imm(insn); return execute_C_FL(a, pc, mcycle, rd, 0x2, imm); @@ -4961,48 +5090,41 @@ static FORCE_INLINE execute_status execute_C_FLDSP(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the C.LWSP instruction. template -static FORCE_INLINE execute_status execute_C_LWSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.lwsp"); +static FORCE_INLINE execute_status execute_C_LWSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.lwsp"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const int32_t imm = insn_get_C_LWSP_imm(insn); return execute_C_L(a, pc, mcycle, rd, 0x2, imm); } /// \brief Implementation of the C.LDSP instruction. 
template -static FORCE_INLINE execute_status execute_C_LDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.ldsp"); +static FORCE_INLINE execute_status execute_C_LDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.ldsp"); + // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - if (unlikely(rd == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } const int32_t imm = insn_get_C_FLDSP_LDSP_imm(insn); return execute_C_L(a, pc, mcycle, rd, 0x2, imm); } /// \brief Implementation of the C.JR instruction. template -static FORCE_INLINE execute_status execute_C_JR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rs1) { - dump_insn(a, pc, insn, "c.jr"); - if (unlikely(rs1 == 0)) { - return raise_illegal_insn_exception(a, pc, insn); - } +static FORCE_INLINE execute_status execute_C_JR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.jr"); + // rs1 cannot be zero (guaranteed by the jump table) + const uint32_t rs1 = insn_get_rd(insn); const uint64_t new_pc = a.read_x(rs1) & ~static_cast(1); return execute_jump(a, pc, new_pc); } /// \brief Implementation of the C.MV instruction. 
template -static FORCE_INLINE execute_status execute_C_MV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd, - uint32_t rs2) { - dump_insn(a, pc, insn, "c.mv"); - // C.SLLI with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } +static FORCE_INLINE execute_status execute_C_MV(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.mv"); + // rd cannot be zero (guaranteed by the jump table) + const uint32_t rd = insn_get_rd(insn); + const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const uint64_t val = a.read_x(rs2); a.write_x(rd, val); return advance_to_next_insn<2>(a, pc); @@ -5010,16 +5132,17 @@ static FORCE_INLINE execute_status execute_C_MV(STATE_ACCESS &a, uint64_t &pc, u /// \brief Implementation of the C.EBREAK instruction. template -static FORCE_INLINE execute_status execute_C_EBREAK(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - dump_insn(a, pc, insn, "c.ebreak"); +static FORCE_INLINE execute_status execute_C_EBREAK(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.ebreak"); pc = raise_exception(a, pc, MCAUSE_BREAKPOINT, pc); return advance_to_raised_exception(a, pc); } /// \brief Implementation of the C.JALR instruction. template -static FORCE_INLINE execute_status execute_C_JALR(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rs1) { - dump_insn(a, pc, insn, "c.jalr"); +static FORCE_INLINE execute_status execute_C_JALR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.jalr"); + const uint32_t rs1 = insn_get_rd(insn); const uint64_t new_pc = a.read_x(rs1) & ~static_cast(1); const uint64_t val = pc + 2; a.write_x(0x1, val); @@ -5028,13 +5151,11 @@ static FORCE_INLINE execute_status execute_C_JALR(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.ADD instruction. 
template -static FORCE_INLINE execute_status execute_C_ADD(STATE_ACCESS &a, uint64_t &pc, uint32_t insn, uint32_t rd, - uint32_t rs2) { - dump_insn(a, pc, insn, "c.add"); - // C.ADD with rd == 0 is just a HINT that must execute as no-op (see RISC-V spec) - if (unlikely(rd == 0)) { - return advance_to_next_insn<2>(a, pc); - } +static FORCE_INLINE execute_status execute_C_ADD(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.add"); + // rd cannot be zero (guaranteed by the jump table) + const uint32_t rd = insn_get_rd(insn); + const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const uint64_t rd_value = a.read_x(rd); const uint64_t rs2_value = a.read_x(rs2); uint64_t val = 0; @@ -5043,29 +5164,15 @@ static FORCE_INLINE execute_status execute_C_ADD(STATE_ACCESS &a, uint64_t &pc, return advance_to_next_insn<2>(a, pc); } -template -static FORCE_INLINE execute_status execute_C_Q2_SET0(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - const uint32_t rs1 = insn_get_rd(insn); - const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); - if (insn & 0b0001000000000000) { - if (rs2 == 0) { - if (rs1 == 0) { - return execute_C_EBREAK(a, pc, insn); - } - return execute_C_JALR(a, pc, insn, rs1); - } - return execute_C_ADD(a, pc, insn, rs1, rs2); - } - if (rs2 == 0) { - return execute_C_JR(a, pc, insn, rs1); - } - return execute_C_MV(a, pc, insn, rs1, rs2); -} - /// \brief Implementation of the C.FSDSP instruction. template -static FORCE_INLINE execute_status execute_C_FSDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.fsdsp"); +static FORCE_INLINE execute_status execute_C_FSDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.fsdsp"); + // If FS is OFF, attempts to read or write the float state will cause an illegal instruction + // exception. 
+ if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { + return raise_illegal_insn_exception(a, pc, insn); + } const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const int32_t imm = insn_get_C_FSDSP_SDSP_imm(insn); return execute_C_FS(a, pc, mcycle, rs2, 0x2, imm); @@ -5073,8 +5180,8 @@ static FORCE_INLINE execute_status execute_C_FSDSP(STATE_ACCESS &a, uint64_t &pc /// \brief Implementation of the C.SWSP instruction. template -static FORCE_INLINE execute_status execute_C_SWSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.swsp"); +static FORCE_INLINE execute_status execute_C_SWSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.swsp"); const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const int32_t imm = insn_get_C_SWSP_imm(insn); return execute_C_S(a, pc, mcycle, rs2, 0x2, imm); @@ -5082,297 +5189,13 @@ static FORCE_INLINE execute_status execute_C_SWSP(STATE_ACCESS &a, uint64_t &pc, /// \brief Implementation of the C.SDSP instruction. template -static FORCE_INLINE execute_status execute_C_SDSP(STATE_ACCESS &a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { - dump_insn(a, pc, insn, "c.sdsp"); +static FORCE_INLINE execute_status execute_C_SDSP(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { + dump_insn(a, pc, static_cast(insn), "c.sdsp"); const uint32_t rs2 = insn_get_CR_CSS_rs2(insn); const int32_t imm = insn_get_C_FSDSP_SDSP_imm(insn); return execute_C_S(a, pc, mcycle, rs2, 0x2, imm); } -/// \brief Decodes and executes an instruction. -/// \tparam STATE_ACCESS Class of machine state accessor object. -/// \param a Machine state accessor object. -/// \param pc Current pc. -/// \param insn Instruction. -/// \return execute_status::failure if an exception was raised, or -/// execute_status::success otherwise. -/// \details The execute_insn function decodes the instruction in multiple levels. 
When we know for sure that -/// the instruction could only be a <FOO>, a function with the name execute_<FOO> will be called. -/// See [RV32/64G Instruction Set -/// Listings](https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf#chapter.19) and [Instruction -/// listings for RISC-V](https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf#table.19.2). -template -static FORCE_INLINE execute_status execute_insn(STATE_ACCESS &a, uint64_t &pc, uint64_t &mcycle, uint32_t insn) { - // Is compressed instruction - if ((insn & 3) != 3) { - // The fetch may read 4 bytes as an optimization, - // but the compressed instruction uses only the 2 less significant bytes - insn = static_cast(insn); - auto c_funct3 = static_cast(insn_get_c_funct3(insn)); - switch (c_funct3) { - case insn_c_funct3::C_ADDI4SPN: - return execute_C_ADDI4SPN(a, pc, insn); - case insn_c_funct3::C_LW: - return execute_C_LW(a, pc, mcycle, insn); - case insn_c_funct3::C_LD: - return execute_C_LD(a, pc, mcycle, insn); - case insn_c_funct3::C_SW: - return execute_C_SW(a, pc, mcycle, insn); - case insn_c_funct3::C_SD: - return execute_C_SD(a, pc, mcycle, insn); - case insn_c_funct3::C_Q1_SET0: - return execute_C_Q1_SET0(a, pc, insn); - case insn_c_funct3::C_ADDIW: - return execute_C_ADDIW(a, pc, insn); - case insn_c_funct3::C_LI: - return execute_C_LI(a, pc, insn); - case insn_c_funct3::C_Q1_SET1: - return execute_C_Q1_SET1(a, pc, insn); - case insn_c_funct3::C_Q1_SET2: - return execute_C_Q1_SET2(a, pc, insn); - case insn_c_funct3::C_J: - return execute_C_J(a, pc, insn); - case insn_c_funct3::C_BEQZ: - return execute_C_BEQZ(a, pc, insn); - case insn_c_funct3::C_BNEZ: - return execute_C_BNEZ(a, pc, insn); - case insn_c_funct3::C_SLLI: - return execute_C_SLLI(a, pc, insn); - case insn_c_funct3::C_LWSP: - return execute_C_LWSP(a, pc, mcycle, insn); - case insn_c_funct3::C_LDSP: - return execute_C_LDSP(a, pc, mcycle, insn); - case insn_c_funct3::C_Q2_SET0: - return 
execute_C_Q2_SET0(a, pc, insn); - case insn_c_funct3::C_SWSP: - return execute_C_SWSP(a, pc, mcycle, insn); - case insn_c_funct3::C_SDSP: - return execute_C_SDSP(a, pc, mcycle, insn); - default: { - // Here we are sure that the next instruction, at best, can only be a floating point instruction, - // or, at worst, an illegal instruction. - // Since all float instructions try to read the float state, - // we can put the next check before all of them. - // If FS is OFF, attempts to read or write the float state will cause an illegal instruction - // exception. - if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { - return raise_illegal_insn_exception(a, pc, insn); - } - switch (c_funct3) { - case insn_c_funct3::C_FLD: - return execute_C_FLD(a, pc, mcycle, insn); - case insn_c_funct3::C_FSD: - return execute_C_FSD(a, pc, mcycle, insn); - case insn_c_funct3::C_FLDSP: - return execute_C_FLDSP(a, pc, mcycle, insn); - case insn_c_funct3::C_FSDSP: - return execute_C_FSDSP(a, pc, mcycle, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); - } - } - } - } else { - //??D We should probably try doing the first branch on the combined opcode, funct3, and funct7. - // Maybe it reduces the number of levels needed to decode most instructions. 
- auto funct3_00000_opcode = static_cast(insn_get_funct3_00000_opcode(insn)); - switch (funct3_00000_opcode) { - case insn_funct3_00000_opcode::LB: - return execute_LB(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LH: - return execute_LH(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LW: - return execute_LW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LD: - return execute_LD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LBU: - return execute_LBU(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LHU: - return execute_LHU(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::LWU: - return execute_LWU(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SB: - return execute_SB(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SH: - return execute_SH(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SW: - return execute_SW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::SD: - return execute_SD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FENCE: - return execute_FENCE(a, pc, insn); - case insn_funct3_00000_opcode::FENCE_I: - return execute_FENCE_I(a, pc, insn); - case insn_funct3_00000_opcode::ADDI: - return execute_ADDI(a, pc, insn); - case insn_funct3_00000_opcode::SLLI: - return execute_SLLI(a, pc, insn); - case insn_funct3_00000_opcode::SLTI: - return execute_SLTI(a, pc, insn); - case insn_funct3_00000_opcode::SLTIU: - return execute_SLTIU(a, pc, insn); - case insn_funct3_00000_opcode::XORI: - return execute_XORI(a, pc, insn); - case insn_funct3_00000_opcode::ORI: - return execute_ORI(a, pc, insn); - case insn_funct3_00000_opcode::ANDI: - return execute_ANDI(a, pc, insn); - case insn_funct3_00000_opcode::ADDIW: - return execute_ADDIW(a, pc, insn); - case insn_funct3_00000_opcode::SLLIW: - return execute_SLLIW(a, pc, insn); - case insn_funct3_00000_opcode::SLLW: - return execute_SLLW(a, pc, insn); - case insn_funct3_00000_opcode::DIVW: - return execute_DIVW(a, pc, insn); - case 
insn_funct3_00000_opcode::REMW: - return execute_REMW(a, pc, insn); - case insn_funct3_00000_opcode::REMUW: - return execute_REMUW(a, pc, insn); - case insn_funct3_00000_opcode::BEQ: - return execute_BEQ(a, pc, insn); - case insn_funct3_00000_opcode::BNE: - return execute_BNE(a, pc, insn); - case insn_funct3_00000_opcode::BLT: - return execute_BLT(a, pc, insn); - case insn_funct3_00000_opcode::BGE: - return execute_BGE(a, pc, insn); - case insn_funct3_00000_opcode::BLTU: - return execute_BLTU(a, pc, insn); - case insn_funct3_00000_opcode::BGEU: - return execute_BGEU(a, pc, insn); - case insn_funct3_00000_opcode::JALR: - return execute_JALR(a, pc, insn); - case insn_funct3_00000_opcode::CSRRW: - return execute_CSRRW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRS: - return execute_CSRRS(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRC: - return execute_CSRRC(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRWI: - return execute_CSRRWI(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRSI: - return execute_CSRRSI(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::CSRRCI: - return execute_CSRRCI(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::AUIPC_000: - case insn_funct3_00000_opcode::AUIPC_001: - case insn_funct3_00000_opcode::AUIPC_010: - case insn_funct3_00000_opcode::AUIPC_011: - case insn_funct3_00000_opcode::AUIPC_100: - case insn_funct3_00000_opcode::AUIPC_101: - case insn_funct3_00000_opcode::AUIPC_110: - case insn_funct3_00000_opcode::AUIPC_111: - return execute_AUIPC(a, pc, insn); - case insn_funct3_00000_opcode::LUI_000: - case insn_funct3_00000_opcode::LUI_001: - case insn_funct3_00000_opcode::LUI_010: - case insn_funct3_00000_opcode::LUI_011: - case insn_funct3_00000_opcode::LUI_100: - case insn_funct3_00000_opcode::LUI_101: - case insn_funct3_00000_opcode::LUI_110: - case insn_funct3_00000_opcode::LUI_111: - return execute_LUI(a, pc, insn); - case insn_funct3_00000_opcode::JAL_000: - case 
insn_funct3_00000_opcode::JAL_001: - case insn_funct3_00000_opcode::JAL_010: - case insn_funct3_00000_opcode::JAL_011: - case insn_funct3_00000_opcode::JAL_100: - case insn_funct3_00000_opcode::JAL_101: - case insn_funct3_00000_opcode::JAL_110: - case insn_funct3_00000_opcode::JAL_111: - return execute_JAL(a, pc, insn); - case insn_funct3_00000_opcode::SRLI_SRAI: - return execute_SRLI_SRAI(a, pc, insn); - case insn_funct3_00000_opcode::SRLIW_SRAIW: - return execute_SRLIW_SRAIW(a, pc, insn); - case insn_funct3_00000_opcode::AMO_W: - return execute_AMO_W(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::AMO_D: - return execute_AMO_D(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::ADD_MUL_SUB: - return execute_ADD_MUL_SUB(a, pc, insn); - case insn_funct3_00000_opcode::SLL_MULH: - return execute_SLL_MULH(a, pc, insn); - case insn_funct3_00000_opcode::SLT_MULHSU: - return execute_SLT_MULHSU(a, pc, insn); - case insn_funct3_00000_opcode::SLTU_MULHU: - return execute_SLTU_MULHU(a, pc, insn); - case insn_funct3_00000_opcode::XOR_DIV: - return execute_XOR_DIV(a, pc, insn); - case insn_funct3_00000_opcode::SRL_DIVU_SRA: - return execute_SRL_DIVU_SRA(a, pc, insn); - case insn_funct3_00000_opcode::OR_REM: - return execute_OR_REM(a, pc, insn); - case insn_funct3_00000_opcode::AND_REMU: - return execute_AND_REMU(a, pc, insn); - case insn_funct3_00000_opcode::ADDW_MULW_SUBW: - return execute_ADDW_MULW_SUBW(a, pc, insn); - case insn_funct3_00000_opcode::SRLW_DIVUW_SRAW: - return execute_SRLW_DIVUW_SRAW(a, pc, insn); - case insn_funct3_00000_opcode::PRIVILEGED: - return execute_privileged(a, pc, mcycle, insn); - default: { - // Here we are sure that the next instruction, at best, can only be a floating point instruction, - // or, at worst, an illegal instruction. - // Since all float instructions try to read the float state, - // we can put the next check before all of them. 
- // If FS is OFF, attempts to read or write the float state will cause an illegal instruction exception. - if (unlikely((a.read_mstatus() & MSTATUS_FS_MASK) == MSTATUS_FS_OFF)) { - return raise_illegal_insn_exception(a, pc, insn); - } - switch (funct3_00000_opcode) { - case insn_funct3_00000_opcode::FSW: - return execute_FSW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FSD: - return execute_FSD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FLW: - return execute_FLW(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FLD: - return execute_FLD(a, pc, mcycle, insn); - case insn_funct3_00000_opcode::FMADD_RNE: - case insn_funct3_00000_opcode::FMADD_RTZ: - case insn_funct3_00000_opcode::FMADD_RDN: - case insn_funct3_00000_opcode::FMADD_RUP: - case insn_funct3_00000_opcode::FMADD_RMM: - case insn_funct3_00000_opcode::FMADD_DYN: - return execute_FMADD(a, pc, insn); - case insn_funct3_00000_opcode::FMSUB_RNE: - case insn_funct3_00000_opcode::FMSUB_RTZ: - case insn_funct3_00000_opcode::FMSUB_RDN: - case insn_funct3_00000_opcode::FMSUB_RUP: - case insn_funct3_00000_opcode::FMSUB_RMM: - case insn_funct3_00000_opcode::FMSUB_DYN: - return execute_FMSUB(a, pc, insn); - case insn_funct3_00000_opcode::FNMSUB_RNE: - case insn_funct3_00000_opcode::FNMSUB_RTZ: - case insn_funct3_00000_opcode::FNMSUB_RDN: - case insn_funct3_00000_opcode::FNMSUB_RUP: - case insn_funct3_00000_opcode::FNMSUB_RMM: - case insn_funct3_00000_opcode::FNMSUB_DYN: - return execute_FNMSUB(a, pc, insn); - case insn_funct3_00000_opcode::FNMADD_RNE: - case insn_funct3_00000_opcode::FNMADD_RTZ: - case insn_funct3_00000_opcode::FNMADD_RDN: - case insn_funct3_00000_opcode::FNMADD_RUP: - case insn_funct3_00000_opcode::FNMADD_RMM: - case insn_funct3_00000_opcode::FNMADD_DYN: - return execute_FNMADD(a, pc, insn); - case insn_funct3_00000_opcode::FD_000: - case insn_funct3_00000_opcode::FD_001: - case insn_funct3_00000_opcode::FD_010: - case insn_funct3_00000_opcode::FD_011: - case 
insn_funct3_00000_opcode::FD_100: - case insn_funct3_00000_opcode::FD_111: - return execute_FD(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); - } - } - } - } -} - /// \brief Instruction fetch status code enum class fetch_status : int { exception, ///< Instruction fetch failed: exception raised @@ -5388,7 +5211,7 @@ enum class fetch_status : int { /// \return Returns fetch_status::success if load succeeded, fetch_status::exception if it caused an exception. // In that case, raise the exception. template -static FORCE_INLINE fetch_status fetch_translate_pc_slow(STATE_ACCESS &a, uint64_t &pc, uint64_t vaddr, +static FORCE_INLINE fetch_status fetch_translate_pc_slow(STATE_ACCESS a, uint64_t &pc, uint64_t vaddr, unsigned char **phptr) { uint64_t paddr{}; // Walk page table and obtain the physical address @@ -5419,7 +5242,7 @@ static FORCE_INLINE fetch_status fetch_translate_pc_slow(STATE_ACCESS &a, uint64 /// \return Returns fetch_status::success if load succeeded, fetch_status::exception if it caused an exception. // In that case, raise the exception. template -static FORCE_INLINE fetch_status fetch_translate_pc(STATE_ACCESS &a, uint64_t &pc, uint64_t vaddr, +static FORCE_INLINE fetch_status fetch_translate_pc(STATE_ACCESS a, uint64_t &pc, uint64_t vaddr, unsigned char **phptr) { // Try to perform the address translation via TLB first if (unlikely(!(a.template translate_vaddr_via_tlb(vaddr, phptr)))) { @@ -5441,43 +5264,55 @@ static FORCE_INLINE fetch_status fetch_translate_pc(STATE_ACCESS &a, uint64_t &p /// \return Returns fetch_status::success if load succeeded, fetch_status::exception if it caused an exception. // In that case, raise the exception. 
template -static FORCE_INLINE fetch_status fetch_insn(STATE_ACCESS &a, uint64_t &pc, uint32_t &insn, uint64_t &fetch_vaddr_page, +static FORCE_INLINE fetch_status fetch_insn(STATE_ACCESS a, uint64_t &pc, uint32_t &insn, uint64_t &fetch_vaddr_page, uint64_t &fetch_vh_offset) { + // Efficiently checks if current pc is in the same page as last pc fetch + // and it's not crossing a page boundary. + if (likely((pc ^ fetch_vaddr_page) < (PMA_PAGE_SIZE - 2))) { + // Fetch pc is in the same page as the last pc fetch and it's not crossing a page boundary, + // we can just reuse last fetch translation, skipping TLB or slow address translation altogether. + const unsigned char *hptr = cast_addr_to_ptr(pc + fetch_vh_offset); + + // Here we are sure that reading 4 bytes won't cross a page boundary. + // However pc may not be 4 byte aligned, at best it can only be 2-byte aligned, + // therefore we must perform a misaligned 4 byte read on a 2 byte aligned pointer. + // In case pc holds a compressed instruction, insn will store 2 additional bytes, + // but this is fine because later the instruction decoder will discard them. + insn = aliased_unaligned_read(hptr); + return fetch_status::success; + } + // Fetch pc is either not the same as last cache or crossing a page boundary. + + // Perform address translation unsigned char *hptr = nullptr; - const uint64_t vaddr_page = pc & ~PAGE_OFFSET_MASK; - // If pc is in the same page as the last pc fetch, - // we can just reuse last fetch translation, skipping TLB or slow address translation altogether. 
- if (likely(vaddr_page == fetch_vaddr_page)) { - hptr = cast_addr_to_ptr(pc + fetch_vh_offset); - } else { - // Not in the same page as last the fetch, we need to perform address translation - if (unlikely(fetch_translate_pc(a, pc, pc, &hptr) == fetch_status::exception)) { - return fetch_status::exception; - } - // Update fetch address translation cache - fetch_vaddr_page = vaddr_page; - fetch_vh_offset = cast_ptr_to_addr(hptr) - pc; + if (unlikely(fetch_translate_pc(a, pc, pc, &hptr) == fetch_status::exception)) { + return fetch_status::exception; } + // Update fetch address translation cache + fetch_vaddr_page = pc & ~PAGE_OFFSET_MASK; + fetch_vh_offset = cast_ptr_to_addr(hptr) - pc; + // The following code assumes pc is always 2-byte aligned, this is guaranteed by RISC-V spec. // If pc is pointing to the very last 2 bytes of a page, it's crossing a page boundary. if (unlikely(((~pc & PAGE_OFFSET_MASK) >> 1) == 0)) { // Here we are crossing page boundary, this is unlikely (1 in 2048 possible cases) insn = aliased_aligned_read(hptr); // If not a compressed instruction, we must read 2 additional bytes from the next page. - if (unlikely((insn & 3) == 3)) { + if (unlikely(insn_is_uncompressed(insn))) { // We have to perform a new address translation to read the next 2 bytes since we changed pages. const uint64_t vaddr = pc + 2; if (unlikely(fetch_translate_pc(a, pc, vaddr, &hptr) == fetch_status::exception)) { return fetch_status::exception; } // Update fetch translation cache - fetch_vaddr_page = vaddr & ~PAGE_OFFSET_MASK; + fetch_vaddr_page = vaddr; fetch_vh_offset = cast_ptr_to_addr(hptr) - vaddr; // Produce the final 4-byte instruction insn |= aliased_aligned_read(hptr) << 16; } return fetch_status::success; } + // Here we are sure that reading 4 bytes won't cross a page boundary. // However pc may not be 4 byte aligned, at best it can only be 2-byte aligned, // therefore we must perform a misaligned 4 byte read on a 2 byte aligned pointer. 
@@ -5489,7 +5324,7 @@ static FORCE_INLINE fetch_status fetch_insn(STATE_ACCESS &a, uint64_t &pc, uint3 /// \brief Checks that false brk is consistent with rest of state template -static void assert_no_brk([[maybe_unused]] STATE_ACCESS &a) { +static void assert_no_brk([[maybe_unused]] STATE_ACCESS a) { assert(get_pending_irq_mask(a) == 0); // LCOV_EXCL_LINE assert(a.read_iflags_X() == 0); // LCOV_EXCL_LINE assert(a.read_iflags_Y() == 0); // LCOV_EXCL_LINE @@ -5498,7 +5333,7 @@ static void assert_no_brk([[maybe_unused]] STATE_ACCESS &a) { /// \brief Interpreter hot loop template -static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_end, uint64_t mcycle) { +static NO_INLINE execute_status interpret_loop(STATE_ACCESS a, uint64_t mcycle_end, uint64_t mcycle) { // The interpret loop is constantly reading and modifying the pc and mcycle variables, // because of this care is taken to make them stack variables that are propagated across inline functions, // helping the C++ compiler optimize them into registers instead of stack variables when compiling, @@ -5512,7 +5347,7 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ uint64_t pc = a.read_pc(); // Initialize fetch address translation cache invalidated - uint64_t fetch_vaddr_page = PAGE_OFFSET_MASK; + uint64_t fetch_vaddr_page = ~pc; uint64_t fetch_vh_offset = 0; // The outer loop continues until there is an interruption that should be handled @@ -5551,8 +5386,462 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ // Try to fetch the next instruction if (likely(fetch_insn(a, pc, insn, fetch_vaddr_page, fetch_vh_offset) == fetch_status::success)) { - // Try to execute it - const execute_status status = execute_insn(a, pc, mcycle, insn); + // clang-format off + // NOLINTBEGIN + execute_status status; // explicitly left uninitialized as an optimization + + // This header defines the instruction jump table, which is very large.
+ // It also defines the jump table related macros used in the next big switch. + #include "interpret-jump-table.h" + + // This will use computed goto on supported compilers, + // otherwise a normal switch on unsupported platforms. + INSN_SWITCH(insn_get_id(insn)) { + // The instructions in this switch are ordered so + // infrequent instructions are placed at the end. + + // IM extensions + INSN_CASE(LUI_rdN): + status = execute_LUI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AUIPC_rdN): + status = execute_AUIPC(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JAL_rd0): + status = execute_JAL(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JAL_rdN): + status = execute_JAL(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JALR_rd0): + status = execute_JALR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(JALR_rdN): + status = execute_JALR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BEQ): + status = execute_BEQ(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BNE): + status = execute_BNE(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BLT): + status = execute_BLT(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BGE): + status = execute_BGE(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BLTU): + status = execute_BLTU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(BGEU): + status = execute_BGEU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDI_rdN): + status = execute_ADDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTI_rdN): + status = execute_SLTI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTIU_rdN): + status = execute_SLTIU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XORI_rdN): + status = execute_XORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ORI_rdN): + status = execute_ORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ANDI_rdN): + status = execute_ANDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLI_rdN): + status = execute_SLLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLI_SRAI_rdN): + status = execute_SRLI_SRAI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADD_MUL_SUB_rdN): + status = execute_ADD_MUL_SUB(a, pc, insn); + INSN_BREAK(); +
INSN_CASE(SLL_MULH_rdN): + status = execute_SLL_MULH(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLT_MULHSU_rdN): + status = execute_SLT_MULHSU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTU_MULHU_rdN): + status = execute_SLTU_MULHU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XOR_DIV_rdN): + status = execute_XOR_DIV(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRL_DIVU_SRA_rdN): + status = execute_SRL_DIVU_SRA(a, pc, insn); + INSN_BREAK(); + INSN_CASE(OR_REM_rdN): + status = execute_OR_REM(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AND_REMU_rdN): + status = execute_AND_REMU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDIW_rdN): + status = execute_ADDIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLIW_rdN): + status = execute_SLLIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLIW_SRAIW_rdN): + status = execute_SRLIW_SRAIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDW_MULW_SUBW_rdN): + status = execute_ADDW_MULW_SUBW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLW_rdN): + status = execute_SLLW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLW_DIVUW_SRAW_rdN): + status = execute_SRLW_DIVUW_SRAW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(DIVW_rdN): + status = execute_DIVW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMW_rdN): + status = execute_REMW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMUW_rdN): + status = execute_REMUW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(LD_rdN): + status = execute_LD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LW_rdN): + status = execute_LW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LWU_rdN): + status = execute_LWU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LH_rdN): + status = execute_LH(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LHU_rdN): + status = execute_LHU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LB_rdN): + status = execute_LB(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LBU_rdN): + status = execute_LBU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(SD): + status = execute_SD(a, pc, mcycle, insn); + 
INSN_BREAK(); + INSN_CASE(SW): + status = execute_SW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(SH): + status = execute_SH(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(SB): + status = execute_SB(a, pc, mcycle, insn); + INSN_BREAK(); + // C extension + INSN_CASE(C_HINT): + INSN_CASE(C_NOP): + status = execute_C_NOP(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_LUI): + status = execute_C_LUI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_LI): + status = execute_C_LI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_J): + status = execute_C_J(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_JR): + status = execute_C_JR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_JALR): + status = execute_C_JALR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_MV): + status = execute_C_MV(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_BEQZ): + status = execute_C_BEQZ(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_BNEZ): + status = execute_C_BNEZ(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDI): + status = execute_C_ADDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDIW): + status = execute_C_ADDIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDI4SPN): + status = execute_C_ADDI4SPN(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDI16SP): + status = execute_C_ADDI16SP(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ANDI): + status = execute_C_ANDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SLLI): + status = execute_C_SLLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SRAI): + status = execute_C_SRAI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SRLI): + status = execute_C_SRLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADD): + status = execute_C_ADD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SUB): + status = execute_C_SUB(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_XOR): + status = execute_C_XOR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_OR): + status = execute_C_OR(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_AND): + status = execute_C_AND(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_ADDW): + status = 
execute_C_ADDW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_SUBW): + status = execute_C_SUBW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(C_LD): + status = execute_C_LD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_LW): + status = execute_C_LW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_LDSP): + status = execute_C_LDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_LWSP): + status = execute_C_LWSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SD): + status = execute_C_SD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SW): + status = execute_C_SW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SDSP): + status = execute_C_SDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_SWSP): + status = execute_C_SWSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FLD): + status = execute_C_FLD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FLDSP): + status = execute_C_FLDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FSD): + status = execute_C_FSD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_FSDSP): + status = execute_C_FSDSP(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(C_EBREAK): + status = execute_C_EBREAK(a, pc, insn); + INSN_BREAK(); + // FD extensions + INSN_CASE(FD): + status = execute_FD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FLD): + status = execute_FLD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FLW): + status = execute_FLW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FSD): + status = execute_FSD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FSW): + status = execute_FSW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(FMADD): + status = execute_FMADD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FMSUB): + status = execute_FMSUB(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FNMADD): + status = execute_FNMADD(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FNMSUB): + status = execute_FNMSUB(a, pc, insn); + INSN_BREAK(); + // A extension + INSN_CASE(AMO_D): + status = execute_AMO_D(a, pc, mcycle, insn); 
+ INSN_BREAK(); + INSN_CASE(AMO_W): + status = execute_AMO_W(a, pc, mcycle, insn); + INSN_BREAK(); + // Zicsr extension + INSN_CASE(CSRRW): + status = execute_CSRRW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRS): + status = execute_CSRRS(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRC): + status = execute_CSRRC(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRWI): + status = execute_CSRRWI(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRSI): + status = execute_CSRRSI(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(CSRRCI): + status = execute_CSRRCI(a, pc, mcycle, insn); + INSN_BREAK(); + // Special instructions that are less frequent + INSN_CASE(FENCE): + status = execute_FENCE(a, pc, insn); + INSN_BREAK(); + INSN_CASE(FENCE_I): + status = execute_FENCE_I(a, pc, insn); + INSN_BREAK(); + INSN_CASE(PRIVILEGED): + status = execute_privileged(a, pc, mcycle, insn); + INSN_BREAK(); + // Instructions with hints where rd=0 + INSN_CASE(LUI_rd0): + status = execute_LUI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AUIPC_rd0): + status = execute_AUIPC(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDI_rd0): + status = execute_ADDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTI_rd0): + status = execute_SLTI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLTIU_rd0): + status = execute_SLTIU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XORI_rd0): + status = execute_XORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ORI_rd0): + status = execute_ORI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ANDI_rd0): + status = execute_ANDI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLI_rd0): + status = execute_SLLI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLI_SRAI_rd0): + status = execute_SRLI_SRAI(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADD_MUL_SUB_rd0): + status = execute_ADD_MUL_SUB(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLL_MULH_rd0): + status = execute_SLL_MULH(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLT_MULHSU_rd0): + status = execute_SLT_MULHSU(a, pc, insn); + 
INSN_BREAK(); + INSN_CASE(SLTU_MULHU_rd0): + status = execute_SLTU_MULHU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(XOR_DIV_rd0): + status = execute_XOR_DIV(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRL_DIVU_SRA_rd0): + status = execute_SRL_DIVU_SRA(a, pc, insn); + INSN_BREAK(); + INSN_CASE(OR_REM_rd0): + status = execute_OR_REM(a, pc, insn); + INSN_BREAK(); + INSN_CASE(AND_REMU_rd0): + status = execute_AND_REMU(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDIW_rd0): + status = execute_ADDIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLIW_rd0): + status = execute_SLLIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLIW_SRAIW_rd0): + status = execute_SRLIW_SRAIW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(ADDW_MULW_SUBW_rd0): + status = execute_ADDW_MULW_SUBW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SLLW_rd0): + status = execute_SLLW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(SRLW_DIVUW_SRAW_rd0): + status = execute_SRLW_DIVUW_SRAW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(DIVW_rd0): + status = execute_DIVW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMW_rd0): + status = execute_REMW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(REMUW_rd0): + status = execute_REMUW(a, pc, insn); + INSN_BREAK(); + INSN_CASE(LD_rd0): + status = execute_LD(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LW_rd0): + status = execute_LW(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LWU_rd0): + status = execute_LWU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LH_rd0): + status = execute_LH(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LHU_rd0): + status = execute_LHU(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LB_rd0): + status = execute_LB(a, pc, mcycle, insn); + INSN_BREAK(); + INSN_CASE(LBU_rd0): + status = execute_LBU(a, pc, mcycle, insn); + INSN_BREAK(); + // Illegal instructions + INSN_CASE(ILLEGAL): + status = raise_illegal_insn_exception(a, pc, insn); + INSN_BREAK(); +#ifndef USE_COMPUTED_GOTO + // When using a naive switch statement, other cases are impossible. 
+ // The following will give a hint to the compiler that it can remove range checks + // (relevant for the WebAssembly target, which cannot use computed gotos). + default: + __builtin_unreachable(); + break; +#endif + } + INSN_SWITCH_OUT(); + + // NOLINTEND + // clang-format on // When execute status is above success, we have to deal with special loop conditions, // this is very unlikely to happen most of the time @@ -5562,7 +5851,7 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ // due to MRET/SRET instructions (execute_status::success_and_serve_interrupts) // As a simplification (and optimization), the next line will also invalidate in more cases, // but this it's fine. - fetch_vaddr_page = PAGE_OFFSET_MASK; + fetch_vaddr_page = ~pc; // All status above execute_status::success_and_serve_interrupts will require breaking the loop if (unlikely(status >= execute_status::success_and_serve_interrupts)) { // Increment the cycle counter mcycle @@ -5598,7 +5887,7 @@ static NO_INLINE execute_status interpret_loop(STATE_ACCESS &a, uint64_t mcycle_ } template -interpreter_break_reason interpret(STATE_ACCESS &a, uint64_t mcycle_end) { +interpreter_break_reason interpret(STATE_ACCESS a, uint64_t mcycle_end) { static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "code assumes little-endian byte ordering"); static_assert(is_an_i_state_access::value, "not an i_state_access"); @@ -5645,14 +5934,14 @@ interpreter_break_reason interpret(STATE_ACCESS &a, uint64_t mcycle_end) { #ifdef MICROARCHITECTURE // Explicit instantiation for uarch_machine_state_access -template interpreter_break_reason interpret(uarch_machine_state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(uarch_machine_state_access a, uint64_t mcycle_end); #else // Explicit instantiation for state_access -template interpreter_break_reason interpret(state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(state_access a, 
uint64_t mcycle_end); // Explicit instantiation for record_step_state_access -template interpreter_break_reason interpret(record_step_state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(record_step_state_access a, uint64_t mcycle_end); // Explicit instantiation for replay_step_state_access -template interpreter_break_reason interpret(replay_step_state_access &a, uint64_t mcycle_end); +template interpreter_break_reason interpret(replay_step_state_access a, uint64_t mcycle_end); #endif // MICROARCHITECTURE } // namespace cartesi diff --git a/src/interpret.h b/src/interpret.h index 0e02d6dac..8e5297f93 100644 --- a/src/interpret.h +++ b/src/interpret.h @@ -54,7 +54,7 @@ enum class interpreter_break_reason { /// \details The interpret may stop early if the machine halts permanently or becomes temporarily idle (waiting for /// interrupts). template -interpreter_break_reason interpret(STATE_ACCESS &a, uint64_t mcycle_end); +interpreter_break_reason interpret(STATE_ACCESS a, uint64_t mcycle_end); } // namespace cartesi diff --git a/src/is-pristine.h b/src/is-pristine.h new file mode 100644 index 000000000..458d297fc --- /dev/null +++ b/src/is-pristine.h @@ -0,0 +1,43 @@ +// Copyright Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: LGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) any +// later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +// PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License along +// with this program (see COPYING). If not, see <https://www.gnu.org/licenses/>.
+// + +#ifndef IS_PRISTINE_H +#define IS_PRISTINE_H + +#include "compiler-defines.h" +#include +#include + +namespace cartesi { + +/// \brief Optimized check for whether a memory range is pristine, i.e. returns true only if every byte in it is zero. +/// \param data Pointer to the first byte of the range to check +/// \param length Number of bytes to check, starting at data +/// \details It's intended to be used in situations where length is equal to or less than a page size. +// NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) +static inline bool FORCE_OPTIMIZE_O3 is_pristine(const unsigned char *data, size_t length) { + // This tight for loop ORs all bytes together without any early exit; it is optimized to SIMD instructions in x86_64, + // making it very fast to check if a given page is pristine (the accumulator stays zero only if all bytes are zero). + unsigned char bits = 0; + for (size_t i = 0; i < length; ++i) { + bits |= data[i]; + } + return bits == 0; +} + +} // namespace cartesi + +#endif \ No newline at end of file diff --git a/src/machine-state.h b/src/machine-state.h index 915b9c8b2..f587c348e 100644 --- a/src/machine-state.h +++ b/src/machine-state.h @@ -59,9 +59,10 @@ struct machine_state { // The following state fields are very hot, // and are carefully ordered to have better data locality in the interpreter loop. + // The X registers are the very first to optimize access of registers in the interpreter. + std::array x{}; ///< Register file uint64_t mcycle{}; ///< CSR mcycle. uint64_t pc{}; ///< Program counter. - std::array x{}; ///< Register file. uint64_t fcsr{}; ///< CSR fcsr. std::array f{}; ///< Floating-point register file.
diff --git a/src/machine.cpp b/src/machine.cpp index 715e85775..8bbf0a9d7 100644 --- a/src/machine.cpp +++ b/src/machine.cpp @@ -18,12 +18,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -41,6 +43,7 @@ #include "htif.h" #include "i-device-state-access.h" #include "interpret.h" +#include "is-pristine.h" #include "machine-config.h" #include "machine-memory-range-descr.h" #include "machine-runtime-config.h" @@ -836,7 +839,7 @@ machine::~machine() { #ifdef DUMP_HIST std::ignore = fprintf(stderr, "\nInstruction Histogram:\n"); for (auto v : m_s.insn_hist) { - std::ignore = fprintf(stderr, "%s: %" PRIu64 "\n", v.first.c_str(), v.second); + std::ignore = fprintf(stderr, "%12" PRIu64 " %s\n", v.second, v.first.c_str()); } #endif #if DUMP_COUNTERS @@ -2030,10 +2033,7 @@ bool machine::update_merkle_tree() const { return false; } if (page_data != nullptr) { - const bool is_pristine = std::all_of(page_data, page_data + PMA_PAGE_SIZE, - [](unsigned char pp) -> bool { return pp == '\0'; }); - - if (is_pristine) { + if (is_pristine(page_data, PMA_PAGE_SIZE)) { // The update_page_node_hash function in the machine_merkle_tree is not thread // safe, so we protect it with a mutex const parallel_for_mutex_guard lock(mutex); @@ -2260,7 +2260,7 @@ void machine::fill_memory(uint64_t address, uint8_t data, uint64_t length) { } void machine::read_virtual_memory(uint64_t vaddr_start, unsigned char *data, uint64_t length) { - state_access a(*this); + const state_access a(*this); if (length == 0) { return; } @@ -2291,7 +2291,7 @@ void machine::read_virtual_memory(uint64_t vaddr_start, unsigned char *data, uin } void machine::write_virtual_memory(uint64_t vaddr_start, const unsigned char *data, uint64_t length) { - state_access a(*this); + const state_access a(*this); if (length == 0) { return; } @@ -2324,7 +2324,7 @@ void machine::write_virtual_memory(uint64_t vaddr_start, const unsigned char *da } uint64_t 
machine::translate_virtual_address(uint64_t vaddr) { - state_access a(*this); + const state_access a(*this); // perform address translation using read access mode uint64_t paddr = 0; if (!cartesi::translate_virtual_address(a, &paddr, vaddr, PTE_XWR_R_SHIFT)) { @@ -2359,7 +2359,7 @@ uint64_t machine::read_word(uint64_t word_address) const { } void machine::send_cmio_response(uint16_t reason, const unsigned char *data, uint64_t length) { - state_access a(*this); + const state_access a(*this); cartesi::send_cmio_response(a, reason, data, length); } @@ -2367,8 +2367,9 @@ access_log machine::log_send_cmio_response(uint16_t reason, const unsigned char const access_log::type &log_type) { hash_type root_hash_before; get_root_hash(root_hash_before); + access_log log(log_type); // Call send_cmio_response with the recording state accessor - record_state_access a(*this, log_type); + record_state_access a(*this, log); a.push_bracket(bracket_type::begin, "send cmio response"); cartesi::send_cmio_response(a, reason, data, length); a.push_bracket(bracket_type::end, "send cmio response"); @@ -2376,8 +2377,8 @@ access_log machine::log_send_cmio_response(uint16_t reason, const unsigned char hash_type root_hash_after; update_merkle_tree(); get_root_hash(root_hash_after); - verify_send_cmio_response(reason, data, length, root_hash_before, *a.get_log(), root_hash_after); - return std::move(*a.get_log()); + verify_send_cmio_response(reason, data, length, root_hash_before, log, root_hash_after); + return log; } void machine::verify_send_cmio_response(uint16_t reason, const unsigned char *data, uint64_t length, @@ -2386,9 +2387,9 @@ void machine::verify_send_cmio_response(uint16_t reason, const unsigned char *da if (log.get_accesses().empty()) { throw std::invalid_argument{"too few accesses in log"}; } - + replay_state_access::context context(log, root_hash_before); // Verify all intermediate state transitions - replay_state_access a(log, root_hash_before); + replay_state_access 
a(context); cartesi::send_cmio_response(a, reason, data, length); a.finish(); @@ -2509,7 +2510,8 @@ interpreter_break_reason machine::log_step(uint64_t mcycle_count, const std::str } hash_type root_hash_before; get_root_hash(root_hash_before); - record_step_state_access a(*this, filename); + record_step_state_access::context context(filename); + record_step_state_access a(context, *this); uint64_t mcycle_end{}; if (__builtin_add_overflow(a.read_mcycle(), mcycle_count, &mcycle_end)) { mcycle_end = UINT64_MAX; @@ -2526,7 +2528,8 @@ interpreter_break_reason machine::verify_step(const hash_type &root_hash_before, uint64_t mcycle_count, const hash_type &root_hash_after) { auto data_length = os_get_file_length(filename.c_str(), "step log file"); auto *data = os_map_file(filename.c_str(), data_length, false /* not shared */); - replay_step_state_access a(data, data_length, root_hash_before); + replay_step_state_access::context context; + replay_step_state_access a(context, data, data_length, root_hash_before); uint64_t mcycle_end{}; if (__builtin_add_overflow(a.read_mcycle(), mcycle_count, &mcycle_end)) { mcycle_end = UINT64_MAX; @@ -2541,7 +2544,7 @@ interpreter_break_reason machine::run(uint64_t mcycle_end) { if (mcycle_end < read_reg(reg::mcycle)) { throw std::invalid_argument{"mcycle is past"}; } - state_access a(*this); + const state_access a(*this); return interpret(a, mcycle_end); } diff --git a/src/pma.cpp b/src/pma.cpp index cde8d8462..45d29a5e5 100644 --- a/src/pma.cpp +++ b/src/pma.cpp @@ -26,6 +26,7 @@ #include #include +#include "is-pristine.h" #include "os.h" #include "pma-constants.h" #include "pma-driver.h" @@ -162,8 +163,24 @@ void pma_entry::write_memory(uint64_t paddr, const unsigned char *data, uint64_t if (data == nullptr) { throw std::invalid_argument{"invalid data buffer"}; } - memcpy(get_memory().get_host_memory() + (paddr - get_start()), data, size); - mark_dirty_pages(paddr, size); + // The case of writing a large range chunk is special and 
optimized for uarch reset + if (size > PMA_PAGE_SIZE) { + // Copy in chunks of page size, to avoid marking dirty pages unnecessarily + for (uint64_t offset = 0; offset < size; offset += PMA_PAGE_SIZE) { + const uint64_t paddr_offset = paddr + offset; + const uint64_t chunk_len = std::min(PMA_PAGE_SIZE, size - offset); + const unsigned char *src = data + offset; + unsigned char *dest = get_memory().get_host_memory() + (paddr_offset - get_start()); + if (memcmp(dest, src, chunk_len) != 0) { + // Page is different, we have to copy memory + memcpy(dest, src, chunk_len); + mark_dirty_pages(paddr + offset, chunk_len); + } + } + } else { + memcpy(get_memory().get_host_memory() + (paddr - get_start()), data, size); + mark_dirty_pages(paddr, size); + } } void pma_entry::fill_memory(uint64_t paddr, unsigned char value, uint64_t size) { @@ -173,8 +190,23 @@ void pma_entry::fill_memory(uint64_t paddr, unsigned char value, uint64_t size) if (!contains(paddr, size)) { throw std::invalid_argument{"range not contained in pma"}; } - memset(get_memory().get_host_memory() + (paddr - get_start()), value, size); - mark_dirty_pages(paddr, size); + // The case of filling a large range with zeros is special and optimized for uarch reset + if (value == 0 && size > PMA_PAGE_SIZE) { + // Fill in chunks of page size, to avoid marking dirty pages unnecessarily + for (uint64_t offset = 0; offset < size; offset += PMA_PAGE_SIZE) { + const uint64_t paddr_offset = paddr + offset; + const uint64_t chunk_len = std::min(PMA_PAGE_SIZE, size - offset); + unsigned char *dest = get_memory().get_host_memory() + (paddr_offset - get_start()); + if (!is_pristine(dest, chunk_len)) { + // Page is different, we have to fill memory + memset(dest, 0, chunk_len); + mark_dirty_pages(paddr + offset, chunk_len); + } + } + } else { + memset(get_memory().get_host_memory() + (paddr - get_start()), value, size); + mark_dirty_pages(paddr, size); + } } bool pma_peek_error(const pma_entry & /*pma*/, const machine & /*m*/, 
uint64_t /*page_address*/, diff --git a/src/record-state-access.h b/src/record-state-access.h index fcc0800dd..2c5b24853 100644 --- a/src/record-state-access.h +++ b/src/record-state-access.h @@ -45,10 +45,9 @@ class record_state_access : public i_state_access m_log; ///< Pointer to access log static void get_hash(const access_data &data, hash_type &hash) { hasher_type hasher; @@ -57,35 +56,12 @@ class record_state_access : public i_state_access(log_type)) { + /// \param m Reference to machine state. + /// \param log Reference to access log. + explicit record_state_access(machine &m, access_log &log) : m_m(m), m_log(log) { ; } - /// \brief No copy constructor - record_state_access(const record_state_access &) = delete; - /// \brief No copy assignment - record_state_access &operator=(const record_state_access &) = delete; - /// \brief No move constructor - record_state_access(record_state_access &&) = delete; - /// \brief No move assignment - record_state_access &operator=(record_state_access &&) = delete; - /// \brief Default destructor - ~record_state_access() = default; - - /// \brief Returns const pointer to access log. - std::shared_ptr get_log() const { - return m_log; - } - - /// \brief Returns pointer to access log. - std::shared_ptr get_log() { - return m_log; - } - private: /// \brief Logs a read access of a uint64_t word from the machine state. /// \param paligned Physical address in the machine state, aligned to a 64-bit word. @@ -119,7 +95,7 @@ class record_state_access : public i_state_accesspush_access(std::move(a), text); + m_log.push_access(std::move(a), text); } /// \brief Logs a write access before it happens. @@ -162,7 +138,7 @@ class record_state_access : public i_state_accesspush_access(std::move(a), text); + m_log.push_access(std::move(a), text); } /// \brief Updates the Merkle tree after the modification of a word in the machine state. 
@@ -196,7 +172,7 @@ class record_state_access : public i_state_access; void do_push_bracket(bracket_type &type, const char *text) { - m_log->push_bracket(type, text); + m_log.push_bracket(type, text); } void do_reset_iflags_Y() { @@ -248,7 +224,7 @@ class record_state_access : public i_state_accessget_log_type().has_large_data()) { + if (m_log.get_log_type().has_large_data()) { access_data &data = a.get_read().emplace(write_length); memcpy(data.data(), pma.get_memory().get_host_memory(), write_length); } @@ -270,12 +246,12 @@ class record_state_access : public i_state_accessget_log_type().has_large_data()) { + if (m_log.get_log_type().has_large_data()) { access_data &data = a.get_written().emplace(write_length); memcpy(data.data(), pma.get_memory().get_host_memory(), write_length); } // NOLINTEND(bugprone-unchecked-optional-access) - m_log->push_access(a, "cmio rx buffer"); + m_log.push_access(a, "cmio rx buffer"); } }; diff --git a/src/record-step-state-access.h b/src/record-step-state-access.h index 323b0f1ce..0ef701520 100644 --- a/src/record-step-state-access.h +++ b/src/record-step-state-access.h @@ -33,6 +33,7 @@ namespace cartesi { /// \class record_step_state_access /// \brief Records machine state access into a step log file class record_step_state_access : public i_state_access { +public: constexpr static int LOG2_ROOT_SIZE = machine_merkle_tree::get_log2_root_size(); constexpr static int LOG2_PAGE_SIZE = machine_merkle_tree::get_log2_page_size(); constexpr static uint64_t PAGE_SIZE = UINT64_C(1) << LOG2_PAGE_SIZE; @@ -44,32 +45,38 @@ class record_step_state_access : public i_state_access; using page_indices_type = std::vector; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - machine &m_m; ///< reference to machine - std::string m_filename; ///< where to save the log - mutable pages_type m_touched_pages; ///< copy of all pages touched during execution + struct context { + /// \brief Constructor of record step state access context + 
/// \param filename where to save the log + explicit context(std::string filename) : filename(std::move(filename)) { + ; + } + std::string filename; ///< where to save the log + mutable pages_type touched_pages; ///< copy of all pages touched during execution + }; + +private: + // NOLINTBEGIN(cppcoreguidelines-avoid-const-or-ref-data-members) + context &m_context; ///< context for the recording + machine &m_m; ///< reference to machine + // NOLINTEND(cppcoreguidelines-avoid-const-or-ref-data-members) public: - /// \brief Constructor + /// \brief Constructor of record step state access + /// \param context Context for the recording with the log filename /// \param m reference to machine - /// \param filename where to save the log /// \details The log file is saved when finish() is called - record_step_state_access(machine &m, const std::string &filename) : m_m(m), m_filename(filename) { - if (os_file_exists(filename.c_str())) { + record_step_state_access(context &context, machine &m) : m_context(context), m_m(m) { + if (os_file_exists(m_context.filename.c_str())) { throw std::runtime_error("file already exists"); } } - record_step_state_access(const record_step_state_access &) = delete; - record_step_state_access(record_step_state_access &&) = delete; - record_step_state_access &operator=(const record_step_state_access &) = delete; - record_step_state_access &operator=(record_step_state_access &&) = delete; - ~record_step_state_access() = default; /// \brief Finish recording and save the log file void finish() { // get sibling hashes of all touched pages auto sibling_hashes = get_sibling_hashes(); - uint64_t page_count = m_touched_pages.size(); + uint64_t page_count = m_context.touched_pages.size(); uint64_t sibling_count = sibling_hashes.size(); // Write log file. 
@@ -78,11 +85,11 @@ class record_step_state_access : public i_state_access> LOG2_PAGE_SIZE; if (fwrite(&page_index, sizeof(page_index), 1, fp.get()) != 1) { throw std::runtime_error("Could not write page index to log file"); @@ -112,10 +119,10 @@ class record_step_state_access : public i_state_accesssecond.data(), it->second.size()); } @@ -125,7 +132,7 @@ class record_step_state_access : public i_state_access> LOG2_PAGE_SIZE); } auto next_page_index = page_indices.cbegin(); diff --git a/src/replay-state-access.h b/src/replay-state-access.h index 8ef97074f..991c2c0c3 100644 --- a/src/replay-state-access.h +++ b/src/replay-state-access.h @@ -43,50 +43,49 @@ namespace cartesi { /// \brief Allows replaying a uarch reset operation from an access log. class replay_state_access : public i_state_access { +public: using tree_type = machine_merkle_tree; using hash_type = tree_type::hash_type; using hasher_type = tree_type::hasher_type; using proof_type = tree_type::proof_type; - ///< Access log generated by log_reset_uarch - const std::vector &m_accesses; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) - ///< Index of next access to ne consumed - unsigned m_next_access{}; - ///< Root hash before next access - machine_merkle_tree::hash_type m_root_hash; - ///< Hasher needed to verify proofs - machine_merkle_tree::hasher_type m_hasher; + struct context { + /// \brief Constructor replay_state_access context + /// \param log Access log to be replayed + /// \param initial_hash Initial root hash + context(const access_log &log, machine_merkle_tree::hash_type initial_hash) : + accesses(log.get_accesses()), + root_hash(initial_hash) { + ; + } + const std::vector &accesses; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) + ///< Index of next access to ne consumed + unsigned int next_access{}; + ///< Root hash before next access + machine_merkle_tree::hash_type root_hash; + ///< Hasher needed to verify proofs + machine_merkle_tree::hasher_type hasher; + }; + 
+private: + context &m_context; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) public: /// \brief Constructor from access log - /// \param log Access log to be replayed - /// \param initial_hash Initial root hash - explicit replay_state_access(const access_log &log, const hash_type &initial_hash) : - m_accesses(log.get_accesses()), - m_root_hash{initial_hash} { - if (m_accesses.empty()) { + /// \param context Context with access log and initial root hash + explicit replay_state_access(replay_state_access::context &context) : m_context{context} { + if (m_context.accesses.empty()) { throw std::invalid_argument{"the access log has no accesses"}; } } - /// \brief No copy constructor - replay_state_access(const replay_state_access &) = delete; - /// \brief No copy assignment - replay_state_access &operator=(const replay_state_access &) = delete; - /// \brief No move constructor - replay_state_access(replay_state_access &&) = delete; - /// \brief No move assignment - replay_state_access &operator=(replay_state_access &&) = delete; - /// \brief Default destructor - ~replay_state_access() = default; - void get_root_hash(machine_merkle_tree::hash_type &hash) const { - hash = m_root_hash; + hash = m_context.root_hash; } /// \brief Checks if access log was fully consumed after reset operation is finished void finish() { - if (m_next_access != m_accesses.size()) { + if (m_context.next_access != m_context.accesses.size()) { throw std::invalid_argument{"access log was not fully consumed"}; } } @@ -95,7 +94,7 @@ class replay_state_access : public i_state_access; std::string access_to_report() const { - auto index = m_next_access + 1; + auto index = m_context.next_access + 1; switch (index) { case 1: return "1st access"; @@ -125,10 +124,10 @@ class replay_state_access : public i_state_access= m_accesses.size()) { + if (m_context.next_access >= m_context.accesses.size()) { throw std::invalid_argument{"too few accesses in log"}; } - const auto &access = 
m_accesses[m_next_access]; + const auto &access = m_context.accesses[m_context.next_access]; if (access.get_type() != access_type::read) { throw std::invalid_argument{"expected " + access_to_report() + " to read " + text}; } @@ -153,18 +152,18 @@ class replay_state_access : public i_state_access(paligned - pleaf_aligned); return get_word_access_data(read_data, word_offset); @@ -181,10 +180,10 @@ class replay_state_access : public i_state_access= m_accesses.size()) { + if (m_context.next_access >= m_context.accesses.size()) { throw std::invalid_argument{"too few accesses in log"}; } - const auto &access = m_accesses[m_next_access]; + const auto &access = m_context.accesses[m_context.next_access]; if (access.get_type() != access_type::write) { throw std::invalid_argument{"expected " + access_to_report() + " to write " + text}; } @@ -210,7 +209,7 @@ class replay_state_access : public i_state_access= m_accesses.size()) { + if (m_context.next_access >= m_context.accesses.size()) { throw std::invalid_argument{"too few accesses in log"}; } - const auto &access = m_accesses[m_next_access]; + const auto &access = m_context.accesses[m_context.next_access]; if (access.get_address() != paddr) { throw std::invalid_argument{"expected address of " + access_to_report() + " to match address of " + text}; } @@ -344,13 +343,13 @@ class replay_state_access : public i_state_access { public: - using hash_type = std::array; - static_assert(sizeof(hash_type) == interop_machine_hash_byte_size); - -private: using address_type = uint64_t; using data_type = unsigned char[PMA_PAGE_SIZE]; + using hash_type = std::array; + static_assert(sizeof(hash_type) == interop_machine_hash_byte_size); struct PACKED page_type { address_type index; @@ -178,19 +177,27 @@ class replay_step_state_access : public i_state_access, PMA_MAX> m_pmas{}; ///< Array of PMA entries + struct context { + uint64_t page_count{0}; ///< Number of pages in the step log + page_type *pages{nullptr}; ///< Array of page data + 
uint64_t sibling_count{0}; ///< Number of sibling hashes in the step log + hash_type *sibling_hashes{nullptr}; ///< Array of sibling hashes + std::array, PMA_MAX> pmas{}; ///< Array of PMA entries + }; + +private: + context &m_context; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) public: // \brief Construct a replay_step_state_access object from a log image and expected initial root hash + // \param context The context object to be filled with the replay step log data // \param log_image Image of the step log file // \param log_size The size of the log data // \param root_hash_before The expected machine root hash before the replay // \throw runtime_error if the initial root hash does not match or the log data is invalid - replay_step_state_access(unsigned char *log_image, uint64_t log_size, const hash_type &root_hash_before) { + replay_step_state_access(context &context, unsigned char *log_image, uint64_t log_size, + const hash_type &root_hash_before) : + m_context(context) { // relevant offsets in the log data uint64_t first_page_offset{}; uint64_t first_siblng_offset{}; @@ -203,36 +210,36 @@ class replay_step_state_access : public i_state_access(log_image + first_page_offset); + m_context.pages = reinterpret_cast(log_image + first_page_offset); // set sibling count and hashes - if (!validate_and_advance_offset(log_size, sibling_count_offset, sizeof(m_sibling_count), 1, + if (!validate_and_advance_offset(log_size, sibling_count_offset, sizeof(m_context.sibling_count), 1, &first_siblng_offset)) { interop_throw_runtime_error("sibling count past end of step log"); } - memcpy(&m_sibling_count, log_image + sibling_count_offset, sizeof(m_sibling_count)); + memcpy(&m_context.sibling_count, log_image + sibling_count_offset, sizeof(m_context.sibling_count)); // set sibling hashes - if (!validate_and_advance_offset(log_size, first_siblng_offset, sizeof(hash_type), m_sibling_count, + if (!validate_and_advance_offset(log_size, first_siblng_offset, 
sizeof(hash_type), m_context.sibling_count, &end_offset)) { interop_throw_runtime_error("sibling hashes past end of step log"); } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) - m_sibling_hashes = reinterpret_cast(log_image + first_siblng_offset); + m_context.sibling_hashes = reinterpret_cast(log_image + first_siblng_offset); // ensure that we read exactly the expected log size if (end_offset != log_size) { @@ -242,11 +249,11 @@ class replay_step_state_access : public i_state_access 0 && m_pages[i - 1].index >= m_pages[i].index) { + for (uint64_t i = 0; i < m_context.page_count; i++) { + if (i > 0 && m_context.pages[i - 1].index >= m_context.pages[i].index) { interop_throw_runtime_error("invalid log format: page index is not in increasing order"); } - if (m_pages[i].hash != all_zeros) { + if (m_context.pages[i].hash != all_zeros) { interop_throw_runtime_error("invalid log format: page scratch hash area is not zero"); } } @@ -262,12 +269,6 @@ class replay_step_state_access : public i_state_access(); } - replay_step_state_access(const replay_step_state_access &) = delete; - replay_step_state_access(replay_step_state_access &&) = delete; - replay_step_state_access &operator=(const replay_step_state_access &) = delete; - replay_step_state_access &operator=(replay_step_state_access &&) = delete; - ~replay_step_state_access() = default; - // \brief Finish the replay and check the final machine root hash // \param final_root_hash The expected final machine root hash // \throw runtime_error if the final root hash does not match @@ -344,13 +345,13 @@ class replay_step_state_access : public i_state_access> PMA_PAGE_SIZE_LOG2; uint64_t min{0}; - uint64_t max{m_page_count}; + uint64_t max{m_context.page_count}; while (min < max) { auto mid = (min + max) >> 1; - if (m_pages[mid].index == page_index) { - return &m_pages[mid]; + if (m_context.pages[mid].index == page_index) { + return &m_context.pages[mid]; } - if (m_pages[mid].index < page_index) { + if 
(m_context.pages[mid].index < page_index) { min = mid + 1; } else { max = mid; @@ -420,20 +421,20 @@ class replay_step_state_access : public i_state_access(&m_pages[i].hash)); + reinterpret_cast(&m_context.pages[i].hash)); } size_t next_page = 0; size_t next_sibling = 0; auto root_hash = compute_root_hash_impl(0, interop_log2_root_size - PMA_PAGE_SIZE_LOG2, next_page, next_sibling); - if (next_page != m_page_count) { - interop_throw_runtime_error("compute_root_hash: next_page != m_page_count"); + if (next_page != m_context.page_count) { + interop_throw_runtime_error("compute_root_hash: next_page != m_context.page_count"); } - if (next_sibling != m_sibling_count) { + if (next_sibling != m_context.sibling_count) { interop_throw_runtime_error("compute_root_hash: sibling hashes not totally consumed"); } return root_hash; @@ -449,12 +450,12 @@ class replay_step_state_access : public i_state_access= m_page_count || page_index + page_count <= m_pages[next_page].index) { - if (next_sibling >= m_sibling_count) { + if (next_page >= m_context.page_count || page_index + page_count <= m_context.pages[next_page].index) { + if (next_sibling >= m_context.sibling_count) { interop_throw_runtime_error( "compute_root_hash_impl: trying to access beyond sibling count while skipping range"); } - return m_sibling_hashes[next_sibling++]; + return m_context.sibling_hashes[next_sibling++]; } if (page_count_log2_size > 0) { auto left = compute_root_hash_impl(page_index, page_count_log2_size - 1, next_page, next_sibling); @@ -465,13 +466,13 @@ class replay_step_state_access : public i_state_access(&hash)); return hash; } - if (m_pages[next_page].index == page_index) { - return m_pages[next_page++].hash; + if (m_context.pages[next_page].index == page_index) { + return m_context.pages[next_page++].hash; } - if (next_sibling >= m_sibling_count) { + if (next_sibling >= m_context.sibling_count) { interop_throw_runtime_error("compute_root_hash_impl: trying to access beyond sibling count"); } - 
return m_sibling_hashes[next_sibling++]; + return m_context.sibling_hashes[next_sibling++]; // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast)) } @@ -893,7 +894,7 @@ class replay_step_state_access : public i_state_access mock_pma_entry &do_find_pma_entry(uint64_t paddr) { - for (size_t i = 0; i < m_pmas.size(); i++) { + for (size_t i = 0; i < m_context.pmas.size(); i++) { auto &pma = get_pma_entry(static_cast(i)); if (pma.get_istart_E()) { return pma; @@ -908,11 +909,11 @@ class replay_step_state_access : public i_state_access> 26 (6 most significant bits of funct7) can be /// used to identify the SRI instructions enum insn_SRLI_SRAI_funct7_sr1 : uint32_t { SRLI = 0b000000, SRAI = 0b010000 }; diff --git a/src/send-cmio-response.cpp b/src/send-cmio-response.cpp index e94304413..d03279a93 100644 --- a/src/send-cmio-response.cpp +++ b/src/send-cmio-response.cpp @@ -32,7 +32,7 @@ namespace cartesi { template -void send_cmio_response(STATE_ACCESS &a, uint16 reason, bytes data, uint32 dataLength) { +void send_cmio_response(STATE_ACCESS a, uint16 reason, bytes data, uint32 dataLength) { if (!readIflagsY(a)) { throwRuntimeError(a, "iflags.Y is not set"); } @@ -61,13 +61,13 @@ void send_cmio_response(STATE_ACCESS &a, uint16 reason, bytes data, uint32 dataL } // Explicit instantiation for state_access -template void send_cmio_response(state_access &a, uint16_t reason, const unsigned char *data, uint32 length); +template void send_cmio_response(state_access a, uint16_t reason, const unsigned char *data, uint32 length); // Explicit instantiation for record_state_access -template void send_cmio_response(record_state_access &a, uint16_t reason, const unsigned char *data, uint32 length); +template void send_cmio_response(record_state_access a, uint16_t reason, const unsigned char *data, uint32 length); // Explicit instantiation for replay_state_access -template void send_cmio_response(replay_state_access &a, uint16_t reason, const unsigned char *data, uint32 length); 
+template void send_cmio_response(replay_state_access a, uint16_t reason, const unsigned char *data, uint32 length); } // namespace cartesi // NOLINTEND(google-readability-casting,misc-const-correctness,modernize-use-auto,hicpp-use-auto) diff --git a/src/send-cmio-response.h b/src/send-cmio-response.h index 11fb4c923..1f8d4e4e4 100644 --- a/src/send-cmio-response.h +++ b/src/send-cmio-response.h @@ -28,22 +28,22 @@ namespace cartesi { /// \param data Response data /// \param length Response data length template -void send_cmio_response(STATE_ACCESS &a, uint16_t reason, const unsigned char *data, uint32_t dataLength); +void send_cmio_response(STATE_ACCESS a, uint16_t reason, const unsigned char *data, uint32_t dataLength); class state_access; class record_state_access; class replay_state_access; // Declaration of explicit instantiation in module send_cmio_response.cpp -extern template void send_cmio_response(state_access &a, uint16_t reason, const unsigned char *data, +extern template void send_cmio_response(state_access a, uint16_t reason, const unsigned char *data, uint32_t dataLength); // Declaration of explicit instantiation in module uarch-reset-state.cpp -extern template void send_cmio_response(record_state_access &a, uint16_t reason, const unsigned char *data, +extern template void send_cmio_response(record_state_access a, uint16_t reason, const unsigned char *data, uint32_t dataLength); // Declaration of explicit instantiation in module uarch-reset-state.cpp -extern template void send_cmio_response(replay_state_access &a, uint16_t reason, const unsigned char *data, +extern template void send_cmio_response(replay_state_access a, uint16_t reason, const unsigned char *data, uint32_t dataLength); } // namespace cartesi diff --git a/src/soft-float.h b/src/soft-float.h index 0354c36ae..d87040be8 100644 --- a/src/soft-float.h +++ b/src/soft-float.h @@ -194,7 +194,7 @@ struct i_sfloat { /// \brief Right shift that takes rounding in account, used for adjust mantissa. 
static F_UINT mant_rshift_rnd(F_UINT a, int d) { - if (d != 0) { + if (d > 0) { if (d >= F_SIZE) { return (a != 0); } @@ -326,7 +326,7 @@ struct i_sfloat { } /// \brief Addition operation. - static F_UINT add(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT add(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { // swap so that abs(a) >= abs(b) if ((a & ~SIGN_MASK) < (b & ~SIGN_MASK)) { const F_UINT tmp = a; @@ -379,7 +379,7 @@ struct i_sfloat { } /// \brief Multiply operation. - static F_UINT mul(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT mul(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); const uint32_t b_sign = b >> (F_SIZE - 1); const uint32_t r_sign = a_sign ^ b_sign; @@ -425,7 +425,7 @@ struct i_sfloat { } /// \brief Fused multiply and add operation. - static F_UINT fma(F_UINT a, F_UINT b, F_UINT c, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT fma(F_UINT a, F_UINT b, F_UINT c, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); const uint32_t b_sign = b >> (F_SIZE - 1); uint32_t c_sign = c >> (F_SIZE - 1); @@ -551,7 +551,7 @@ struct i_sfloat { } /// \brief Division operation. - static F_UINT div(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT div(F_UINT a, F_UINT b, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); const uint32_t b_sign = b >> (F_SIZE - 1); const uint32_t r_sign = a_sign ^ b_sign; @@ -612,7 +612,7 @@ struct i_sfloat { } /// \brief Square root operation. - static F_UINT sqrt(F_UINT a, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT sqrt(F_UINT a, FRM_modes rm, uint32_t *pfflags) { const uint32_t a_sign = a >> (F_SIZE - 1); int32_t a_exp = (a >> MANT_SIZE) & EXP_MASK; F_UINT a_mant = a & MANT_MASK; @@ -673,7 +673,7 @@ struct i_sfloat { } /// \brief Min operation. 
- static F_UINT min(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE F_UINT min(F_UINT a, F_UINT b, uint32_t *pfflags) { if (isnan(a) || isnan(b)) { return min_max_nan(a, b, pfflags); } @@ -686,7 +686,7 @@ struct i_sfloat { } /// \brief Max operation. - static F_UINT max(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE F_UINT max(F_UINT a, F_UINT b, uint32_t *pfflags) { if (isnan(a) || isnan(b)) { return min_max_nan(a, b, pfflags); } @@ -699,7 +699,7 @@ struct i_sfloat { } /// \brief Equal operation. - static bool eq(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE bool eq(F_UINT a, F_UINT b, uint32_t *pfflags) { if (unlikely(isnan(a) || isnan(b))) { if (issignan(a) || issignan(b)) { *pfflags |= FFLAGS_NV_MASK; @@ -713,7 +713,7 @@ struct i_sfloat { } /// \brief Less or equal than operation. - static bool le(F_UINT a, F_UINT b, uint32_t *pfflags) { + static NO_INLINE bool le(F_UINT a, F_UINT b, uint32_t *pfflags) { if (unlikely(isnan(a) || isnan(b))) { *pfflags |= FFLAGS_NV_MASK; return false; @@ -727,7 +727,7 @@ struct i_sfloat { } /// \brief Less than operation. - static bool lt(F_UINT a, F_UINT b, uint32_t *pfflags) { // NOLINT(misc-confusable-identifiers) + static NO_INLINE bool lt(F_UINT a, F_UINT b, uint32_t *pfflags) { // NOLINT(misc-confusable-identifiers) if (unlikely(isnan(a) || isnan(b))) { *pfflags |= FFLAGS_NV_MASK; return false; @@ -741,7 +741,7 @@ struct i_sfloat { } /// \brief Retrieves float class. - static uint32_t fclass(F_UINT a) { + static NO_INLINE uint32_t fclass(F_UINT a) { const uint32_t a_sign = a >> (F_SIZE - 1); const int32_t a_exp = (a >> MANT_SIZE) & EXP_MASK; const F_UINT a_mant = a & MANT_MASK; @@ -762,7 +762,7 @@ struct i_sfloat { /// \brief Conversion from float to integer. 
template - static ICVT_INT cvt_f_i(F_UINT a, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE ICVT_INT cvt_f_i(F_UINT a, FRM_modes rm, uint32_t *pfflags) { using ICVT_UINT = std::make_unsigned_t; constexpr bool IS_UNSIGNED = std::is_unsigned_v; constexpr int ICVT_SIZE = sizeof(ICVT_UINT) * 8; @@ -841,7 +841,7 @@ struct i_sfloat { /// \brief Conversion from integer to float. template - static F_UINT cvt_i_f(ICVT_INT a, FRM_modes rm, uint32_t *pfflags) { + static NO_INLINE F_UINT cvt_i_f(ICVT_INT a, FRM_modes rm, uint32_t *pfflags) { using ICVT_UINT = std::make_unsigned_t; constexpr bool IS_UNSIGNED = std::is_unsigned_v; constexpr int ICVT_SIZE = sizeof(ICVT_UINT) * 8; @@ -870,7 +870,7 @@ using i_sfloat32 = i_sfloat; // Interface for single-precision using i_sfloat64 = i_sfloat; // Interface for double-precision floating-point /// \brief Conversion from float32 to float64. -static uint64_t sfloat_cvt_f32_f64(uint32_t a, uint32_t *pfflags) { +static NO_INLINE uint64_t sfloat_cvt_f32_f64(uint32_t a, uint32_t *pfflags) { uint32_t a_sign = 0; int32_t a_exp = 0; i_sfloat64::F_UINT a_mant = i_sfloat32::unpack(&a_sign, &a_exp, a); @@ -899,7 +899,7 @@ static uint64_t sfloat_cvt_f32_f64(uint32_t a, uint32_t *pfflags) { } /// \brief Conversion from float64 to float32. 
-static uint32_t sfloat_cvt_f64_f32(uint64_t a, FRM_modes rm, uint32_t *pfflags) { +static NO_INLINE uint32_t sfloat_cvt_f64_f32(uint64_t a, FRM_modes rm, uint32_t *pfflags) { uint32_t a_sign = 0; int32_t a_exp = 0; i_sfloat64::F_UINT a_mant = i_sfloat64::unpack(&a_sign, &a_exp, a); diff --git a/src/state-access.h b/src/state-access.h index 71a48a15e..0ebabe3d2 100644 --- a/src/state-access.h +++ b/src/state-access.h @@ -56,17 +56,6 @@ class state_access : public i_state_access { ; } - /// \brief No copy constructor - state_access(const state_access &) = delete; - /// \brief No copy assignment - state_access &operator=(const state_access &) = delete; - /// \brief No move constructor - state_access(state_access &&) = delete; - /// \brief No move assignment - state_access &operator=(state_access &&) = delete; - /// \brief Default destructor - ~state_access() = default; - const machine &get_naked_machine() const { return m_m; } diff --git a/src/translate-virtual-address.h b/src/translate-virtual-address.h index 5044fdfd4..b5e7f8fce 100644 --- a/src/translate-virtual-address.h +++ b/src/translate-virtual-address.h @@ -57,7 +57,7 @@ namespace cartesi { /// \param val Value to write. /// \returns True if succeeded, false otherwise. template -static inline bool write_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t val) { +static inline bool write_ram_uint64(STATE_ACCESS a, uint64_t paddr, uint64_t val) { auto &pma = a.template find_pma_entry(paddr); if (unlikely(!pma.get_istart_M() || !pma.get_istart_W())) { return false; @@ -79,7 +79,7 @@ static inline bool write_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t va /// \param pval Pointer to word. /// \returns True if succeeded, false otherwise. 
template -static inline bool read_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t *pval) { +static inline bool read_ram_uint64(STATE_ACCESS a, uint64_t paddr, uint64_t *pval) { auto &pma = a.template find_pma_entry(paddr); if (unlikely(!pma.get_istart_M() || !pma.get_istart_R())) { return false; @@ -102,7 +102,7 @@ static inline bool read_ram_uint64(STATE_ACCESS &a, uint64_t paddr, uint64_t *pv /// \details This function is outlined to minimize host CPU code cache pressure. /// \returns True if succeeded, false otherwise. template -static NO_INLINE bool translate_virtual_address(STATE_ACCESS &a, uint64_t *ppaddr, uint64_t vaddr, int xwr_shift) { +static NO_INLINE bool translate_virtual_address(STATE_ACCESS a, uint64_t *ppaddr, uint64_t vaddr, int xwr_shift) { auto priv = a.read_iflags_PRV(); const uint64_t mstatus = a.read_mstatus(); diff --git a/src/uarch-record-state-access.h b/src/uarch-record-state-access.h index 4b664c5f2..18122fd9a 100644 --- a/src/uarch-record-state-access.h +++ b/src/uarch-record-state-access.h @@ -449,8 +449,9 @@ class uarch_record_state_access : public i_uarch_state_accessget_log_type().has_large_data()) { // log written data, if debug info is enabled a.get_written().emplace(get_uarch_state_image()); diff --git a/src/uarch-state-access.h b/src/uarch-state-access.h index 02d8e999e..c0578f2e6 100644 --- a/src/uarch-state-access.h +++ b/src/uarch-state-access.h @@ -195,8 +195,9 @@ class uarch_state_access : public i_uarch_state_access { if (uarch_pristine_ram_len > m_us.ram.get_length()) { throw std::runtime_error("embedded uarch ram image does not fit in uarch ram pma"); } - m_us.ram.fill_memory(m_us.ram.get_start(), 0, m_us.ram.get_length()); m_us.ram.write_memory(m_us.ram.get_start(), uarch_pristine_ram, uarch_pristine_ram_len); + m_us.ram.fill_memory(m_us.ram.get_start() + uarch_pristine_ram_len, 0, + m_us.ram.get_length() - uarch_pristine_ram_len); } }; diff --git a/tools/gen-interpret-jump-table.lua 
b/tools/gen-interpret-jump-table.lua new file mode 100755 index 000000000..173173d3e --- /dev/null +++ b/tools/gen-interpret-jump-table.lua @@ -0,0 +1,602 @@ +#!/usr/bin/env lua5.4 + +--[[ +This file is used to generate the interpret-jump-table.h header used in the interpreter. +Its purpose is to generate a big jump table covering most RISC-V instructions, +so we can decode most instructions with a single jump. + +At the moment this file is a bit hacky and slow, it could be optimized in the future. +]] + +--[[ +List of RISC-V instructions taken from the RISC-V specification. +Bits marked as 0 and 1 are fixed; bits marked as _ are placeholders accepting either 0 or 1. +When `rd0_special` is set, the instruction has a specialization for rd == 0. +When `rm = true` is set, the instruction has specializations for floating-point rounding modes. +]] +local insns = { + -- RV32I + { bits = "_________________________0110111", name = "LUI", rd0_special = true }, + { bits = "_________________________0010111", name = "AUIPC", rd0_special = true }, + { bits = "_________________________1101111", name = "JAL", rd0_special = true }, + { bits = "_________________000_____1100111", name = "JALR", rd0_special = true }, + { bits = "_________________000_____1100011", name = "BEQ" }, + { bits = "_________________001_____1100011", name = "BNE" }, + { bits = "_________________100_____1100011", name = "BLT" }, + { bits = "_________________101_____1100011", name = "BGE" }, + { bits = "_________________110_____1100011", name = "BLTU" }, + { bits = "_________________111_____1100011", name = "BGEU" }, + { bits = "_________________000_____0000011", name = "LB", rd0_special = true }, + { bits = "_________________001_____0000011", name = "LH", rd0_special = true }, + { bits = "_________________010_____0000011", name = "LW", rd0_special = true }, + { bits = "_________________100_____0000011", name = "LBU", rd0_special = true }, + { bits = "_________________101_____0000011", name = "LHU", rd0_special
= true }, + { bits = "_________________000_____0100011", name = "SB" }, + { bits = "_________________001_____0100011", name = "SH" }, + { bits = "_________________010_____0100011", name = "SW" }, + { bits = "_________________000_____0010011", name = "ADDI", rd0_special = true }, + { bits = "_________________010_____0010011", name = "SLTI", rd0_special = true }, + { bits = "_________________011_____0010011", name = "SLTIU", rd0_special = true }, + { bits = "_________________100_____0010011", name = "XORI", rd0_special = true }, + { bits = "_________________110_____0010011", name = "ORI", rd0_special = true }, + { bits = "_________________111_____0010011", name = "ANDI", rd0_special = true }, + { bits = "000000___________001_____0010011", name = "SLLI", rd0_special = true }, + { bits = "000000___________101_____0010011", name = "SRLI", rd0_special = true }, + { bits = "010000___________101_____0010011", name = "SRAI", rd0_special = true }, + { bits = "0000000__________000_____0110011", name = "ADD", rd0_special = true }, + { bits = "0100000__________000_____0110011", name = "SUB", rd0_special = true }, + { bits = "0000000__________001_____0110011", name = "SLL", rd0_special = true }, + { bits = "0000000__________010_____0110011", name = "SLT", rd0_special = true }, + { bits = "0000000__________011_____0110011", name = "SLTU", rd0_special = true }, + { bits = "0000000__________100_____0110011", name = "XOR", rd0_special = true }, + { bits = "0000000__________101_____0110011", name = "SRL", rd0_special = true }, + { bits = "0100000__________101_____0110011", name = "SRA", rd0_special = true }, + { bits = "0000000__________110_____0110011", name = "OR", rd0_special = true }, + { bits = "0000000__________111_____0110011", name = "AND", rd0_special = true }, + { bits = "_________________000_____0001111", name = "FENCE" }, + { bits = "00000000000000000000000001110011", name = "ECALL" }, + { bits = "00000000000100000000000001110011", name = "EBREAK" }, + + -- RV64I + { bits 
= "_________________110_____0000011", name = "LWU", rd0_special = true }, + { bits = "_________________011_____0000011", name = "LD", rd0_special = true }, + { bits = "_________________011_____0100011", name = "SD" }, + { bits = "_________________000_____0011011", name = "ADDIW", rd0_special = true }, + { bits = "0000000__________001_____0011011", name = "SLLIW", rd0_special = true }, + { bits = "0000000__________101_____0011011", name = "SRLIW", rd0_special = true }, + { bits = "0100000__________101_____0011011", name = "SRAIW", rd0_special = true }, + { bits = "0000000__________000_____0111011", name = "ADDW", rd0_special = true }, + { bits = "0100000__________000_____0111011", name = "SUBW", rd0_special = true }, + { bits = "0000000__________001_____0111011", name = "SLLW", rd0_special = true }, + { bits = "0000000__________101_____0111011", name = "SRLW", rd0_special = true }, + { bits = "0100000__________101_____0111011", name = "SRAW", rd0_special = true }, + + -- RV32M extension + { bits = "0000001__________000_____0110011", name = "MUL", rd0_special = true }, + { bits = "0000001__________001_____0110011", name = "MULH", rd0_special = true }, + { bits = "0000001__________010_____0110011", name = "MULHSU", rd0_special = true }, + { bits = "0000001__________011_____0110011", name = "MULHU", rd0_special = true }, + { bits = "0000001__________100_____0110011", name = "DIV", rd0_special = true }, + { bits = "0000001__________101_____0110011", name = "DIVU", rd0_special = true }, + { bits = "0000001__________110_____0110011", name = "REM", rd0_special = true }, + { bits = "0000001__________111_____0110011", name = "REMU", rd0_special = true }, + + -- RV64M + { bits = "0000001__________000_____0111011", name = "MULW", rd0_special = true }, + { bits = "0000001__________100_____0111011", name = "DIVW", rd0_special = true }, + { bits = "0000001__________101_____0111011", name = "DIVUW", rd0_special = true }, + { bits = "0000001__________110_____0111011", name = 
"REMW", rd0_special = true }, + { bits = "0000001__________111_____0111011", name = "REMUW", rd0_special = true }, + + -- RV32A + { bits = "00010__00000_____010_____0101111", name = "LR.W" }, + { bits = "00011____________010_____0101111", name = "SC.W" }, + { bits = "00001____________010_____0101111", name = "AMOSWAP.W" }, + { bits = "00000____________010_____0101111", name = "AMOADD.W" }, + { bits = "00100____________010_____0101111", name = "AMOXOR.W" }, + { bits = "01100____________010_____0101111", name = "AMOAND.W" }, + { bits = "01000____________010_____0101111", name = "AMOOR.W" }, + { bits = "10000____________010_____0101111", name = "AMOMIN.W" }, + { bits = "10100____________010_____0101111", name = "AMOMAX.W" }, + { bits = "11000____________010_____0101111", name = "AMOMINU.W" }, + { bits = "11100____________010_____0101111", name = "AMOMAXU.W" }, + + -- RV64A + { bits = "00010__00000_____011_____0101111", name = "LR.D" }, + { bits = "00011____________011_____0101111", name = "SC.D" }, + { bits = "00001____________011_____0101111", name = "AMOSWAP.D" }, + { bits = "00000____________011_____0101111", name = "AMOADD.D" }, + { bits = "00100____________011_____0101111", name = "AMOXOR.D" }, + { bits = "01100____________011_____0101111", name = "AMOAND.D" }, + { bits = "01000____________011_____0101111", name = "AMOOR.D" }, + { bits = "10000____________011_____0101111", name = "AMOMIN.D" }, + { bits = "10100____________011_____0101111", name = "AMOMAX.D" }, + { bits = "11000____________011_____0101111", name = "AMOMINU.D" }, + { bits = "11100____________011_____0101111", name = "AMOMAXU.D" }, + + -- RV32F extension + { bits = "_________________010_____0000111", name = "FLW" }, + { bits = "_________________010_____0100111", name = "FSW" }, + { bits = "_____00__________________1000011", name = "FMADD.S", rm = true }, + { bits = "_____00__________________1000111", name = "FMSUB.S", rm = true }, + { bits = "_____00__________________1001011", name = "FNMSUB.S", rm 
= true }, + { bits = "_____00__________________1001111", name = "FNMADD.S", rm = true }, + { bits = "0000000__________________1010011", name = "FADD.S", rm = true }, + { bits = "0000100__________________1010011", name = "FSUB.S", rm = true }, + { bits = "0001000__________________1010011", name = "FMUL.S", rm = true }, + { bits = "0001100__________________1010011", name = "FDIV.S", rm = true }, + { bits = "010110000000_____________1010011", name = "FSQRT.S", rm = true }, + { bits = "0010000__________000_____1010011", name = "FSGNJ.S" }, + { bits = "0010000__________001_____1010011", name = "FSGNJN.S" }, + { bits = "0010000__________010_____1010011", name = "FSGNJX.S" }, + { bits = "0010100__________000_____1010011", name = "FMIN.S" }, + { bits = "0010100__________001_____1010011", name = "FMAX.S" }, + { bits = "110000000000_____________1010011", name = "FCVT.W.S", rm = true }, + { bits = "110000000001_____________1010011", name = "FCVT.WU.S", rm = true }, + { bits = "111000000000_____000_____1010011", name = "FMV.X.W" }, + { bits = "1010000__________010_____1010011", name = "FEQ.S" }, + { bits = "1010000__________001_____1010011", name = "FLT.S" }, + { bits = "1010000__________000_____1010011", name = "FLE.S" }, + { bits = "111000000000_____001_____1010011", name = "FCLASS.S" }, + { bits = "110100000000_____________1010011", name = "FCVT.S.W", rm = true }, + { bits = "110100000001_____________1010011", name = "FCVT.S.WU", rm = true }, + { bits = "111100000000_____000_____1010011", name = "FMV.W.X" }, + + -- RV64F + { bits = "110000000010_____________1010011", name = "FCVT.L.S", rm = true }, + { bits = "110000000011_____________1010011", name = "FCVT.LU.S", rm = true }, + { bits = "110100000010_____________1010011", name = "FCVT.S.L", rm = true }, + { bits = "110100000011_____________1010011", name = "FCVT.S.LU", rm = true }, + + -- RV32D + { bits = "_________________011_____0000111", name = "FLD" }, + { bits = "_________________011_____0100111", name = "FSD" }, + { 
bits = "_____01__________________1000011", name = "FMADD.D", rm = true }, + { bits = "_____01__________________1000111", name = "FMSUB.D", rm = true }, + { bits = "_____01__________________1001011", name = "FNMSUB.D", rm = true }, + { bits = "_____01__________________1001111", name = "FNMADD.D", rm = true }, + { bits = "0000001__________________1010011", name = "FADD.D", rm = true }, + { bits = "0000101__________________1010011", name = "FSUB.D", rm = true }, + { bits = "0001001__________________1010011", name = "FMUL.D", rm = true }, + { bits = "0001101__________________1010011", name = "FDIV.D", rm = true }, + { bits = "010110100000_____________1010011", name = "FSQRT.D", rm = true }, + { bits = "0010001__________000_____1010011", name = "FSGNJ.D" }, + { bits = "0010001__________001_____1010011", name = "FSGNJN.D" }, + { bits = "0010001__________010_____1010011", name = "FSGNJX.D" }, + { bits = "0010101__________000_____1010011", name = "FMIN.D" }, + { bits = "0010101__________001_____1010011", name = "FMAX.D" }, + { bits = "010000000001_____________1010011", name = "FCVT.S.D", rm = true }, + { bits = "010000100000_____________1010011", name = "FCVT.D.S", rm = true }, + { bits = "1010001__________010_____1010011", name = "FEQ.D" }, + { bits = "1010001__________001_____1010011", name = "FLT.D" }, + { bits = "1010001__________000_____1010011", name = "FLE.D" }, + { bits = "111000100000_____001_____1010011", name = "FCLASS.D", rm = true }, + { bits = "110000100000_____________1010011", name = "FCVT.W.D", rm = true }, + { bits = "110000100001_____________1010011", name = "FCVT.WU.D", rm = true }, + { bits = "110100100000_____________1010011", name = "FCVT.D.W", rm = true }, + { bits = "110100100001_____________1010011", name = "FCVT.D.WU", rm = true }, + -- RV64D + { bits = "110000100010_____________1010011", name = "FCVT.L.D", rm = true }, + { bits = "110000100011_____________1010011", name = "FCVT.LU.D", rm = true }, + { bits = "111000100000_____000_____1010011", 
name = "FMV.X.D" }, + { bits = "110100100010_____________1010011", name = "FCVT.D.L", rm = true }, + { bits = "110100100011_____________1010011", name = "FCVT.D.LU", rm = true }, + { bits = "111100100000_____000_____1010011", name = "FMV.D.X" }, + + -- Zifencei extension + { bits = "_________________001_____0001111", name = "FENCE.I" }, + + -- Zicsr extension + { bits = "_________________001_____1110011", name = "CSRRW" }, + { bits = "_________________010_____1110011", name = "CSRRS" }, + { bits = "_________________011_____1110011", name = "CSRRC" }, + { bits = "_________________101_____1110011", name = "CSRRWI" }, + { bits = "_________________110_____1110011", name = "CSRRSI" }, + { bits = "_________________111_____1110011", name = "CSRRCI" }, + + -- Privileged + { bits = "00010000001000000000000001110011", name = "SRET" }, + { bits = "00110000001000000000000001110011", name = "MRET" }, + { bits = "01110000001000000000000001110011", name = "MNRET" }, + { bits = "00010000010100000000000001110011", name = "WFI" }, + { bits = "0001001__________000000001110011", name = "SFENCE.VMA" }, +} + +-- Converts an integer `num` to a base 2 string of length `nbits` +local function tobase2(num, nbits) + local t = {} + local bit = 1 << (nbits - 1) + for _ = 1, nbits do + table.insert(t, ((num & bit) ~= 0) and "1" or "0") + bit = bit >> 1 + end + return table.concat(t) +end + +-- Converts a string in base 2 to an integer. 
+local function frombase2(s) return tonumber(s, 2) end + +local c_insns = {} +local c_insn_by_idx = {} + +-- Fill compressed instructions table according to the RISC-V spec +do + local function add_c_insn(c_insn) + assert(#c_insn.bits == 16) + c_insn.name = c_insn.name:gsub("%.", "_") + table.insert(c_insns, c_insn) + local num_bits = frombase2(c_insn.bits) + assert(c_insn_by_idx[num_bits] == nil, "duplicated compressed instruction") + c_insn_by_idx[num_bits] = c_insn.name + end + do -- quadrant 0 + for rd = 0, (1 << 3) - 1 do + for imm = 1, (1 << 8) - 1 do + add_c_insn({ bits = "000" .. tobase2(imm, 8) .. tobase2(rd, 3) .. "00", name = "C.ADDI4SPN" }) + end + end + + for mid = 0, (1 << 11) - 1 do + add_c_insn({ bits = "001" .. tobase2(mid, 11) .. "00", name = "C.FLD" }) + add_c_insn({ bits = "010" .. tobase2(mid, 11) .. "00", name = "C.LW" }) + add_c_insn({ bits = "011" .. tobase2(mid, 11) .. "00", name = "C.LD" }) + add_c_insn({ bits = "101" .. tobase2(mid, 11) .. "00", name = "C.FSD" }) + add_c_insn({ bits = "110" .. tobase2(mid, 11) .. "00", name = "C.SW" }) + add_c_insn({ bits = "111" .. tobase2(mid, 11) .. "00", name = "C.SD" }) + end + end + + do -- quadrant 1 + for rd = 0, (1 << 5) - 1 do + for imm = 0, (1 << 6) - 1 do + if rd == 0 then + add_c_insn({ + bits = "000" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.NOP" or "C.HINT", + }) + else + add_c_insn({ + bits = "000" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.ADDI" or "C.HINT", + }) + add_c_insn({ + bits = "001" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = "C.ADDIW", + }) + end + add_c_insn({ + bits = "010" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = rd ~= 0 and "C.LI" or "C.HINT", + }) + + if imm ~= 0 then + if rd == 2 then + add_c_insn({ + bits = "011" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. 
"01", + name = rd ~= 0 and "C.ADDI16SP" or "C.HINT", + }) + else + add_c_insn({ + bits = "011" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "01", + name = rd ~= 0 and "C.LUI" or "C.HINT", + }) + end + end + end + end + + for rd = 0, (1 << 3) - 1 do + for imm = 0, (1 << 6) - 1 do + add_c_insn({ + bits = "100" .. tobase2(imm & 1, 1) .. "00" .. tobase2(rd, 3) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.SRLI" or "C.HINT", + }) + add_c_insn({ + bits = "100" .. tobase2(imm & 1, 1) .. "01" .. tobase2(rd, 3) .. tobase2(imm >> 1, 5) .. "01", + name = imm ~= 0 and "C.SRAI" or "C.HINT", + }) + add_c_insn({ + bits = "100" .. tobase2(imm & 1, 1) .. "10" .. tobase2(rd, 3) .. tobase2(imm >> 1, 5) .. "01", + name = "C.ANDI", + }) + end + end + + for rs1 = 0, (1 << 3) - 1 do + for rs2 = 0, (1 << 3) - 1 do + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "00" .. tobase2(rs2, 3) .. "01", name = "C.SUB" }) + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "01" .. tobase2(rs2, 3) .. "01", name = "C.XOR" }) + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "10" .. tobase2(rs2, 3) .. "01", name = "C.OR" }) + add_c_insn({ bits = "100011" .. tobase2(rs1, 3) .. "11" .. tobase2(rs2, 3) .. "01", name = "C.AND" }) + add_c_insn({ bits = "100111" .. tobase2(rs1, 3) .. "00" .. tobase2(rs2, 3) .. "01", name = "C.SUBW" }) + add_c_insn({ bits = "100111" .. tobase2(rs1, 3) .. "01" .. tobase2(rs2, 3) .. "01", name = "C.ADDW" }) + end + end + + for mid = 0, (1 << 11) - 1 do + add_c_insn({ bits = "101" .. tobase2(mid, 11) .. "01", name = "C.J" }) + add_c_insn({ bits = "110" .. tobase2(mid, 11) .. "01", name = "C.BEQZ" }) + add_c_insn({ bits = "111" .. tobase2(mid, 11) .. "01", name = "C.BNEZ" }) + end + end + + do -- quadrant 2 + for rd = 0, (1 << 5) - 1 do + for imm = 0, (1 << 6) - 1 do + add_c_insn({ + bits = "000" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. 
"10", + name = (imm ~= 0 and rd ~= 0) and "C.SLLI" or "C.HINT", + }) + + add_c_insn({ + bits = "001" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = "C.FLDSP", + }) + + if rd ~= 0 then + add_c_insn({ + bits = "010" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = "C.LWSP", + }) + add_c_insn({ + bits = "011" .. tobase2(imm & 1, 1) .. tobase2(rd, 5) .. tobase2(imm >> 1, 5) .. "10", + name = "C.LDSP", + }) + end + + do + local rs2 = rd + add_c_insn({ bits = "101" .. tobase2(imm, 6) .. tobase2(rs2, 5) .. "10", name = "C.FSDSP" }) + add_c_insn({ bits = "110" .. tobase2(imm, 6) .. tobase2(rs2, 5) .. "10", name = "C.SWSP" }) + add_c_insn({ bits = "111" .. tobase2(imm, 6) .. tobase2(rs2, 5) .. "10", name = "C.SDSP" }) + end + end + end + + for rs1 = 0, (1 << 5) - 1 do + for rs2 = 0, (1 << 5) - 1 do + if rs2 == 0 then + if rs1 == 0 then + add_c_insn({ bits = "1001" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", name = "C.EBREAK" }) + else + add_c_insn({ bits = "1000" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", name = "C.JR" }) + add_c_insn({ bits = "1001" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", name = "C.JALR" }) + end + elseif rs2 ~= 0 then + add_c_insn({ + bits = "1000" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", + name = rs1 ~= 0 and "C.MV" or "C.HINT", + }) + add_c_insn({ + bits = "1001" .. tobase2(rs1, 5) .. tobase2(rs2, 5) .. "10", + name = rs1 ~= 0 and "C.ADD" or "C.HINT", + }) + end + end + end + end +end + +-- Replace FD instructions that needs rounding discarding invalid round modes +local valid_rms = { + "000", -- rne + "001", -- rtz + "010", -- rdn + "011", -- rup + "100", -- rmm + "111", -- dyn +} +for _, insn in ipairs(insns) do + if insn.rm then + local lbits, rbits = insn.bits:sub(1, 17), insn.bits:sub(21, 32) + insn.bits = lbits .. valid_rms[1] .. rbits + insn.rm = nil + for i = 2, #valid_rms do + table.insert(insns, { bits = lbits .. valid_rms[i] .. 
rbits, name = insn.name }) + end + end +end + +-- Table used to rename a group of instructions to a single name. +local group_names = { + -- I + ["ADD|SUB|MUL"] = "ADD_MUL_SUB", + ["ADDW|SUBW|MULW"] = "ADDW_MULW_SUBW", + ["SRL|SRA|DIVU"] = "SRL_DIVU_SRA", + ["SRLW|SRAW|DIVUW"] = "SRLW_DIVUW_SRAW", + -- A + ["LR.W|SC.W|AMOSWAP.W|AMOADD.W|AMOXOR.W|AMOAND.W|AMOOR.W|AMOMIN.W|AMOMAX.W|AMOMINU.W|AMOMAXU.W"] = "AMO_W", + ["LR.D|SC.D|AMOSWAP.D|AMOADD.D|AMOXOR.D|AMOAND.D|AMOOR.D|AMOMIN.D|AMOMAX.D|AMOMINU.D|AMOMAXU.D"] = "AMO_D", + -- FD + ["FMADD.S|FMADD.D"] = "FMADD", + ["FMSUB.S|FMSUB.D"] = "FMSUB", + ["FNMADD.S|FNMADD.D"] = "FNMADD", + ["FNMSUB.S|FNMSUB.D"] = "FNMSUB", + ["FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FSGNJ.S|FMIN.S|FCVT.W.S|FCVT.WU.S|FMV.X.W|FLE.S|FCVT.S.W|FCVT.S.WU|FMV.W.X|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FSGNJ.D|FMIN.D|FCVT.S.D|FCVT.D.S|FLE.D|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FMV.X.D|FCVT.D.L|FCVT.D.LU|FMV.D.X"] = "FD", + ["FSGNJN.S|FMAX.S|FLT.S|FCLASS.S|FSGNJN.D|FMAX.D|FLT.D|FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FCVT.W.S|FCVT.WU.S|FCVT.S.W|FCVT.S.WU|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FCVT.S.D|FCVT.D.S|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FCVT.D.L|FCVT.D.LU"] = "FD", + ["FSGNJX.S|FEQ.S|FSGNJX.D|FEQ.D|FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FCVT.W.S|FCVT.WU.S|FCVT.S.W|FCVT.S.WU|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FCVT.S.D|FCVT.D.S|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FCVT.D.L|FCVT.D.LU"] = "FD", + ["FADD.S|FSUB.S|FMUL.S|FDIV.S|FSQRT.S|FCVT.W.S|FCVT.WU.S|FCVT.S.W|FCVT.S.WU|FCVT.L.S|FCVT.LU.S|FCVT.S.L|FCVT.S.LU|FADD.D|FSUB.D|FMUL.D|FDIV.D|FSQRT.D|FCVT.S.D|FCVT.D.S|FCLASS.D|FCVT.W.D|FCVT.WU.D|FCVT.D.W|FCVT.D.WU|FCVT.L.D|FCVT.LU.D|FCVT.D.L|FCVT.D.LU"] = "FD", + -- privileged + ["ECALL|EBREAK|SRET|MRET|MNRET|WFI|SFENCE.VMA"] = "PRIVILEGED", + ["SFENCE.VMA"] =
"PRIVILEGED", +} + +--[[ +Instruction mask bits +- 4 bits on the left (1 bit + funct3) +- 7 bits on the right (opcode) +- Checking these bits is enough to make a big switch covering most uncompressed/compressed instructions. +]] +local lmask_bits = 4 +local rmask_bits = 7 +local mask_bits = lmask_bits + rmask_bits +local lmask = (1 << lmask_bits) - 1 +local rmask = (1 << rmask_bits) - 1 + +-- Labels +local labels = { ["ILLEGAL"] = true, [1] = { name = "ILLEGAL", i = 1 << (mask_bits + 1) } } + +-- Checks whether a string of bits matches a mask string of bits; "_" is accepted as a bit placeholder. +local function matchmask(bits, mask) + assert(#bits == 32 and #mask == 32) + for i = 1, 32 do + local b, m = bits:sub(i, i), mask:sub(i, i) + if b ~= "_" and m ~= "_" and b ~= m then return false end + end + return true +end + +-- Generate the jump table +local jumptable = {} +for i = 0, ((1 << mask_bits) - 1) do + local mask = "________________" + .. tobase2((i >> rmask_bits) & lmask, lmask_bits) + .. "_____" + .. tobase2(i & rmask, rmask_bits) + local matches = {} + local firstindex + local rd0_special + for j, insn in ipairs(insns) do + if matchmask(insn.bits, mask) and not matches[insn.name] then + if #matches == 0 then + rd0_special = insn.rd0_special + elseif rd0_special ~= insn.rd0_special then + rd0_special = nil + end + matches[insn.name] = true + table.insert(matches, insn.name) + firstindex = math.min(firstindex or j, j) + end + end + local namekey = table.concat(matches, "|") + local name = group_names[namekey] or namekey:gsub("%.", "_"):gsub("|", "_") + if #name == 0 then name = "ILLEGAL" end + if not labels[name] then + labels[name] = true + if rd0_special then + table.insert(labels, { name = name .. "_rd0", i = firstindex * 10 + 1 }) + table.insert(labels, { name = name ..
"_rdN", i = firstindex * 10 + 2 }) + else + table.insert(labels, { name = name, i = firstindex * 10 }) + end + end + assert(#name < 18, namekey) + for rd = 0, 31 do + local ename = name + if rd0_special then + if rd == 0 then + ename = ename .. "_rd0" + else + ename = ename .. "_rdN" + end + end + local emask = mask:sub(1, 20) .. tobase2(rd, 5) .. mask:sub(26, 32) + local idx = frombase2(emask:match("[0-1]+")) + if ename == "ILLEGAL" then -- check for compressed instruction + ename = c_insn_by_idx[idx] or ename + end + jumptable[idx + 1] = ename + end +end +-- Make sure the jump table has exactly 64KB +assert(#jumptable == 65536) + +-- Sort labels by its definition order +table.sort(labels, function(a, b) return a.i < b.i end) + +-- Add compressed instructions to the labels +for _, c_insn in ipairs(c_insns) do + if not labels[c_insn.name] then + labels[c_insn.name] = true + table.insert(labels, #labels, { name = c_insn.name }) + end +end + +-- Make sure labels can fit a byte +assert(#labels <= 256) + +-- Emit the jump table header +io.write([[ +// Copyright Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: LGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) any +// later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +// PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License along +// with this program (see COPYING). If not, see . +// + +// THIS FILE WAS GENERATED BY "lua tools/gen-interpret-jump-table.lua", +// DO NOT EDIT IT DIRECTLY, EDIT THE GENERATOR SCRIPT INSTEAD. 
+ +#ifndef INTERPRET_JUMP_TABLE_H +#define INTERPRET_JUMP_TABLE_H + +// NOLINTBEGIN(cppcoreguidelines-macro-usage) +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic push + +#if !defined(NO_COMPUTED_GOTO) && defined(__GNUC__) && !defined(__wasm__) +#define USE_COMPUTED_GOTO +#endif + +#ifdef USE_COMPUTED_GOTO + +#define INSN_LABEL(x) &&x +#define INSN_CASE(x) x +#define INSN_BREAK() goto NEXT_INSN +#define INSN_SWITCH(x) goto *insn_jumptable[x]; +#define INSN_SWITCH_OUT() \ + NEXT_INSN: +#define INSN_JUMPTABLE_TYPE void * + +#else + +#define INSN_LABEL(x) insn_label_id::x +#define INSN_CASE(x) case insn_label_id::x +#define INSN_BREAK() break +#define INSN_SWITCH(x) switch (insn_jumptable[x]) +#define INSN_SWITCH_OUT() +#define INSN_JUMPTABLE_TYPE insn_label_id + +]]) + +-- Emit labels +io.write("enum class insn_label_id : unsigned char {\n") +for _, label in ipairs(labels) do + io.write(" " .. label.name .. ",\n") +end +io.write([[}; + +#endif // USE_COMPUTED_GOTO + +]]) + +-- Emit the jump table +io.write("static const INSN_JUMPTABLE_TYPE insn_jumptable[", #jumptable, "] = {\n") +io.write("#ifndef CLANG_TIDY_LINT // Disable clang-tidy via an ifdef because it's too slow\n") +for i, name in ipairs(jumptable) do + io.write(string.format("%-40s", " INSN_LABEL(" .. name .. "),"), " // " .. string.format("%4d", (i - 1)) .. 
"\n") +end +io.write("#else\n") +io.write(" INSN_LABEL(ILLEGAL)\n") +io.write("#endif\n") +io.write("};\n") + +-- Emit the jump table footer +io.write([[ + +#pragma GCC diagnostic pop +// NOLINTEND(cppcoreguidelines-macro-usage) + +#endif // INTERPRET_JUMP_TABLE_H +]]) + +io.flush() diff --git a/uarch/uarch-machine-state-access.h b/uarch/uarch-machine-state-access.h index 01254115a..4e29a7c5a 100644 --- a/uarch/uarch-machine-state-access.h +++ b/uarch/uarch-machine-state-access.h @@ -148,12 +148,12 @@ class uarch_pma_entry final { // Provides access to the state of the big emulator from microcode class uarch_machine_state_access : public i_state_access { - std::array, PMA_MAX> m_pmas; + std::array, PMA_MAX> &m_pmas; //NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members) public: - uarch_machine_state_access() = default; - uarch_machine_state_access(const uarch_machine_state_access &other) = delete; - uarch_machine_state_access(uarch_machine_state_access &&other) = delete; + explicit uarch_machine_state_access(std::array, PMA_MAX>& pmas) : m_pmas(pmas) {} + uarch_machine_state_access(const uarch_machine_state_access &other) = default; + uarch_machine_state_access(uarch_machine_state_access &&other) = default; uarch_machine_state_access &operator=(const uarch_machine_state_access &other) = delete; uarch_machine_state_access &operator=(uarch_machine_state_access &&other) = delete; ~uarch_machine_state_access() = default; @@ -534,7 +534,7 @@ class uarch_machine_state_access : public i_state_access do_poll_external_interrupts(uint64_t mcycle, uint64_t /*mcycle_max*/) { return {mcycle, false}; } - + uint64_t do_read_pma_istart(int i) { return raw_read_memory(shadow_pmas_get_pma_abs_addr(i)); } diff --git a/uarch/uarch-run.cpp b/uarch/uarch-run.cpp index f077771f0..39222d25e 100644 --- a/uarch/uarch-run.cpp +++ b/uarch/uarch-run.cpp @@ -36,18 +36,20 @@ static void set_uarch_halt_flag() { } // Let the state accessor be on static memory storage to speed up uarch 
initialization -static uarch_machine_state_access a; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) +static std::array, PMA_MAX> + pmas; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) namespace cartesi { // Declaration of explicit instantiation in module interpret.cpp when compiled with microarchitecture -extern template interpreter_break_reason interpret(uarch_machine_state_access &a, uint64_t mcycle_end); +extern template interpreter_break_reason interpret(uarch_machine_state_access a, uint64_t mcycle_end); } // namespace cartesi /// \brief Advances one mcycle by executing the "big machine interpreter" compiled to the microarchitecture /// \return This function never returns extern "C" NO_RETURN void interpret_next_mcycle_with_uarch() { + uarch_machine_state_access a(pmas); const uint64_t mcycle_end = a.read_mcycle() + 1; interpret(a, mcycle_end); // Finished executing a whole mcycle: halt the microarchitecture