From 5372c85d7f8dd70025c204b8834517fcf4ce7dff Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Tue, 5 Nov 2024 19:02:33 -0300 Subject: [PATCH] feat: optimize immediate sign extension for some instructions --- src/Makefile | 2 -- src/interpret.cpp | 59 +++++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/src/Makefile b/src/Makefile index 477f52e3..125d7f13 100644 --- a/src/Makefile +++ b/src/Makefile @@ -218,8 +218,6 @@ ifneq (,$(findstring gcc,$(CC))) INTERPRET_CXXFLAGS+=-fno-gcse # The following remove extra jumps in the computed goto dispatch INTERPRET_CXXFLAGS+=-fno-crossjumping -# The following remove extra NOPs before jumping back to the interpret hot loop -INTERPRET_CXXFLAGS+=-fno-align-loops # The interpreter dispatch loop performs better as a big inlined function INTERPRET_CXXFLAGS+=-finline-limit=1024 # The interpreter hot loop is big and puts pressure on register allocation, this improves register use diff --git a/src/interpret.cpp b/src/interpret.cpp index 73207d19..36174528 100644 --- a/src/interpret.cpp +++ b/src/interpret.cpp @@ -608,31 +608,36 @@ static inline uint32_t insn_I_get_uimm(uint32_t insn) { /// \brief Obtains the immediate value from a U-type instruction. /// \param insn Instruction. -static inline int32_t insn_U_get_imm(uint32_t insn) { - return static_cast(insn & 0xfffff000); +static inline int64_t insn_U_get_imm(uint32_t insn) { + return static_cast(static_cast(insn)) & ~INT64_C(0b111111111111); } /// \brief Obtains the immediate value from a B-type instruction. /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. -static FORCE_INLINE int32_t insn_B_get_imm(uint32_t insn) { - return static_cast(static_cast(static_cast(insn) >> 31) << 12 | - ((insn << 1) >> 26) << 5 | ((insn << 20) >> 28) << 1 | ((insn << 24) >> 31) << 11); +static FORCE_INLINE int64_t insn_B_get_imm(uint32_t insn) { + return ((static_cast(static_cast(insn)) >> 51) & ~INT64_C(0b111111111111)) | // imm[12] + ((insn >> 20) & 0b11111100000) | // imm[10:5] + ((insn >> 7) & 0b11110) | // imm[4:1] + ((insn << 4) & 0b100000000000); // imm[11] } /// \brief Obtains the immediate value from a J-type instruction. /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. -static FORCE_INLINE int32_t insn_J_get_imm(uint32_t insn) { - return static_cast(static_cast(static_cast(insn) >> 31) << 20 | - ((insn << 1) >> 22) << 1 | ((insn << 11) >> 31) << 11 | ((insn << 12) >> 24) << 12); +static FORCE_INLINE int64_t insn_J_get_imm(uint32_t insn) { + return ((static_cast(static_cast(insn)) >> 43) & ~INT64_C(0b11111111111111111111)) | // imm[20] + ((insn >> 20) & 0b11111111110) | // imm[10:1] + ((insn >> 9) & 0b100000000000) | // imm[11] + (insn & 0b11111111000000000000); // imm[19:12] } /// \brief Obtains the immediate value from a S-type instruction. /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. -static FORCE_INLINE int32_t insn_S_get_imm(uint32_t insn) { - return (static_cast(insn & 0xfe000000) >> (25 - 5)) | static_cast((insn >> 7) & 0b11111); +static FORCE_INLINE int64_t insn_S_get_imm(uint32_t insn) { + return ((static_cast(static_cast(insn)) >> 20) & ~INT64_C(0b11111)) | // imm[11:5] + ((insn >> 7) & 0b11111); // imm[4:0] } /// \brief Obtains the 5 most significant bits of the funct7 field from an instruction. @@ -748,10 +753,17 @@ static FORCE_INLINE uint32_t insn_get_CI_CB_imm(uint32_t insn) { /// \brief Obtains the immediate (sign-extended) value from a CI/CB-type instruction. /// \param insn Instruction. /// \details This function is forced to be inline because GCC may not always inline it. -static FORCE_INLINE int32_t insn_get_CI_CB_imm_se(uint32_t insn) { - return static_cast((static_cast(static_cast(insn << 19) >> 26) & ~0b11111) | // imm[5] - ((insn >> 2) & 0b11111) // imm[4:0] - ); +static FORCE_INLINE int32_t insn_get_CI_CB_imm_se32(uint32_t insn) { + return ((static_cast(insn << 19) >> 26) & ~0b11111) | // imm[5] + ((insn >> 2) & 0b11111); // imm[4:0] +} + +/// \brief Obtains the immediate (sign-extended) value from a CI/CB-type instruction. +/// \param insn Instruction. +/// \details This function is forced to be inline because GCC may not always inline it. +static FORCE_INLINE int64_t insn_get_CI_CB_imm_se64(uint32_t insn) { + return (((static_cast(insn) << 51) >> 58) & ~0b11111) | // imm[5] + ((insn >> 2) & 0b11111); // imm[4:0] } /// \brief Obtains the immediate value from a C.LW and C.SW instructions. @@ -3112,7 +3124,7 @@ static FORCE_INLINE execute_status execute_SRAIW(STATE_ACCESS a, uint64_t &pc, u template static FORCE_INLINE execute_status execute_S(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); - const int32_t imm = insn_S_get_imm(insn); + const int64_t imm = insn_S_get_imm(insn); const uint64_t val = a.read_x(insn_get_rs2(insn)); const execute_status status = write_virtual_memory(a, pc, mcycle, vaddr + imm, val); if (unlikely(status != execute_status::success)) { @@ -3320,8 +3332,9 @@ template static FORCE_INLINE execute_status execute_JALR(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, insn, "jalr"); const uint64_t val = pc + 4; - const uint64_t new_pc = - static_cast(a.read_x(insn_get_rs1(insn)) + insn_I_get_imm(insn)) & ~static_cast(1); + const int32_t imm = insn_I_get_imm(insn); + const uint64_t rs1_value = a.read_x(insn_get_rs1(insn)); + const uint64_t new_pc = static_cast(rs1_value + imm) & ~static_cast(1); const uint32_t rd = insn_get_rd(insn); if constexpr (rd_kind != rd_kind::x0) { a.write_x(rd, val); @@ -3741,7 +3754,7 @@ static FORCE_INLINE execute_status execute_float_unary_op_rm(STATE_ACCESS a, uin template static FORCE_INLINE execute_status execute_FS(STATE_ACCESS a, uint64_t &pc, uint64_t mcycle, uint32_t insn) { const uint64_t vaddr = a.read_x(insn_get_rs1(insn)); - const int32_t imm = insn_S_get_imm(insn); + const int64_t imm = insn_S_get_imm(insn); // A narrower n-bit transfer out of the floating-point // registers will transfer the lower n bits of the register ignoring the upper FLEN−n bits. T val = static_cast(a.read_f(insn_get_rs2(insn))); @@ -4856,11 +4869,11 @@ static FORCE_INLINE execute_status execute_C_ADDI(STATE_ACCESS a, uint64_t &pc, dump_insn(a, pc, static_cast(insn), "c.addi"); // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - const int32_t imm = insn_get_CI_CB_imm_se(insn); + const int64_t imm = insn_get_CI_CB_imm_se64(insn); // imm cannot be zero (guaranteed by jump table) const uint64_t rd_value = a.read_x(rd); int64_t val = 0; - __builtin_add_overflow(static_cast(rd_value), static_cast(imm), &val); + __builtin_add_overflow(static_cast(rd_value), imm, &val); a.write_x(rd, static_cast(val)); return advance_to_next_insn<2>(a, pc); } @@ -4872,7 +4885,7 @@ static FORCE_INLINE execute_status execute_C_ADDIW(STATE_ACCESS a, uint64_t &pc, // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); const uint64_t rd_value = a.read_x(rd); - const int32_t imm = insn_get_CI_CB_imm_se(insn); + const int32_t imm = insn_get_CI_CB_imm_se32(insn); int32_t val = 0; __builtin_add_overflow(static_cast(rd_value), imm, &val); a.write_x(rd, static_cast(val)); @@ -4885,7 +4898,7 @@ static FORCE_INLINE execute_status execute_C_LI(STATE_ACCESS a, uint64_t &pc, ui dump_insn(a, pc, static_cast(insn), "c.li"); // rd cannot be zero (guaranteed by jump table) const uint32_t rd = insn_get_rd(insn); - const int32_t imm = insn_get_CI_CB_imm_se(insn); + const int64_t imm = insn_get_CI_CB_imm_se64(insn); a.write_x(rd, static_cast(imm)); return advance_to_next_insn<2>(a, pc); } @@ -4944,7 +4957,7 @@ template static FORCE_INLINE execute_status execute_C_ANDI(STATE_ACCESS a, uint64_t &pc, uint32_t insn) { dump_insn(a, pc, static_cast(insn), "c.andi"); const uint32_t rs1 = insn_get_CL_CS_CA_CB_rs1(insn); - const int32_t imm = insn_get_CI_CB_imm_se(insn); + const int64_t imm = insn_get_CI_CB_imm_se64(insn); const uint64_t rs1_value = a.read_x(rs1); a.write_x(rs1, rs1_value & static_cast(imm)); return advance_to_next_insn<2>(a, pc);