From 04a4be3f9d202f7d2d0b597a91bb2ca98983aa69 Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Mon, 28 Oct 2024 09:46:24 -0300 Subject: [PATCH] feat: optimize branching of uncompressed arithmetic instructions --- src/Makefile | 3 + src/interpret.cpp | 200 ++++++++++++++++++++++++-------------------- src/machine-state.h | 3 +- 3 files changed, 113 insertions(+), 93 deletions(-) diff --git a/src/Makefile b/src/Makefile index d22e9f2c4..b817543e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -223,6 +223,9 @@ INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ft INTERPRET_CXXFLAGS+=-fno-gcse # The interpreter dispatch loop performs better as a big inlined function INTERPRET_CXXFLAGS+=-finline-limit=1024 +# The following optimization improves register allocation in the interpret hot loop +INTERPRET_CXXFLAGS+=-funroll-loops +INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS) endif endif diff --git a/src/interpret.cpp b/src/interpret.cpp index fda7603fb..ffdab366c 100644 --- a/src/interpret.cpp +++ b/src/interpret.cpp @@ -3352,26 +3352,28 @@ static FORCE_INLINE execute_status execute_SFENCE_VMA(STATE_ACCESS &a, uint64_t template static FORCE_INLINE execute_status execute_SRLI_SRAI(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7_sr1(insn))) { - case insn_SRLI_SRAI_funct7_sr1::SRLI: - return execute_SRLI(a, pc, insn); - case insn_SRLI_SRAI_funct7_sr1::SRAI: - return execute_SRAI(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7_sr1 = static_cast(insn_get_funct7_sr1(insn)); + if (funct7_sr1 == insn_SRLI_SRAI_funct7_sr1::SRLI) { + return execute_SRLI(a, pc, insn); + } + if (funct7_sr1 == insn_SRLI_SRAI_funct7_sr1::SRAI) { + return execute_SRAI(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_SRLIW_SRAIW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRLIW_SRAIW_funct7::SRLIW: - return execute_SRLIW(a, pc, insn); - case insn_SRLIW_SRAIW_funct7::SRAIW: - return execute_SRAIW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRLIW_SRAIW_funct7::SRLIW) { + return execute_SRLIW(a, pc, insn); } + if (funct7 == insn_SRLIW_SRAIW_funct7::SRAIW) { + return execute_SRAIW(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } template @@ -3436,130 +3438,144 @@ static FORCE_INLINE execute_status execute_AMO_D(STATE_ACCESS &a, uint64_t &pc, template static FORCE_INLINE execute_status execute_ADD_MUL_SUB(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_ADD_MUL_SUB_funct7::ADD: - return execute_ADD(a, pc, insn); - case insn_ADD_MUL_SUB_funct7::MUL: - return execute_MUL(a, pc, insn); - case insn_ADD_MUL_SUB_funct7::SUB: - return execute_SUB(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_ADD_MUL_SUB_funct7::ADD) { + return execute_ADD(a, pc, insn); + } + if (funct7 == insn_ADD_MUL_SUB_funct7::MUL) { + return execute_MUL(a, pc, insn); + } + if (funct7 == insn_ADD_MUL_SUB_funct7::SUB) { + return execute_SUB(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_SLL_MULH(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLL_MULH_funct7::SLL: - return execute_SLL(a, pc, insn); - case insn_SLL_MULH_funct7::MULH: - return execute_MULH(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLL_MULH_funct7::SLL) { + return execute_SLL(a, pc, insn); + } + if (funct7 == insn_SLL_MULH_funct7::MULH) { + return execute_MULH(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_SLT_MULHSU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLT_MULHSU_funct7::SLT: - return execute_SLT(a, pc, insn); - case insn_SLT_MULHSU_funct7::MULHSU: - return execute_MULHSU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLT_MULHSU_funct7::SLT) { + return execute_SLT(a, pc, insn); } + if (funct7 == insn_SLT_MULHSU_funct7::MULHSU) { + return execute_MULHSU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_SLTU_MULHU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SLTU_MULHU_funct7::SLTU: - return execute_SLTU(a, pc, insn); - case insn_SLTU_MULHU_funct7::MULHU: - return execute_MULHU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SLTU_MULHU_funct7::SLTU) { + return execute_SLTU(a, pc, insn); } + if (funct7 == insn_SLTU_MULHU_funct7::MULHU) { + return execute_MULHU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_XOR_DIV(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_XOR_DIV_funct7::XOR: - return execute_XOR(a, pc, insn); - case insn_XOR_DIV_funct7::DIV: - return execute_DIV(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_XOR_DIV_funct7::XOR) { + return execute_XOR(a, pc, insn); + } + if (funct7 == insn_XOR_DIV_funct7::DIV) { + return execute_DIV(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_SRL_DIVU_SRA(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRL_DIVU_SRA_funct7::SRL: - return execute_SRL(a, pc, insn); - case insn_SRL_DIVU_SRA_funct7::DIVU: - return execute_DIVU(a, pc, insn); - case insn_SRL_DIVU_SRA_funct7::SRA: - return execute_SRA(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRL_DIVU_SRA_funct7::SRL) { + return execute_SRL(a, pc, insn); + } + if (funct7 == insn_SRL_DIVU_SRA_funct7::SRA) { + return execute_SRA(a, pc, insn); } + if (funct7 == insn_SRL_DIVU_SRA_funct7::DIVU) { + return execute_DIVU(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_OR_REM(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_OR_REM_funct7::OR: - return execute_OR(a, pc, insn); - case insn_OR_REM_funct7::REM: - return execute_REM(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_OR_REM_funct7::OR) { + return execute_OR(a, pc, insn); } + if (funct7 == insn_OR_REM_funct7::REM) { + return execute_REM(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_AND_REMU(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_AND_REMU_funct7::AND: - return execute_AND(a, pc, insn); - case insn_AND_REMU_funct7::REMU: - return execute_REMU(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_AND_REMU_funct7::AND) { + return execute_AND(a, pc, insn); + } + if (funct7 == insn_AND_REMU_funct7::REMU) { + return execute_REMU(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_ADDW_MULW_SUBW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_ADDW_MULW_SUBW_funct7::ADDW: - return execute_ADDW(a, pc, insn); - case insn_ADDW_MULW_SUBW_funct7::MULW: - return execute_MULW(a, pc, insn); - case insn_ADDW_MULW_SUBW_funct7::SUBW: - return execute_SUBW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_ADDW_MULW_SUBW_funct7::ADDW) { + return execute_ADDW(a, pc, insn); + } + if (funct7 == insn_ADDW_MULW_SUBW_funct7::MULW) { + return execute_MULW(a, pc, insn); } + if (funct7 == insn_ADDW_MULW_SUBW_funct7::SUBW) { + return execute_SUBW(a, pc, insn); + } + return raise_illegal_insn_exception(a, pc, insn); } template static FORCE_INLINE execute_status execute_SRLW_DIVUW_SRAW(STATE_ACCESS &a, uint64_t &pc, uint32_t insn) { - switch (static_cast(insn_get_funct7(insn))) { - case insn_SRLW_DIVUW_SRAW_funct7::SRLW: - return execute_SRLW(a, pc, insn); - case insn_SRLW_DIVUW_SRAW_funct7::DIVUW: - return execute_DIVUW(a, pc, insn); - case insn_SRLW_DIVUW_SRAW_funct7::SRAW: - return execute_SRAW(a, pc, insn); - default: - return raise_illegal_insn_exception(a, pc, insn); + // Use ifs instead of a switch to produce fewer branches for the most frequent instructions + const auto funct7 = static_cast(insn_get_funct7(insn)); + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::SRLW) { + return execute_SRLW(a, pc, insn); + } + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::DIVUW) { + return execute_DIVUW(a, pc, insn); + } + if (funct7 == insn_SRLW_DIVUW_SRAW_funct7::SRAW) { + return execute_SRAW(a, pc, insn); } + return raise_illegal_insn_exception(a, pc, insn); } template diff --git a/src/machine-state.h b/src/machine-state.h index 915b9c8b2..f587c348e 100644 --- a/src/machine-state.h +++ b/src/machine-state.h @@ -59,9 +59,10 @@ struct machine_state { // The following state fields are very hot, // and are carefully ordered to have better data locality in the interpreter loop. + // The X registers are the very first to optimize access of registers in the interpreter. + std::array x{}; ///< Register file uint64_t mcycle{}; ///< CSR mcycle. uint64_t pc{}; ///< Program counter. - std::array x{}; ///< Register file. uint64_t fcsr{}; ///< CSR fcsr. std::array f{}; ///< Floating-point register file.