Skip to content

Commit

Permalink
feat: optimize few host instructions from trace of SD and LD
Browse files Browse the repository at this point in the history
  • Loading branch information
edubart committed Nov 2, 2024
1 parent 37845ab commit ee4b0e4
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
10 changes: 7 additions & 3 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,18 @@ SHA3_CFLAGS=-O3
# Optimization flags for the interpreter
ifneq (,$(filter yes,$(relwithdebinfo) $(release)))
ifneq (,$(findstring gcc,$(CC)))
# The following is known to improve the interpret loop efficiency
INTERPRET_CXXFLAGS+=-fgcse-after-reload -fgcse-sm
# The following improves computed goto dispatch as stated in GCC manual
INTERPRET_CXXFLAGS+=-fno-gcse
# The following removes extra jumps in the computed goto dispatch
INTERPRET_CXXFLAGS+=-fno-crossjumping
# The following removes extra NOPs before jumping back to the interpret hot loop
INTERPRET_CXXFLAGS+=-fno-align-loops
# The interpreter dispatch loop performs better as a big inlined function
INTERPRET_CXXFLAGS+=-finline-limit=1024
# The interpreter hot loop is big and puts pressure on register allocation, this improves register use
INTERPRET_CXXFLAGS+=-frename-registers -fweb
# The interpreter instruction dispatch is big, the following reduces its size minimizing CPU cache pressure
INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple -fno-align-loops
INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple
# Some distributions enable stack protector by default, make sure it's disabled
INTERPRET_CXXFLAGS+=-fno-stack-protector
endif
Expand Down
4 changes: 3 additions & 1 deletion src/interpret.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,8 +890,10 @@ static FORCE_INLINE bool read_virtual_memory(STATE_ACCESS a, uint64_t &pc, uint6
if (unlikely(!(a.template read_memory_word_via_tlb<TLB_READ>(vaddr, pval)))) {
// Outline the slow path into a function call to minimize host CPU code cache pressure
INC_COUNTER(a.get_statistics(), tlb_rmiss);
T val = 0; // Don't pass pval reference directly so the compiler can store it in a register
auto [status, new_pc] =
read_virtual_memory_slow<T, STATE_ACCESS, RAISE_STORE_EXCEPTIONS>(a, pc, mcycle, vaddr, pval);
read_virtual_memory_slow<T, STATE_ACCESS, RAISE_STORE_EXCEPTIONS>(a, pc, mcycle, vaddr, &val);
*pval = val;
pc = new_pc;
return status;
}
Expand Down

0 comments on commit ee4b0e4

Please sign in to comment.