From 8fa2f858540b93707e40bf82bf4fe7561895bad5 Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Mon, 28 Oct 2024 17:05:36 -0300 Subject: [PATCH] feat: optimize GCC interpret flags --- src/Makefile | 16 ++++++++++------ src/interpret.cpp | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index b817543e..fd5dde9d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -218,17 +218,21 @@ ifneq (,$(filter gcc,$(CC))) # saving some host instructions and improving performance. # This flag is usually enabled by default at -O3, # but we don't use -O3 because it enables some other flags that are not worth for the interpreter. -INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ftree-partial-pre -fpeel-loops -# GCC manual says that we should disable gcse when using computed gotos -INTERPRET_CXXFLAGS+=-fno-gcse +INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpeel-loops # The interpreter dispatch loop performs better as a big inlined function INTERPRET_CXXFLAGS+=-finline-limit=1024 -# The following optimization improves register allocation in the interpret hot loop -INTERPRET_CXXFLAGS+=-funroll-loops -INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS) +# The interpreter hot loop is big and puts pressure on register allocation, this improves register use +INTERPRET_CXXFLAGS+=-frename-registers -fweb +# The interpreter instruction dispatch is ordered by hand, we don't want the compiler to shuffle it +INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple +# The following is known to save instructions in the hot loop +INTERPRET_CXXFLAGS+=-fgcse-sm endif endif +# Make testing new optimization options easier +INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS) + # Link time optimizations ifeq ($(lto),yes) OPTFLAGS+=-flto=auto diff --git a/src/interpret.cpp b/src/interpret.cpp index ffdab366..8bdddc62 100644 --- a/src/interpret.cpp +++ b/src/interpret.cpp @@ -2011,7 +2011,7 @@ static NO_INLINE execute_status write_csr_satp(STATE_ACCESS 
&a, uint64_t val) { } template -static execute_status write_csr_mstatus(STATE_ACCESS &a, uint64_t val) { +static NO_INLINE execute_status write_csr_mstatus(STATE_ACCESS &a, uint64_t val) { const uint64_t old_mstatus = a.read_mstatus() & MSTATUS_R_MASK; // M-mode software can determine whether a privilege mode is implemented