Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize instruction fetch and decoding #226

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ jobs:
uarch-pristine-ram.c
uarch-pristine-hash.c
machine-c-version.h
interpret-jump-table.h
cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_amd64.deb
cartesi-machine-v${{ env.MACHINE_EMULATOR_VERSION }}_arm64.deb

Expand Down Expand Up @@ -726,6 +727,7 @@ jobs:
if: ${{ startsWith(github.ref, 'refs/tags/v') }}
run: |
mv artifacts/machine-c-version.h src
mv artifacts/interpret-jump-table.h src
mv artifacts/uarch-pristine-ram.c uarch
mv artifacts/uarch-pristine-hash.c uarch
make create-generated-files-patch
Expand Down
9 changes: 6 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ export CXX=g++

endif

GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h
GENERATED_FILES= uarch/uarch-pristine-hash.c uarch/uarch-pristine-ram.c src/machine-c-version.h src/interpret-jump-table.h
ADD_GENERATED_FILES_DIFF= add-generated-files.diff

all: source-default
Expand Down Expand Up @@ -244,12 +244,15 @@ lint-% check-format-% format-% check-format-lua-% check-lua-% format-lua-%:
source-default:
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR)

uarch: $(SRCDIR)/machine-c-version.h
uarch: $(SRCDIR)/machine-c-version.h $(SRCDIR)/interpret-jump-table.h
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C uarch

$(SRCDIR)/machine-c-version.h:
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) machine-c-version.h

$(SRCDIR)/interpret-jump-table.h:
@eval $$($(MAKE) -s --no-print-directory env); $(MAKE) -C $(SRCDIR) interpret-jump-table.h

build-emulator-builder-image:
docker build $(DOCKER_PLATFORM) --build-arg DEBUG=$(debug) --build-arg COVERAGE=$(coverage) --build-arg SANITIZE=$(sanitize) --target builder -t cartesi/machine-emulator:builder -f Dockerfile .

Expand Down Expand Up @@ -282,6 +285,7 @@ copy:
docker create --name uarch-ram-bin $(DOCKER_PLATFORM) $(DEBIAN_IMG)
docker cp uarch-ram-bin:/usr/src/emulator/$(DEB_FILENAME) .
docker cp uarch-ram-bin:/usr/src/emulator/src/machine-c-version.h .
docker cp uarch-ram-bin:/usr/src/emulator/src/interpret-jump-table.h .
docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-ram.bin .
docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-ram.c .
docker cp uarch-ram-bin:/usr/src/emulator/uarch/uarch-pristine-hash.c .
Expand Down Expand Up @@ -399,4 +403,3 @@ $(ADD_GENERATED_FILES_DIFF): $(GENERATED_FILES)

.PHONY: help all submodules doc clean distclean src luacartesi hash uarch \
create-generated-files-patch $(SUBDIRS) $(SUBCLEAN)

1 change: 1 addition & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ compile_flags.txt
coverage*
jsonrpc-discover.cpp
machine-c-version.h
interpret-jump-table.h
44 changes: 28 additions & 16 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -214,18 +214,27 @@ SHA3_CFLAGS=-O3

# Optimization flags for the interpreter
ifneq (,$(filter yes,$(relwithdebinfo) $(release)))
ifneq (,$(filter gcc,$(CC)))
# The following flag helps GCC to eliminate more redundant computations in the interpret loop,
# saving some host instructions and improving performance.
# This flag is usually enabled by default at -O3,
# but we don't use -O3 because it enables some other flags that are not worthwhile for the interpreter.
INTERPRET_CXXFLAGS+=-fgcse-after-reload -fpredictive-commoning -fsplit-paths -ftree-partial-pre
ifneq (,$(findstring gcc,$(CC)))
# The following improves computed goto dispatch, as stated in the GCC manual
INTERPRET_CXXFLAGS+=-fno-gcse
# The following removes extra jumps in the computed goto dispatch
INTERPRET_CXXFLAGS+=-fno-crossjumping
# The following removes extra NOPs before jumping back to the interpret hot loop
INTERPRET_CXXFLAGS+=-fno-align-loops
# The interpreter dispatch loop performs better as a big inlined function
INTERPRET_CXXFLAGS+=-finline-limit=1024
# The interpreter hot loop is big and puts pressure on register allocation, this improves register use
INTERPRET_CXXFLAGS+=-frename-registers -fweb
# The interpreter instruction dispatch is big, the following reduces its size minimizing CPU cache pressure
INTERPRET_CXXFLAGS+=-freorder-blocks-algorithm=simple
# Some distributions enable the stack protector by default; make sure it's disabled
INTERPRET_CXXFLAGS+=-fno-stack-protector
endif
# Disable jump tables, because it degrades the instruction decoding performance in the interpret loop,
# since it generates a memory indirection that has a high cost in opcode switches.
INTERPRET_CXXFLAGS+=-fno-jump-tables
endif

# Make testing new optimization options easier
INTERPRET_CXXFLAGS+=$(MYINTERPRET_CXXFLAGS)

# Link time optimizations
ifeq ($(lto),yes)
OPTFLAGS+=-flto=auto
Expand Down Expand Up @@ -262,7 +271,7 @@ PGO_WORKLOAD=\
whetstone 25000

LINTER_IGNORE_SOURCES=
LINTER_IGNORE_HEADERS=
LINTER_IGNORE_HEADERS=interpret-jump-table.h
LINTER_SOURCES=$(filter-out $(LINTER_IGNORE_SOURCES),$(strip $(wildcard *.cpp) $(wildcard *.c)))
LINTER_HEADERS=$(filter-out $(LINTER_IGNORE_HEADERS),$(strip $(wildcard *.hpp) $(wildcard *.h)))

Expand All @@ -273,7 +282,7 @@ CLANG_FORMAT=clang-format
CLANG_FORMAT_UARCH_FILES:=$(wildcard ../uarch/*.cpp)
CLANG_FORMAT_UARCH_FILES:=$(filter-out %uarch-printf%,$(strip $(CLANG_FORMAT_UARCH_FILES)))
CLANG_FORMAT_FILES:=$(wildcard *.cpp) $(wildcard *.c) $(wildcard *.h) $(wildcard *.hpp) $(CLANG_FORMAT_UARCH_FILES)
CLANG_FORMAT_IGNORE_FILES:=
CLANG_FORMAT_IGNORE_FILES:=interpret-jump-table.h
CLANG_FORMAT_FILES:=$(strip $(CLANG_FORMAT_FILES))
CLANG_FORMAT_FILES:=$(filter-out $(CLANG_FORMAT_IGNORE_FILES),$(strip $(CLANG_FORMAT_FILES)))

Expand Down Expand Up @@ -542,12 +551,12 @@ jsonrpc-discover.cpp: jsonrpc-discover.json
echo '} // namespace cartesi' >> jsonrpc-discover.cpp

%.clang-tidy: %.cpp machine-c-version.h
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) 2>/dev/null
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CXXFLAGS) $(LUA_INC) -DCLANG_TIDY_LINT 2>/dev/null
@$(CXX) $(CXXFLAGS) $(LUA_INC) $< -MM -MT $@ -MF [email protected] > /dev/null 2>&1
@touch $@

%.clang-tidy: %.c
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) 2>/dev/null
@$(CLANG_TIDY) --header-filter='$(CLANG_TIDY_HEADER_FILTER)' $(CLANG_TIDY_FLAGS) $< -- $(CFLAGS) -DCLANG_TIDY_LINT 2>/dev/null
@$(CC) $(CFLAGS) $< -MM -MT $@ -MF [email protected] > /dev/null 2>&1
@touch $@

Expand All @@ -560,7 +569,10 @@ uarch-pristine-ram.o: $(UARCH_PRISTINE_RAM_C)
uarch-pristine-hash.o: $(UARCH_PRISTINE_HASH_C)
$(CC) $(CFLAGS) -c -o $@ $<

interpret.o: interpret.cpp machine-c-version.h
interpret-jump-table.h: ../tools/gen-interpret-jump-table.lua
$< > $@

interpret.o: interpret.cpp interpret-jump-table.h machine-c-version.h
$(CXX) $(CXXFLAGS) $(INTERPRET_CXXFLAGS) -c -o $@ $<

%.o: %.cpp machine-c-version.h
Expand All @@ -571,7 +583,7 @@ interpret.o: interpret.cpp machine-c-version.h

../uarch/uarch-pristine-ram.c ../uarch/uarch-pristine-hash.c: generate-uarch-pristine

generate-uarch-pristine:
generate-uarch-pristine: machine-c-version.h interpret-jump-table.h
ifeq (,$(wildcard ../uarch/uarch-pristine-hash.c))
@if [ "$(DEV_ENV_HAS_TOOLCHAIN)" = "yes" ]; then \
$(MAKE) -C .. uarch; \
Expand All @@ -583,7 +595,7 @@ endif
clean: clean-auto-generated clean-coverage clean-profile clean-tidy clean-libcartesi clean-executables

clean-auto-generated:
@rm -f jsonrpc-discover.cpp machine-c-version.h
@rm -f jsonrpc-discover.cpp machine-c-version.h interpret-jump-table.h

clean-tidy:
@rm -f *.clang-tidy
Expand Down
6 changes: 6 additions & 0 deletions src/compiler-defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,10 @@

#define PACKED __attribute__((packed))

// FORCE_OPTIMIZE_O3 requests that the annotated function be compiled at -O3
// regardless of the optimization level used for the rest of the translation unit.
// The optimize attribute is a GCC extension. Clang also defines __GNUC__ but does
// not implement this attribute (it is reported as unknown and ignored), so Clang is
// excluded here and the macro expands to nothing on every non-GCC compiler.
#if defined(__GNUC__) && !defined(__clang__)
#define FORCE_OPTIMIZE_O3 __attribute__((optimize("-O3")))
#else
#define FORCE_OPTIMIZE_O3
#endif

#endif
4 changes: 2 additions & 2 deletions src/device-state-access.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace cartesi {
template <typename STATE_ACCESS>
class device_state_access : public i_device_state_access {
public:
explicit device_state_access(STATE_ACCESS &a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) {
explicit device_state_access(STATE_ACCESS a, uint64_t mcycle) : m_a(a), m_mcycle(mcycle) {
static_assert(is_an_i_state_access<STATE_ACCESS>::value, "not an i_state_access");
}

Expand All @@ -52,7 +52,7 @@ class device_state_access : public i_device_state_access {
~device_state_access() override = default;

private:
STATE_ACCESS &m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members)
STATE_ACCESS m_a; // NOLINT(cppcoreguidelines-avoid-const-or-ref-data-members)
uint64_t m_mcycle;

void do_set_mip(uint64_t mask) override {
Expand Down
5 changes: 3 additions & 2 deletions src/i-state-access.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <type_traits>
#include <utility>

#include "compiler-defines.h"
#include "meta.h"
#include "shadow-tlb.h"

Expand Down Expand Up @@ -773,15 +774,15 @@ class i_state_access { // CRTP
}

/// \brief Invalidates all TLB entries of all types.
void flush_all_tlb() {
NO_INLINE void flush_all_tlb() {
derived().template flush_tlb_type<TLB_CODE>();
derived().template flush_tlb_type<TLB_READ>();
derived().template flush_tlb_type<TLB_WRITE>();
}

/// \brief Invalidates TLB entries for a specific virtual address.
/// \param vaddr Target virtual address.
void flush_tlb_vaddr(uint64_t vaddr) {
NO_INLINE void flush_tlb_vaddr(uint64_t vaddr) {
return derived().do_flush_tlb_vaddr(vaddr);
}

Expand Down
Loading
Loading