From 6b462c03dc8c64863ae146a94fe7abffe4da19bb Mon Sep 17 00:00:00 2001 From: Yen-Fu Chen Date: Tue, 18 Jun 2024 12:40:27 +0800 Subject: [PATCH] Introduce background compilation Given the significant runtime compilation overhead associated with performing aggressive optimizations, we have implemented a background compilation mechanism to mitigate this issue. When the runtime profiler identifies a strong hotspot, it adds a T2C compilation request to the wait queue. A background thread, which continuously monitors this queue, triggers T2C to process the requests and notifies the main thread upon completion by updating a flag. This mechanism defers the execution of T2C-generated machine code, leading to more frequent use of T1C-generated code. Despite this, the approach effectively minimizes runtime compilation delays by eliminating the need for the main thread to wait for T2C compilation to complete, thereby improving overall performance. Close: #239 --- .github/workflows/main.yml | 5 ++++- src/emulate.c | 18 ++++++++++------- src/feature.h | 3 +-- src/riscv.c | 41 ++++++++++++++++++++++++++++++++++++++ src/riscv_private.h | 19 +++++++++++++++++- 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3da6de63a..9fbb26761 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -75,7 +75,7 @@ jobs: - name: undefined behavior test run: | make clean && make ENABLE_UBSAN=1 check -j$(nproc) - make ENABLE_JIT=1 clean clean && make ENABLE_JIT=1 ENABLE_UBSAN=1 check -j$(nproc) + make ENABLE_JIT=1 clean && make ENABLE_JIT=1 ENABLE_UBSAN=1 check -j$(nproc) host-arm64: needs: [detect-code-related-file-changes] @@ -134,6 +134,9 @@ jobs: run: | sudo apt-get update -q -y sudo apt-get install -q -y clang clang-tools libsdl2-dev libsdl2-mixer-dev + wget https://apt.llvm.org/llvm.sh + chmod +x ./llvm.sh + sudo ./llvm.sh 17 shell: bash - name: run scan-build without JIT run: make distclean && 
scan-build -v -o ~/scan-build --status-bugs --use-cc=clang --force-analyze-debug-code --show-description -analyzer-config stable-report-filename=true -enable-checker valist,nullability make ENABLE_EXT_F=0 ENABLE_SDL=0 ENABLE_JIT=0 diff --git a/src/emulate.c b/src/emulate.c index 3e6bf1e36..e156aaba2 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -308,6 +308,9 @@ static block_t *block_alloc(riscv_t *rv) block->has_loops = false; block->n_invoke = 0; INIT_LIST_HEAD(&block->list); +#if RV32_HAS(T2C) + block->compiled = false; +#endif #endif return block; } @@ -993,13 +996,14 @@ void rv_step(void *arg) ((exec_t2c_func_t) block->func)(rv); prev = NULL; continue; - } /* check if the execution path is strong hotspot */ - if (block->n_invoke >= THRESHOLD) { - t2c_compile(block, - (uint64_t) ((memory_t *) PRIV(rv)->mem)->mem_base); - ((exec_t2c_func_t) block->func)(rv); - prev = NULL; - continue; + } /* check if invoking times of t1 generated code exceed threshold */ + else if (!block->compiled && block->n_invoke >= THRESHOLD) { + block->compiled = true; + queue_entry_t *entry = malloc(sizeof(queue_entry_t)); + entry->block = block; + pthread_mutex_lock(&rv->wait_queue_lock); + list_add(&entry->list, &rv->wait_queue); + pthread_mutex_unlock(&rv->wait_queue_lock); } #endif /* executed through the tier-1 JIT compiler */ diff --git a/src/feature.h b/src/feature.h index 706c4603a..7535e3ff6 100644 --- a/src/feature.h +++ b/src/feature.h @@ -50,12 +50,11 @@ /* Experimental just-in-time compiler */ #ifndef RV32_FEATURE_JIT #define RV32_FEATURE_JIT 0 -#endif - /* Experimental tier-2 just-in-time compiler */ #ifndef RV32_FEATURE_T2C #define RV32_FEATURE_T2C 0 #endif +#endif /* Feature test macro */ #define RV32_HAS(x) RV32_FEATURE_##x diff --git a/src/riscv.c b/src/riscv.c index 0657f692b..abe0ebd0a 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -28,6 +28,9 @@ #include "riscv_private.h" #include "utils.h" #if RV32_HAS(JIT) +#if RV32_HAS(T2C) +#include <pthread.h> +#endif #include "cache.h" 
#include "jit.h" #define CODE_CACHE_SIZE (4 * 1024 * 1024) @@ -184,6 +187,31 @@ IO_HANDLER_IMPL(byte, write_b, W) #undef R #undef W +#if RV32_HAS(T2C) +static pthread_t t2c_thread; +static void *t2c_runloop(void *arg) +{ + riscv_t *rv = (riscv_t *) arg; + while (1) { + if (!list_empty(&rv->wait_queue)) { + queue_entry_t *entry = + list_last_entry(&rv->wait_queue, queue_entry_t, list); + pthread_mutex_lock(&rv->wait_queue_lock); + list_del_init(&entry->list); + pthread_mutex_unlock(&rv->wait_queue_lock); + t2c_compile(entry->block, + (uint64_t) ((memory_t *) PRIV(rv)->mem)->mem_base); + free(entry); + } + /* Instead of writing while(rv->quit), placing the code here prevents + * rv->quit from being optimized by the compiler.*/ + if (rv->quit) + break; + } + return NULL; +} +#endif + riscv_t *rv_create(riscv_user_t rv_attr) { assert(rv_attr); @@ -269,6 +297,14 @@ riscv_t *rv_create(riscv_user_t rv_attr) rv->jit_state = jit_state_init(CODE_CACHE_SIZE); rv->block_cache = cache_create(BLOCK_MAP_CAPACITY_BITS); assert(rv->block_cache); +#if RV32_HAS(T2C) + rv->quit = false; + /* prepare wait queue. */ + pthread_mutex_init(&rv->wait_queue_lock, NULL); + INIT_LIST_HEAD(&rv->wait_queue); + /* activate the background compilation thread. 
*/ + pthread_create(&t2c_thread, NULL, t2c_runloop, rv); +#endif #endif return rv; @@ -353,6 +389,11 @@ void rv_delete(riscv_t *rv) memory_delete(attr->mem); block_map_destroy(rv); #else +#if RV32_HAS(T2C) + rv->quit = true; + pthread_join(t2c_thread, NULL); + pthread_mutex_destroy(&rv->wait_queue_lock); +#endif mpool_destroy(rv->chain_entry_mp); jit_state_exit(rv->jit_state); cache_free(rv->block_cache); diff --git a/src/riscv_private.h b/src/riscv_private.h index 95efe76da..a0bc660e8 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -14,6 +14,9 @@ #include "riscv.h" #include "utils.h" #if RV32_HAS(JIT) +#if RV32_HAS(T2C) +#include <pthread.h> +#endif #include "cache.h" #endif @@ -70,7 +73,10 @@ typedef struct block { bool translatable; /**< Determine the block has RV32AF insturctions or not */ bool has_loops; /**< Determine the block has loop or not */ - uint32_t offset; /**< The machine code offset in T1 code cache */ +#if RV32_HAS(T2C) + bool compiled; /**< The T2C request is enqueued or not */ +#endif + uint32_t offset; /**< The machine code offset in T1 code cache */ uint32_t n_invoke; /**< The invoking times of T1 machine code */ void *func; /**< The function pointer of T2 machine code */ struct list_head list; @@ -82,6 +88,12 @@ typedef struct { block_t *block; struct list_head list; } chain_entry_t; +#if RV32_HAS(T2C) +typedef struct { + block_t *block; + struct list_head list; +} queue_entry_t; +#endif #endif typedef struct { @@ -134,6 +146,11 @@ struct riscv_internal { #else struct cache *block_cache; struct mpool *chain_entry_mp; +#if RV32_HAS(T2C) + struct list_head wait_queue; + pthread_mutex_t wait_queue_lock; + bool quit; /**< Determine the main thread is terminated or not */ +#endif #endif struct mpool *block_mp, *block_ir_mp;