Skip to content

Commit

Permalink
Introduce preliminary macro operation fusion
Browse files Browse the repository at this point in the history
Through our observations, we have identified certain patterns in instruction
sequences. By converting these specific RISC-V instruction patterns into
faster and equivalent code, we can significantly improve execution efficiency.

In our current analysis, we focus on commonly used benchmarks and have
found the following frequently occurring instruction patterns: auipc + addi,
auipc + add, multiple sw, and multiple lw.

|  Metric  |     commit fba5802       |    macro fuse operation   |Speedup|
|----------+--------------------------+---------------------------+-------|
| CoreMark | 1351.065 (Iterations/Sec)|  1352.843 (Iterations/Sec)|+0.13% |
| dhrystone|       1073 DMIPS         |        1146 DMIPS         | +6.8% |
| nqueens  |       8295 msec          |        7824 msec          | +6.0% |
  • Loading branch information
qwe661234 committed May 29, 2023
1 parent 7bde175 commit 56b14b8
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 3 deletions.
17 changes: 16 additions & 1 deletion src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,14 @@
_(cjalr, 1) \
_(cadd, 0) \
_(cswsp, 0) \
)
) \
/* macro operation fusion: convert specific RISC-V instruction patterns
* into faster and equivalent code
*/ \
_(fuse1, 0) \
_(fuse2, 0) \
_(fuse3, 0) \
_(fuse4, 0)
/* clang-format on */

/* IR list */
Expand Down Expand Up @@ -228,6 +235,11 @@ enum {
INSN_32 = 4,
};

/* Operands of one lw/sw instruction that has been folded into a fused
 * load/store sequence. An array of these hangs off rv_insn_t::fuse and is
 * walked by the fuse3 (sw) and fuse4 (lw) handlers.
 *
 * The fusion pass fills each entry from the matching fields of an rv_insn_t;
 * do not reorder or retype the members without checking how entries are
 * populated.
 */
typedef struct {
    int32_t imm; /* offset added to X[rs1] to form the address */
    uint8_t rd;  /* register written by a fused load */
    uint8_t rs1; /* base address register */
    uint8_t rs2; /* register whose value a fused store writes */
} opcode_fuse_t;

typedef struct rv_insn {
union {
int32_t imm;
Expand All @@ -240,6 +252,9 @@ typedef struct rv_insn {
#if RV32_HAS(EXT_C)
uint8_t shamt;
#endif
/* fuse operation */
int16_t imm2;
opcode_fuse_t *fuse;

/* instruction length */
uint8_t insn_len;
Expand Down
131 changes: 129 additions & 2 deletions src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ extern struct target_ops gdbstub_ops;
#include "decode.h"
#include "riscv.h"
#include "riscv_private.h"
#include "state.h"
#include "utils.h"

/* RISC-V exception code list */
Expand Down Expand Up @@ -310,7 +311,15 @@ static uint32_t last_pc = 0;
/* RV32I Base Instruction Set */

/* Internal */
RVOP(nop, {/* no operation */});
/* Handler for the internal nop IR: it has no architectural effect besides
 * re-zeroing x0, counting one cycle and advancing PC by the instruction
 * length, after which execution is chained to the next IR of the block.
 *
 * The final IR of a translated block carries the `tailcall` flag and has no
 * valid successor, so the chain must stop there instead of dispatching
 * whatever lies behind the block.
 */
static bool do_nop(riscv_t *rv, const rv_insn_t *ir)
{
    rv->X[rv_reg_zero] = 0;
    rv->csr_cycle++;
    rv->PC += ir->insn_len;

    /* end of block: hand control back to the caller.
     * NOTE(review): `true` is assumed to be the value the other handlers
     * return at the end of a block - confirm against the RVOP macro.
     */
    if (ir->tailcall)
        return true;

    const rv_insn_t *next = ir + 1;
    MUST_TAIL return next->impl(rv, next);
}


/* LUI is used to build 32-bit constants and uses the U-type format. LUI
* places the U-immediate value in the top 20 bits of the destination
Expand Down Expand Up @@ -1219,6 +1228,46 @@ RVOP(cswsp, {
})
#endif

/* auipc + addi: a single register write of PC + imm + imm2, where imm is the
 * auipc immediate and imm2 holds the immediate taken from the folded addi.
 */
RVOP(fuse1, {
    const int32_t fused_val = (int32_t) (rv->PC + ir->imm + ir->imm2);
    rv->X[ir->rd] = fused_val;
})

/* auipc + add: writes X[rs1] + (PC + imm) to rd, where imm is the auipc
 * immediate and rs1/rd were taken from the folded add.
 *
 * The sum is formed in unsigned 32-bit arithmetic so that it wraps modulo
 * 2^32; adding the operands as int32_t would be undefined behaviour on
 * overflow.
 */
RVOP(fuse2, {
    rv->X[ir->rd] = (uint32_t) rv->X[ir->rs1] + (uint32_t) (rv->PC + ir->imm);
})

/* multiple sw: replays every store recorded in ir->fuse (ir->imm2 entries),
 * in order, each with its own base register, offset and source register.
 */
RVOP(fuse3, {
    const opcode_fuse_t *entry = ir->fuse;
    uint32_t addr = rv->X[entry->rs1] + entry->imm;
    /* The fused stores target contiguous addresses, so validating the first
     * address (misalignment / missing memory chunk) is treated as sufficient
     * for the whole run.
     */
    RV_EXC_MISALIGN_HANDLER(3, store, false, 1);
    rv->io.mem_write_w(rv, addr, rv->X[entry->rs2]);
    /* remaining stores of the run */
    for (int idx = 1; idx < ir->imm2; idx++) {
        entry = &ir->fuse[idx];
        addr = rv->X[entry->rs1] + entry->imm;
        rv->io.mem_write_w(rv, addr, rv->X[entry->rs2]);
    }
})

/* multiple lw: replays every load recorded in ir->fuse (ir->imm2 entries),
 * in order, each with its own base register, offset and destination register.
 */
RVOP(fuse4, {
    const opcode_fuse_t *entry = ir->fuse;
    uint32_t addr = rv->X[entry->rs1] + entry->imm;
    /* The fused loads target contiguous addresses, so validating the first
     * address (misalignment / missing memory chunk) is treated as sufficient
     * for the whole run.
     * NOTE(review): this relies on no load in the run overwriting the shared
     * base register - confirm the fusion pass guarantees that.
     */
    RV_EXC_MISALIGN_HANDLER(3, load, false, 1);
    rv->X[entry->rd] = rv->io.mem_read_w(rv, addr);
    /* remaining loads of the run; the address is recomputed from the current
     * register file on every step
     */
    for (int idx = 1; idx < ir->imm2; idx++) {
        entry = &ir->fuse[idx];
        addr = rv->X[entry->rs1] + entry->imm;
        rv->X[entry->rd] = rv->io.mem_read_w(rv, addr);
    }
})

static const void *dispatch_table[] = {
#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
RISCV_INSN_LIST
Expand Down Expand Up @@ -1337,7 +1386,6 @@ static void block_translate(riscv_t *rv, block_t *block)
/* compute the end of pc */
block->pc_end += ir->insn_len;
block->n_insn++;

/* stop on branch */
if (insn_is_branch(ir->opcode)) {
/* recursive jump translation */
Expand All @@ -1356,6 +1404,82 @@ static void block_translate(riscv_t *rv, block_t *block)
block->ir[block->n_insn - 1].tailcall = true;
}

/* Fuse a run of lw (RW = 1) or sw (RW = 0) instructions that starts at `ir`.
 *
 * Must be expanded directly inside a `case` of the switch in match_pattern():
 * it uses that function's locals (block, i, ir, next_ir, count, sign) and
 * leaves the switch with `break`.
 *
 * A following instruction joins the run when it has the same opcode, the same
 * base register and an offset exactly 4 * j bytes above or below the first
 * one (direction fixed by the first pair), i.e. the run really is contiguous.
 * For loads the run also stops once a member overwrites the base register,
 * because the fused handler validates only the first address. The last
 * instruction of the block is never absorbed.
 *
 * On success `ir` becomes fuse4 (lw) / fuse3 (sw), ir->fuse owns a
 * heap-allocated array of `count` operand records, ir->imm2 holds `count`,
 * and the absorbed instructions are turned into nops. If the allocation
 * fails the instructions are left untouched.
 */
#define COMBINE_MEM_OPS(RW)                                                  \
    count = 1;                                                               \
    next_ir = ir + 1;                                                        \
    if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw))                  \
        break;                                                               \
    /* -1: offsets ascend, +1: offsets descend (or are equal) */             \
    sign = (ir->imm - next_ir->imm) < 0 ? -1 : 1;                            \
    for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) {                   \
        next_ir = ir + j;                                                    \
        if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) ||            \
            ir->rs1 != next_ir->rs1 ||                                       \
            ir->imm - next_ir->imm != 4 * sign * (int32_t) j ||              \
            ((RW) && (next_ir - 1)->rd == ir->rs1))                          \
            break;                                                           \
        count++;                                                             \
    }                                                                        \
    if (count > 1) {                                                         \
        opcode_fuse_t *fused_ops = malloc(count * sizeof(*fused_ops));       \
        if (fused_ops) {                                                     \
            /* record operands field by field; no layout assumptions */      \
            for (int32_t k = 0; k < count; k++) {                            \
                next_ir = ir + k;                                            \
                fused_ops[k].imm = next_ir->imm;                             \
                fused_ops[k].rd = next_ir->rd;                               \
                fused_ops[k].rs1 = next_ir->rs1;                             \
                fused_ops[k].rs2 = next_ir->rs2;                             \
                if (k > 0) {                                                 \
                    next_ir->opcode = rv_insn_nop;                           \
                    next_ir->impl = dispatch_table[next_ir->opcode];         \
                }                                                            \
            }                                                                \
            ir->opcode = IIF(RW)(rv_insn_fuse4, rv_insn_fuse3);              \
            ir->fuse = fused_ops;                                            \
            ir->imm2 = count;                                                \
            ir->impl = dispatch_table[ir->opcode];                           \
        }                                                                    \
    }                                                                        \
    break;


/* Examine whether instructions in a block match a specific pattern. If so,
 * rewrite them into fused instructions in place: the first instruction of the
 * pattern becomes the fused operation and the absorbed ones become nops.
 *
 * Patterns handled:
 *   - auipc + addi  -> fuse1
 *   - auipc + add   -> fuse2
 *   - runs of sw    -> fuse3 (via COMBINE_MEM_OPS)
 *   - runs of lw    -> fuse4 (via COMBINE_MEM_OPS)
 *
 * The auipc patterns are fused only when the second instruction writes the
 * same register as the auipc. The fused handlers perform a single register
 * write, so the intermediate auipc result must be dead after the pair;
 * otherwise its value would silently be lost.
 *
 * We plan to devise strategies to increase the number of instructions that
 * match the pattern, such as reordering the instructions.
 *
 * @block: translated block whose IR array is rewritten
 */
static void match_pattern(block_t *block)
{
    /* `i + 1 < n_insn` rather than `i < n_insn - 1`: n_insn is unsigned and
     * the subtraction would wrap around for an empty block.
     */
    for (uint32_t i = 0; i + 1 < block->n_insn; i++) {
        rv_insn_t *ir = block->ir + i, *next_ir = NULL;
        int32_t count = 0, sign = 1;
        switch (ir->opcode) {
        case rv_insn_auipc:
            next_ir = ir + 1;
            /* Never turn the final instruction of the block into a nop: a
             * nop handler that chains to ir + 1 would run past the end of
             * the block.
             */
            if (i + 2 >= block->n_insn)
                break;
            if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1 &&
                ir->rd == next_ir->rd) {
                /* the destination register of instruction auipc is both the
                 * source register 1 and the destination of the next
                 * instruction addi */
                ir->opcode = rv_insn_fuse1;
                ir->rd = next_ir->rd;
                ir->imm2 = next_ir->imm;
                ir->impl = dispatch_table[ir->opcode];
                next_ir->opcode = rv_insn_nop;
                next_ir->impl = dispatch_table[next_ir->opcode];
            } else if (next_ir->opcode == rv_insn_add &&
                       ir->rd == next_ir->rs2 && ir->rd == next_ir->rd &&
                       ir->rd != next_ir->rs1) {
                /* the destination register of instruction auipc is both the
                 * source register 2 and the destination of the next
                 * instruction add; source register 1 must be a different
                 * register because the fused handler reads it without the
                 * auipc result having been written */
                ir->opcode = rv_insn_fuse2;
                ir->rd = next_ir->rd;
                ir->rs1 = next_ir->rs1;
                ir->impl = dispatch_table[ir->opcode];
                next_ir->opcode = rv_insn_nop;
                next_ir->impl = dispatch_table[next_ir->opcode];
            }
            break;
        /* If the memory addresses of a sequence of store or load instructions
         * are contiguous, combine these instructions.
         */
        case rv_insn_sw:
            COMBINE_MEM_OPS(0);
        case rv_insn_lw:
            COMBINE_MEM_OPS(1);
        /* FIXME: lui + addi*/
        default:
            break;
        }
    }
}

static block_t *prev = NULL;
static block_t *block_find_or_translate(riscv_t *rv)
{
Expand All @@ -1375,6 +1499,9 @@ static block_t *block_find_or_translate(riscv_t *rv)
/* translate the basic block */
block_translate(rv, next);

/* macro operation fusion */
match_pattern(next);

/* insert the block into block map */
block_insert(&rv->block_map, next);

Expand Down
2 changes: 2 additions & 0 deletions src/riscv.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ void block_map_clear(block_map_t *map)
block_t *block = map->map[i];
if (!block)
continue;
for (uint32_t i = 0; i < block->n_insn; i++)
free(block->ir[i].fuse);
free(block->ir);
free(block);
map->map[i] = NULL;
Expand Down

0 comments on commit 56b14b8

Please sign in to comment.