Skip to content

Commit

Permalink
Substitute memset/memcpy calls with faster alternatives
Browse files Browse the repository at this point in the history
1. Because RISC-V uses the jal instruction for function calls, we remove
recursive jump translation so that function calls can be detected.
2. To identify memset/memcpy, we compare a translated basic block against the
pattern of the first basic block of memset/memcpy. If the basic block matches
the pattern, we fetch the instruction sequence starting from the initial PC
of the basic block and compare it with the pre-recorded memset/memcpy
instruction sequence. Upon detecting memset/memcpy, we substitute the basic
block with a call to the standard library function memset/memcpy.

Based on the performance results below, we gain 3% performance when
running CoreMark and lose 1% performance when running Dhrystone. The performance
impact depends on how many times memset/memcpy is invoked.

* Intel Core i7-11700

|   Metric   | origin  | proposed |Speedup|
|------------+---------+----------+-------|
| Dhrystone  | 1413.11 | 1447.11  |  +2%  |
| CoreMark   | 1497.35 | 1525.31  |  +2%  |

See: sysprog21#153
  • Loading branch information
qwe661234 committed Aug 24, 2023
1 parent 98a149b commit e67f3be
Showing 1 changed file with 271 additions and 3 deletions.
274 changes: 271 additions & 3 deletions src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,9 @@ static uint32_t last_pc = 0;
_(fuse2) \
_(fuse3) \
_(fuse4) \
_(fuse5)
_(fuse5) \
_(fuse6) \
_(fuse7)

enum {
rv_insn_fuse0 = N_RV_INSNS,
Expand Down Expand Up @@ -424,6 +426,35 @@ static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir)
MUST_TAIL return next->impl(rv, next);
}

/* memset
 *
 * Fused handler that stands in for a guest memset routine: it performs the
 * fill with the host memset() directly on the emulated memory and then
 * resumes at the address held in x1, as the routine's own 'ret' would.
 *
 * Register usage (indices into rv->X): x10 = destination guest address,
 * x11 = fill value, x12 = byte count, x1 = return address.
 *
 * Returns true when RVOP_NO_NEXT(ir) holds; otherwise returns whatever the
 * handler of the following IR entry returns.
 *
 * NOTE(review): the guest-supplied address and length are used without any
 * range check against the size of the emulated memory -- confirm whether
 * the caller guarantees they are in bounds.
 */
static bool do_fuse6(riscv_t *rv, const rv_insn_t *ir)
{
    rv->X[rv_reg_zero] = 0;

    /* The whole routine is charged a flat two cycles. */
    rv->csr_cycle += 2;

    memory_t *guest_mem = ((state_t *) rv->userdata)->mem;
    char *dest = (char *) guest_mem->mem_base + rv->X[10];
    memset(dest, rv->X[11], rv->X[12]);

    /* Jump back to the caller; bit 0 of the target is cleared. */
    rv->PC = rv->X[1] & ~1U;

    if (unlikely(RVOP_NO_NEXT(ir)))
        return true;
    const rv_insn_t *succ = ir + 1;
    MUST_TAIL return succ->impl(rv, succ);
}

/* memcpy
 *
 * Fused handler that stands in for a guest memcpy routine: it performs the
 * copy on the host directly inside the emulated memory and then resumes at
 * the address held in x1, as the routine's own 'ret' would.
 *
 * Register usage (indices into rv->X): x10 = destination guest address,
 * x11 = source guest address, x12 = byte count, x1 = return address.
 *
 * Returns true when RVOP_NO_NEXT(ir) holds; otherwise returns whatever the
 * handler of the following IR entry returns.
 *
 * NOTE(review): the guest-supplied addresses and length are used without
 * any range check against the size of the emulated memory -- confirm
 * whether the caller guarantees they are in bounds.
 */
static bool do_fuse7(riscv_t *rv, const rv_insn_t *ir)
{
    rv->X[rv_reg_zero] = 0;
    /* The whole routine is charged a flat two cycles. */
    rv->csr_cycle += 2;
    memory_t *m = ((state_t *) rv->userdata)->mem;
    /* Source, destination and length all come from guest registers, so the
     * two ranges may overlap. Host memcpy() has undefined behaviour for
     * overlapping objects; memmove() is well defined for every input.
     */
    memmove((char *) m->mem_base + rv->X[10], (char *) m->mem_base + rv->X[11],
            rv->X[12]);
    /* Jump back to the caller; bit 0 of the target is cleared. */
    rv->PC = rv->X[1] & ~1U;
    if (unlikely(RVOP_NO_NEXT(ir)))
        return true;
    const rv_insn_t *next = ir + 1;
    MUST_TAIL return next->impl(rv, next);
}

/* clang-format off */
static const void *dispatch_table[] = {
/* RV32 instructions */
Expand Down Expand Up @@ -582,18 +613,255 @@ static void block_translate(riscv_t *rv, block_t *block)
ir->tailcall = next_ir->tailcall; \
}

/* Check whether the guest code starting at rv->PC is the pre-recorded
 * memset routine.
 *
 * Instruction words are fetched one by one through rv->io.mem_ifetch, at
 * consecutive 4-byte addresses, and compared against the table below.
 *
 * Returns true only if every word of the table matches; returns false at
 * the first mismatch.
 *
 * NOTE(review): the comparison always starts at rv->PC, so this assumes
 * rv->PC is the address of the first instruction of the candidate routine
 * when the function is called -- confirm against the caller.
 */
static bool parse_memset(riscv_t *rv)
{
    /* Recorded machine code of memset. The branch/jump targets in the
     * comments are the addresses from the disassembly it was taken from.
     */
    static const uint32_t memset_insn[] = {
        0x00f00313, /* li t1,15 */
        0x00050713, /* mv a4,a0 */
        0x02c37e63, /* bgeu t1,a2,0x11828 */
        0x00f77793, /* and a5,a4,15 */
        0x0a079063, /* bnez a5,0x11894 */
        0x08059263, /* bnez a1,0x1187c */
        0xff067693, /* and a3,a2,-16 */
        0x00f67613, /* and a2,a2,15 */
        0x00e686b3, /* add a3,a3,a4 */
        0x00b72023, /* sw a1,0(a4) */
        0x00b72223, /* sw a1,4(a4) */
        0x00b72423, /* sw a1,8(a4) */
        0x00b72623, /* sw a1,12(a4) */
        0x01070713, /* add a4,a4,16 */
        0xfed766e3, /* bltu a4,a3,0x11808 */
        0x00061463, /* bnez a2,0x11828 */
        0x00008067, /* ret */
        0x40c306b3, /* sub a3,t1,a2 */
        0x00269693, /* sll a3,a3,0x2 */
        0x00000297, /* auipc t0,0x0 */
        0x005686b3, /* add a3,a3,t0 */
        0x00c68067, /* jr 12(a3) */
        0x00b70723, /* sb a1,14(a4) */
        0x00b706a3, /* sb a1,13(a4) */
        0x00b70623, /* sb a1,12(a4) */
        0x00b705a3, /* sb a1,11(a4) */
        0x00b70523, /* sb a1,10(a4) */
        0x00b704a3, /* sb a1,9(a4) */
        0x00b70423, /* sb a1,8(a4) */
        0x00b703a3, /* sb a1,7(a4) */
        0x00b70323, /* sb a1,6(a4) */
        0x00b702a3, /* sb a1,5(a4) */
        0x00b70223, /* sb a1,4(a4) */
        0x00b701a3, /* sb a1,3(a4) */
        0x00b70123, /* sb a1,2(a4) */
        0x00b700a3, /* sb a1,1(a4) */
        0x00b70023, /* sb a1,0(a4) */
        0x00008067, /* ret */
        0x0ff5f593, /* zext.b a1,a1 */
        0x00859693, /* sll a3,a1,0x8 */
        0x00d5e5b3, /* or a1,a1,a3 */
        0x01059693, /* sll a3,a1,0x10 */
        0x00d5e5b3, /* or a1,a1,a3 */
        0xf6dff06f, /* j 0x117fc */
        0x00279693, /* sll a3,a5,0x2 */
        0x00000297, /* auipc t0,0x0 */
        0x005686b3, /* add a3,a3,t0 */
        0x00008293, /* mv t0,ra */
        0xfa0680e7, /* jalr -96(a3) */
        0x00028093, /* mv ra,t0 */
        0xff078793, /* add a5,a5,-16 */
        0x40f70733, /* sub a4,a4,a5 */
        0x00f60633, /* add a2,a2,a5 */
        0xf6c378e3, /* bgeu t1,a2,0x11828 */
        0xf3dff06f, /* j 0x117f8 */
    };
    /* Derive the count from the table so the two can never drift apart. */
    const uint32_t n_insn = sizeof(memset_insn) / sizeof(memset_insn[0]);

    uint32_t tmp_pc = rv->PC;
    for (uint32_t i = 0; i < n_insn; i++) {
        const uint32_t insn = rv->io.mem_ifetch(tmp_pc);
        if (insn != memset_insn[i])
            return false;
        tmp_pc += 4; /* every recorded instruction is a 32-bit word */
    }
    return true;
}

/* Check whether the guest code starting at rv->PC is the pre-recorded
 * memcpy routine.
 *
 * Instruction words are fetched one by one through rv->io.mem_ifetch, at
 * consecutive 4-byte addresses, and compared against the table below.
 *
 * Returns true only if every word of the table matches; returns false at
 * the first mismatch.
 *
 * NOTE(review): the comparison always starts at rv->PC, so this assumes
 * rv->PC is the address of the first instruction of the candidate routine
 * when the function is called -- confirm against the caller.
 */
static bool parse_memcpy(riscv_t *rv)
{
    /* Recorded machine code of memcpy. The branch/jump targets in the
     * comments are the addresses from the disassembly it was taken from.
     */
    static const uint32_t memcpy_insn[] = {
        0x00b547b3, /* xor a5,a0,a1 */
        0x0037f793, /* and a5,a5,3 */
        0x00c508b3, /* add a7,a0,a2 */
        0x06079463, /* bnez a5,0x21428 */
        0x00300793, /* li a5,3 */
        0x06c7f063, /* bgeu a5,a2,0x21428 */
        0x00357793, /* and a5,a0,3 */
        0x00050713, /* mv a4,a0 */
        0x06079a63, /* bnez a5,0x21448 */
        0xffc8f613, /* and a2,a7,-4 */
        0x40e606b3, /* sub a3,a2,a4 */
        0x02000793, /* li a5,32 */
        0x08d7ce63, /* blt a5,a3,0x21480 */
        0x00058693, /* mv a3,a1 */
        0x00070793, /* mv a5,a4 */
        0x02c77863, /* bgeu a4,a2,0x21420 */
        0x0006a803, /* lw a6,0(a3) */
        0x00478793, /* add a5,a5,4 */
        0x00468693, /* add a3,a3,4 */
        0xff07ae23, /* sw a6,-4(a5) */
        0xfec7e8e3, /* bltu a5,a2,0x213f4 */
        0xfff60793, /* add a5,a2,-1 */
        0x40e787b3, /* sub a5,a5,a4 */
        0xffc7f793, /* and a5,a5,-4 */
        0x00478793, /* add a5,a5,4 */
        0x00f70733, /* add a4,a4,a5 */
        0x00f585b3, /* add a1,a1,a5 */
        0x01176863, /* bltu a4,a7,0x21430 */
        0x00008067, /* ret */
        0x00050713, /* mv a4,a0 */
        0x05157863, /* bgeu a0,a7,0x2147c */
        0x0005c783, /* lbu a5,0(a1) */
        0x00170713, /* add a4,a4,1 */
        0x00158593, /* add a1,a1,1 */
        0xfef70fa3, /* sb a5,-1(a4) */
        0xfee898e3, /* bne a7,a4,0x21430 */
        0x00008067, /* ret */
        0x0005c683, /* lbu a3,0(a1) */
        0x00170713, /* add a4,a4,1 */
        0x00377793, /* and a5,a4,3 */
        0xfed70fa3, /* sb a3,-1(a4) */
        0x00158593, /* add a1,a1,1 */
        0xf6078ee3, /* beqz a5,0x213d8 */
        0x0005c683, /* lbu a3,0(a1) */
        0x00170713, /* add a4,a4,1 */
        0x00377793, /* and a5,a4,3 */
        0xfed70fa3, /* sb a3,-1(a4) */
        0x00158593, /* add a1,a1,1 */
        0xfc079ae3, /* bnez a5,0x21448 */
        0xf61ff06f, /* j 0x213d8 */
        0x00008067, /* ret */
        0xff010113, /* add sp,sp,-16 */
        0x00812623, /* sw s0,12(sp) */
        0x02000413, /* li s0,32 */
        0x0005a383, /* lw t2,0(a1) */
        0x0045a283, /* lw t0,4(a1) */
        0x0085af83, /* lw t6,8(a1) */
        0x00c5af03, /* lw t5,12(a1) */
        0x0105ae83, /* lw t4,16(a1) */
        0x0145ae03, /* lw t3,20(a1) */
        0x0185a303, /* lw t1,24(a1) */
        0x01c5a803, /* lw a6,28(a1) */
        0x0205a683, /* lw a3,32(a1) */
        0x02470713, /* add a4,a4,36 */
        0x40e607b3, /* sub a5,a2,a4 */
        0xfc772e23, /* sw t2,-36(a4) */
        0xfe572023, /* sw t0,-32(a4) */
        0xfff72223, /* sw t6,-28(a4) */
        0xffe72423, /* sw t5,-24(a4) */
        0xffd72623, /* sw t4,-20(a4) */
        0xffc72823, /* sw t3,-16(a4) */
        0xfe672a23, /* sw t1,-12(a4) */
        0xff072c23, /* sw a6,-8(a4) */
        0xfed72e23, /* sw a3,-4(a4) */
        0x02458593, /* add a1,a1,36 */
        0xfaf446e3, /* blt s0,a5,0x2148c */
        0x00058693, /* mv a3,a1 */
        0x00070793, /* mv a5,a4 */
        0x02c77863, /* bgeu a4,a2,0x2151c */
        0x0006a803, /* lw a6,0(a3) */
        0x00478793, /* add a5,a5,4 */
        0x00468693, /* add a3,a3,4 */
        0xff07ae23, /* sw a6,-4(a5) */
        0xfec7e8e3, /* bltu a5,a2,0x214f0 */
        0xfff60793, /* add a5,a2,-1 */
        0x40e787b3, /* sub a5,a5,a4 */
        0xffc7f793, /* and a5,a5,-4 */
        0x00478793, /* add a5,a5,4 */
        0x00f70733, /* add a4,a4,a5 */
        0x00f585b3, /* add a1,a1,a5 */
        0x01176863, /* bltu a4,a7,0x2152c */
        0x00c12403, /* lw s0,12(sp) */
        0x01010113, /* add sp,sp,16 */
        0x00008067, /* ret */
        0x0005c783, /* lbu a5,0(a1) */
        0x00170713, /* add a4,a4,1 */
        0x00158593, /* add a1,a1,1 */
        0xfef70fa3, /* sb a5,-1(a4) */
        0xfee882e3, /* beq a7,a4,0x21520 */
        0x0005c783, /* lbu a5,0(a1) */
        0x00170713, /* add a4,a4,1 */
        0x00158593, /* add a1,a1,1 */
        0xfef70fa3, /* sb a5,-1(a4) */
        0xfce89ee3, /* bne a7,a4,0x2152c */
        0xfcdff06f, /* j 0x21520 */
    };
    /* Derive the count from the table so the two can never drift apart. */
    const uint32_t n_insn = sizeof(memcpy_insn) / sizeof(memcpy_insn[0]);

    uint32_t tmp_pc = rv->PC;
    for (uint32_t i = 0; i < n_insn; i++) {
        const uint32_t insn = rv->io.mem_ifetch(tmp_pc);
        if (insn != memcpy_insn[i])
            return false;
        tmp_pc += 4; /* every recorded instruction is a 32-bit word */
    }
    return true;
}

/* Check if instructions in a block match a specific pattern. If they do,
* rewrite them as fused instructions.
*
* Strategies are being devised to increase the number of instructions that
* match the pattern, including possible instruction reordering.
*/
static void match_pattern(block_t *block)
static void match_pattern(riscv_t *rv, block_t *block)
{
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
int32_t count = 0, sign = 1;
switch (ir->opcode) {
case rv_insn_addi:
/* Compare the target block with the first basic block of memset. If
* the two blocks match, fetch the instruction sequence starting from
* the pc_start of the basic block and compare it with the
* pre-recorded memset instruction sequence.
*/
if (ir->imm == 15 && ir->rd == 6 && ir->rs1 == 0) {
next_ir = ir + 1;
if (next_ir->opcode == rv_insn_addi && next_ir->rd == 14 &&
next_ir->rs1 == 10 && next_ir->rs2 == 0) {
next_ir = next_ir + 1;
if (next_ir->opcode == rv_insn_bgeu && next_ir->imm == 60 &&
next_ir->rs1 == 6 && next_ir->rs2 == 12) {
if (parse_memset(rv)) {
ir->opcode = rv_insn_fuse6;
ir->impl = dispatch_table[ir->opcode];
ir->tailcall = true;
};
}
}
}
break;
case rv_insn_xor:
/* Compare the target block with the first basic block of memcpy. If
* the two blocks match, fetch the instruction sequence starting from
* the pc_start of the basic block and compare it with the
* pre-recorded memcpy instruction sequence.
*/
if (ir->rd == 15 && ir->rs1 == 10 && ir->rs2 == 11) {
next_ir = ir + 1;
if (next_ir->opcode == rv_insn_andi && next_ir->imm == 3 &&
next_ir->rd == 15 && next_ir->rs1 == 15) {
next_ir = next_ir + 1;
if (next_ir->opcode == rv_insn_add && next_ir->rd == 17 &&
next_ir->rs1 == 10 && next_ir->rs2 == 12) {
next_ir = next_ir + 1;
if (next_ir->opcode == rv_insn_bne &&
next_ir->imm == 104 && next_ir->rs1 == 15 &&
next_ir->rs2 == 0) {
if (parse_memcpy(rv)) {
ir->opcode = rv_insn_fuse7;
ir->impl = dispatch_table[ir->opcode];
ir->tailcall = true;
};
}
}
}
}
break;
case rv_insn_auipc:
next_ir = ir + 1;
if (next_ir->opcode == rv_insn_addi && ir->rd == next_ir->rs1) {
Expand Down Expand Up @@ -676,7 +944,7 @@ static block_t *block_find_or_translate(riscv_t *rv)
if (likely(!rv->debug_mode))
#endif
/* macro operation fusion */
match_pattern(next);
match_pattern(rv, next);

/* insert the block into block map */
block_insert(&rv->block_map, next);
Expand Down

0 comments on commit e67f3be

Please sign in to comment.