From bc80e5e689ada01074c5b7a646e4e099028c44f7 Mon Sep 17 00:00:00 2001
From: Dmitry Stogov
Date: Fri, 22 Mar 2024 13:17:13 +0300
Subject: [PATCH] Extend GCM with the ability to split partially-dead nodes

The code is based on Benedikt Meurer's idea, first implemented in V8.
See: https://codereview.chromium.org/899433005
---
 ir.h                      |   5 +-
 ir_gcm.c                  | 299 ++++++++++++++++++++++++++++++++++++++
 ir_main.c                 |   3 +
 tests/c/switch_002.irt    |  11 +-
 tests/llvm/switch_002.irt |  11 +-
 5 files changed, 319 insertions(+), 10 deletions(-)

diff --git a/ir.h b/ir.h
index 16551b3b..3fbf7fcb 100644
--- a/ir.h
+++ b/ir.h
@@ -531,8 +531,9 @@ void ir_strtab_free(ir_strtab *strtab);
 
 /* debug related */
 #ifdef IR_DEBUG
-# define IR_DEBUG_SCCP      (1<<27)
-# define IR_DEBUG_GCM       (1<<28)
+# define IR_DEBUG_SCCP      (1<<26)
+# define IR_DEBUG_GCM       (1<<27)
+# define IR_DEBUG_GCM_SPLIT (1<<28)
 # define IR_DEBUG_SCHEDULE  (1<<29)
 # define IR_DEBUG_RA        (1<<30)
 #endif
diff --git a/ir_gcm.c b/ir_gcm.c
index b98b568d..deea7160 100644
--- a/ir_gcm.c
+++ b/ir_gcm.c
@@ -14,6 +14,8 @@
 #define IR_GCM_IS_SCHEDULED_EARLY(b) (((int32_t)(b)) < 0)
 #define IR_GCM_EARLY_BLOCK(b)        ((uint32_t)-((int32_t)(b)))
 
+#define IR_GCM_SPLIT 1
+
 static uint32_t ir_gcm_schedule_early(ir_ctx *ctx, ir_ref ref, ir_list *queue_rest)
 {
 	ir_ref n, *p, input;
@@ -80,6 +82,296 @@ static uint32_t ir_gcm_find_lca(ir_ctx *ctx, uint32_t b1, uint32_t b2)
 	return b2;
 }
 
+#if IR_GCM_SPLIT
+/* Partially Dead Code Elimination through splitting the node and sinking the clones
+ *
+ * This code is based on the idea of Benedikt Meurer first implemented in V8.
+ * See: https://codereview.chromium.org/899433005
+ */
+
+static void _push_predecessors(ir_ctx *ctx, int32_t b, ir_bitset totally_useful, ir_bitqueue *worklist)
+{
+	ir_block *bb = &ctx->cfg_blocks[b];
+	uint32_t *p, i, n = bb->predecessors_count;
+
+	for (p = ctx->cfg_edges + bb->predecessors; n > 0; p++, n--) {
+		i = *p;
+		if (!ir_bitset_in(totally_useful, i)) {
+			ir_bitqueue_add(worklist, i);
+		}
+	}
+}
+
+static bool _check_successors(ir_ctx *ctx, int32_t b, ir_bitset totally_useful)
+{
+	ir_block *bb = &ctx->cfg_blocks[b];
+	uint32_t *p, i, n = bb->successors_count;
+
+	for (p = ctx->cfg_edges + bb->successors; n > 0; p++, n--) {
+		i = *p;
+		if (!ir_bitset_in(totally_useful, i)) {
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static bool ir_split_partially_dead_node(ir_ctx *ctx, ir_ref ref, uint32_t b)
+{
+	ir_use_list *use_list;
+	ir_insn *insn;
+	ir_ref n, *p, use;
+	uint32_t i;
+	ir_bitqueue worklist;
+	ir_bitset totally_useful = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
+
+	/* 1. Find a set of blocks where the node is TOTALLY_USEFUL (not PARTIALLY_DEAD)
+	 *    1.1. Collect the blocks where the node is really USED.
+	 */
+	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
+	IR_ASSERT(b > 0 && b <= ctx->cfg_blocks_count);
+
+	use_list = &ctx->use_lists[ref];
+	n = use_list->count;
+	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
+		use = *p;
+		insn = &ctx->ir_base[use];
+		if (insn->op == IR_PHI) {
+			ir_ref *p = insn->ops + 2; /* PHI data inputs */
+			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
+			ir_ref n = insn->inputs_count - 1;
+
+			for (;n > 0; p++, q++, n--) {
+				if (*p == ref) {
+					i = ctx->cfg_map[*q];
+					IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
+					if (!ir_bitset_in(totally_useful, i)) {
+						if (i == b) goto exit; /* node is totally-useful in the scheduled block */
+						ir_bitset_incl(totally_useful, i);
+						_push_predecessors(ctx, i, totally_useful, &worklist);
+					}
+				}
+			}
+		} else {
+			i = ctx->cfg_map[use];
+			if (!i) {
+				continue;
+			}
+			IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
+			if (!ir_bitset_in(totally_useful, i)) {
+				ir_bitset_incl(totally_useful, i);
+				if (i == b) goto exit; /* node is totally-useful in the scheduled block */
+				_push_predecessors(ctx, i, totally_useful, &worklist);
+			}
+		}
+	}
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		bool first = 1;
+		fprintf(stderr, "*** Split partially dead node d_%d scheduled to BB%d\n", ref, b);
+		IR_BITSET_FOREACH(totally_useful, ir_bitset_len(ctx->cfg_blocks_count + 1), i) {
+			if (first) {
+				fprintf(stderr, "\td_%d is USED in [BB%d", ref, i);
+				first = 0;
+			} else {
+				fprintf(stderr, ", BB%d", i);
+			}
+		} IR_BITSET_FOREACH_END();
+		fprintf(stderr, "]\n");
+	}
+#endif
+
+	/* 1.2. Iteratively check the predecessors of already found TOTALLY_USEFUL blocks and
+	 *      add them into the TOTALLY_USEFUL set if all of their successors are already there.
+	 */
+	while ((i = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
+		if (!ir_bitset_in(totally_useful, i)
+		 && _check_successors(ctx, i, totally_useful)) {
+			if (i == b) goto exit; /* node is TOTALLY_USEFUL in the scheduled block */
+			ir_bitset_incl(totally_useful, i);
+			_push_predecessors(ctx, i, totally_useful, &worklist);
+		}
+	}
+
+	IR_ASSERT(!ir_bitset_in(totally_useful, b));
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		bool first = 1;
+		IR_BITSET_FOREACH(totally_useful, ir_bitset_len(ctx->cfg_blocks_count + 1), i) {
+			if (first) {
+				fprintf(stderr, "\td_%d is TOTALLY_USEFUL in [BB%d", ref, i);
+				first = 0;
+			} else {
+				fprintf(stderr, ", BB%d", i);
+			}
+		} IR_BITSET_FOREACH_END();
+		fprintf(stderr, "]\n");
+	}
+#endif
+
+	/* 2. Split the USEs into partitions */
+	use_list = &ctx->use_lists[ref];
+	ir_hashtab hash;
+	uint32_t j, clone, clones_count = 0, uses_count = 0;
+	struct {
+		ir_ref   ref;
+		uint32_t block;
+		uint32_t use_count;
+		uint32_t use;
+	} *clones = ir_mem_malloc(sizeof(*clones) * use_list->count);
+	struct {
+		ir_ref   ref;
+		uint32_t block;
+		uint32_t next;
+	} *uses = ir_mem_malloc(sizeof(*uses) * use_list->count);
+
+	ir_hashtab_init(&hash, use_list->count);
+	n = use_list->count;
+	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
+		use = *p;
+		insn = &ctx->ir_base[use];
+		if (insn->op == IR_PHI) {
+			ir_ref *p = insn->ops + 2; /* PHI data inputs */
+			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
+			ir_ref n = insn->inputs_count - 1;
+
+			/* PHIs must be processed once */
+			if (ir_hashtab_find(&hash, -use) != (ir_ref)IR_INVALID_VAL) {
+				continue;
+			}
+			ir_hashtab_add(&hash, -use, IR_NULL);
+			for (;n > 0; p++, q++, n--) {
+				if (*p == ref) {
+					j = i = ctx->cfg_map[*q];
+					while (ir_bitset_in(totally_useful, ctx->cfg_blocks[j].idom)) {
+						j = ctx->cfg_blocks[j].idom;
+					}
+					clone = ir_hashtab_find(&hash, j);
+					if (clone == IR_INVALID_VAL) {
+						clone = clones_count++;
+						ir_hashtab_add(&hash, j, clone);
+						clones[clone].block = j;
+						clones[clone].use_count = 0;
+						clones[clone].use = (uint32_t)-1;
+					}
+					uses[uses_count].ref = use;
+					uses[uses_count].block = i;
+					uses[uses_count].next = clones[clone].use;
+					clones[clone].use_count++;
+					clones[clone].use = uses_count++;
+				}
+			}
+		} else {
+			j = i = ctx->cfg_map[use];
+			IR_ASSERT(i > 0);
+			while (ir_bitset_in(totally_useful, ctx->cfg_blocks[j].idom)) {
+				j = ctx->cfg_blocks[j].idom;
+			}
+			clone = ir_hashtab_find(&hash, j);
+			if (clone == IR_INVALID_VAL) {
+				clone = clones_count++;
+				ir_hashtab_add(&hash, j, clone);
+				clones[clone].block = j;
+				clones[clone].use_count = 0;
+				clones[clone].use = -1;
+			}
+			uses[uses_count].ref = use;
+			uses[uses_count].block = i;
+			uses[uses_count].next = clones[clone].use;
+			clones[clone].use_count++;
+			clones[clone].use = uses_count++;
+		}
+	}
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		for (i = 0; i < clones_count; i++) {
+			uint32_t u = clones[i].use;
+
+			fprintf(stderr, "\tCLONE #%d in BB%d USES(%d)=[d_%d/BB%d",
+				i, clones[i].block, clones[i].use_count, uses[u].ref, uses[u].block);
+			u = uses[u].next;
+			while (u != (uint32_t)-1) {
+				fprintf(stderr, ", d_%d/BB%d", uses[u].ref, uses[u].block);
+				u = uses[u].next;
+			}
+			fprintf(stderr, "]\n");
+		}
+	}
+#endif
+
+	/* Create Clones */
+	insn = &ctx->ir_base[ref];
+	clones[0].ref = ref;
+	for (i = 1; i < clones_count; i++) {
+		clones[i].ref = clone = ir_emit(ctx, insn->optx, insn->op1, insn->op2, insn->op3);
+		if (insn->op1 > 0) ir_use_list_add(ctx, insn->op1, clone);
+		if (insn->op2 > 0) ir_use_list_add(ctx, insn->op2, clone);
+		if (insn->op3 > 0) ir_use_list_add(ctx, insn->op3, clone);
+	}
+
+	/* Reconstruct IR: update DEF->USE lists, CFG mapping, etc. */
+	ctx->use_lists = ir_mem_realloc(ctx->use_lists, ctx->insns_count * sizeof(ir_use_list));
+	ctx->cfg_map = ir_mem_realloc(ctx->cfg_map, ctx->insns_count * sizeof(uint32_t));
+	n = ctx->use_lists[ref].refs;
+	for (i = 0; i < clones_count; i++) {
+		clone = clones[i].ref;
+		ctx->cfg_map[clone] = clones[i].block;
+		ctx->use_lists[clone].count = clones[i].use_count;
+		ctx->use_lists[clone].refs = n;
+
+		uint32_t u = clones[i].use;
+		while (u != (uint32_t)-1) {
+			use = uses[u].ref;
+			ctx->use_edges[n++] = use;
+			u = uses[u].next;
+			if (i > 0) {
+				/* replace inputs */
+				ir_insn *insn = &ctx->ir_base[use];
+				ir_ref k, l = insn->inputs_count;
+
+				for (k = 1; k <= l; k++) {
+					if (ir_insn_op(insn, k) == ref) {
+						if (insn->op == IR_PHI) {
+							j = ctx->cfg_map[ir_insn_op(&ctx->ir_base[insn->op1], k - 1)];
+							while (ir_bitset_in(totally_useful, ctx->cfg_blocks[j].idom)) {
+								j = ctx->cfg_blocks[j].idom;
+							}
+							if (j != clones[i].block) {
+								continue;
+							}
+						}
+						ir_insn_set_op(insn, k, clone);
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	ir_mem_free(uses);
+	ir_mem_free(clones);
+	ir_hashtab_free(&hash);
+	ir_mem_free(totally_useful);
+	ir_bitqueue_free(&worklist);
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		ir_check(ctx);
+	}
+#endif
+
+	return 1;
+
+exit:
+	ir_mem_free(totally_useful);
+	ir_bitqueue_free(&worklist);
+	return 0;
+}
+#endif
+
 static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
 {
 	ir_ref n, use;
@@ -122,6 +414,13 @@ static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
 	IR_ASSERT(lca != 0 && "No Common Ancestor");
 	b = lca;
 
+#if IR_GCM_SPLIT
+	if (ctx->use_lists[ref].count > 1
+	 && ir_split_partially_dead_node(ctx, ref, b)) {
+		return;
+	}
+#endif
+
 	if (b != ctx->cfg_map[ref]) {
 		ir_block *bb = &ctx->cfg_blocks[b];
 		uint32_t loop_depth = bb->loop_depth;
diff --git a/ir_main.c b/ir_main.c
index 900aa5a2..1a8f2905 100644
--- a/ir_main.c
+++ b/ir_main.c
@@ -86,6 +86,7 @@ static void help(const char *cmd)
 #ifdef IR_DEBUG
 "  --debug-sccp               - debug SCCP optimization pass\n"
 "  --debug-gcm                - debug GCM optimization pass\n"
+"  --debug-gcm-split          - debug floating node splitting\n"
 "  --debug-schedule           - debug SCHEDULE optimization pass\n"
 "  --debug-ra                 - debug register allocator\n"
 "  --debug-regset             - restrict available register set\n"
@@ -1123,6 +1124,8 @@ int main(int argc, char **argv)
 			flags |= IR_DEBUG_SCCP;
 		} else if (strcmp(argv[i], "--debug-gcm") == 0) {
 			flags |= IR_DEBUG_GCM;
+		} else if (strcmp(argv[i], "--debug-gcm-split") == 0) {
+			flags |= IR_DEBUG_GCM_SPLIT;
 		} else if (strcmp(argv[i], "--debug-schedule") == 0) {
 			flags |= IR_DEBUG_SCHEDULE;
 		} else if (strcmp(argv[i], "--debug-ra") == 0) {
diff --git a/tests/c/switch_002.irt b/tests/c/switch_002.irt
index 524089c1..220cddbf 100644
--- a/tests/c/switch_002.irt
+++ b/tests/c/switch_002.irt
@@ -35,18 +35,23 @@ int32_t test(int32_t z)
 {
 	int32_t d_1 = z;
 	int32_t d_2;
-	d_2 = d_1 - 2;
 	switch (d_1) {
 		case 0: goto bb2;
 		case 1: goto bb3;
-		case 2: goto bb6;
-		default: goto bb6;
+		case 2: goto bb4;
+		default: goto bb5;
 	}
 bb2:
 	d_2 = d_1 * 3;
 	goto bb6;
 bb3:
 	d_2 = d_1 + 1;
+	goto bb6;
+bb4:
+	d_2 = d_1 - 2;
+	goto bb6;
+bb5:
+	d_2 = d_1 - 2;
 bb6:
 	return d_2;
 }
diff --git a/tests/llvm/switch_002.irt b/tests/llvm/switch_002.irt
index 10ccca2e..8fff1a5a 100644
--- a/tests/llvm/switch_002.irt
+++ b/tests/llvm/switch_002.irt
@@ -33,23 +33,24 @@ SWITCH 001
 --EXPECT--
 define i32 @test(i32 %d2)
 {
-	%d3 = sub i32 %d2, 2
 	switch i32 %d2, label %l5 [
 		i32 0, label %l2
 		i32 1, label %l3
 		i32 2, label %l4
 	]
 l2:
-	%d6 = mul i32 %d2, 3
+	%d5 = mul i32 %d2, 3
 	br label %l6
 l3:
-	%d9 = add i32 %d2, 1
+	%d8 = add i32 %d2, 1
 	br label %l6
 l4:
+	%d11 = sub i32 %d2, 2
 	br label %l6
 l5:
+	%d14 = sub i32 %d2, 2
 	br label %l6
 l6:
-	%d17 = phi i32 [%d6, %l2], [%d9, %l3], [%d3, %l4], [%d3, %l5]
-	ret i32 %d17
+	%d18 = phi i32 [%d5, %l2], [%d8, %l3], [%d11, %l4], [%d14, %l5]
+	ret i32 %d18
 }
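
Reviewer note (not part of the patch): below is a minimal standalone sketch of
step 1 of ir_split_partially_dead_node() applied to the CFG of
tests/c/switch_002.irt, for experimenting with the TOTALLY_USEFUL propagation
outside of the IR code base. The block numbering, the adjacency arrays and the
helper name block_is_totally_useful() are invented for this example, and the
patch's bit-set/work-queue machinery is replaced by a plain fixpoint loop over
all blocks; it illustrates the idea only and is not the patch's code.

#include <stdbool.h>
#include <stdio.h>

#define NB 7 /* blocks BB1..BB6, index 0 unused */

/* Toy CFG mirroring tests/c/switch_002.irt:
 * BB1 -> BB2, BB3, BB4, BB5; BB2..BB5 -> BB6. */
static const int succ[NB][4] = { {0}, {2, 3, 4, 5}, {6}, {6}, {6}, {6}, {0} };
static const int nsucc[NB]   = {  0,  4,            1,   1,   1,   1,   0  };

/* Returns true when block b ends up in the TOTALLY_USEFUL set:
 * a block joins the set once all of its successors are already in it. */
static bool block_is_totally_useful(bool *useful, int b)
{
	bool changed = true;

	/* simple fixpoint over all blocks; the patch drives the same rule
	 * with a work queue seeded by the predecessors of the USED blocks */
	while (changed) {
		changed = false;
		for (int i = 1; i < NB; i++) {
			if (useful[i] || nsucc[i] == 0) {
				continue;
			}
			bool all = true;
			for (int k = 0; k < nsucc[i]; k++) {
				if (!useful[succ[i][k]]) {
					all = false;
					break;
				}
			}
			if (all) {
				useful[i] = changed = true;
			}
		}
	}
	return useful[b];
}

int main(void)
{
	bool useful[NB] = { false };

	/* 1.1: the value (d_1 - 2) is USED only on the PHI edges coming from
	 * BB4 and BB5, so those two blocks are the TOTALLY_USEFUL seeds */
	useful[4] = useful[5] = true;

	/* GCM's late scheduling would place the value into BB1 (the LCA of its
	 * uses); BB1 never becomes TOTALLY_USEFUL because BB2 and BB3 do not
	 * need the value, so the node is partially dead there and the patch
	 * splits it, sinking one clone into BB4 and one into BB5 */
	if (!block_is_totally_useful(useful, 1)) {
		printf("BB1 is not TOTALLY_USEFUL -> split and sink the clones\n");
	}
	return 0;
}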