Extend GCM with an ability to split partially-dead nodes
The code is based on Benedikt Meurer's idea, first implemented in V8.
See: https://codereview.chromium.org/899433005
dstogov committed Mar 22, 2024
1 parent de8b054 commit 1bf0780
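
A node is "partially dead" when its value is computed on paths that never use it. Splitting duplicates the node and sinks each clone to where its value is actually consumed. A rough C-level analogy (illustrative only, not code from this commit):

int before(int z, int flag) {
	int t = z * 3;       /* computed even when flag is false, where t is dead */
	return flag ? t : z;
}

int after(int z, int flag) {
	if (flag) {
		int t = z * 3;   /* the clone is sunk into the only path that uses it */
		return t;
	}
	return z;
}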
Showing 5 changed files with 319 additions and 10 deletions.
5 changes: 3 additions & 2 deletions ir.h
@@ -531,8 +531,9 @@ void ir_strtab_free(ir_strtab *strtab);

/* debug related */
#ifdef IR_DEBUG
-# define IR_DEBUG_SCCP (1<<27)
-# define IR_DEBUG_GCM (1<<28)
+# define IR_DEBUG_SCCP (1<<26)
+# define IR_DEBUG_GCM (1<<27)
+# define IR_DEBUG_GCM_SPLIT (1<<28)
# define IR_DEBUG_SCHEDULE (1<<29)
# define IR_DEBUG_RA (1<<30)
#endif
299 changes: 299 additions & 0 deletions ir_gcm.c
@@ -14,6 +14,8 @@
#define IR_GCM_IS_SCHEDULED_EARLY(b) (((int32_t)(b)) < 0)
#define IR_GCM_EARLY_BLOCK(b) ((uint32_t)-((int32_t)(b)))

#define IR_GCM_SPLIT 1

static uint32_t ir_gcm_schedule_early(ir_ctx *ctx, ir_ref ref, ir_list *queue_rest)
{
	ir_ref n, *p, input;
@@ -80,6 +82,296 @@ static uint32_t ir_gcm_find_lca(ir_ctx *ctx, uint32_t b1, uint32_t b2)
	return b2;
}

#if IR_GCM_SPLIT
/* Partially Dead Code Elimination through splitting the node and sinking the clones
 *
 * This code is based on Benedikt Meurer's idea, first implemented in V8.
 * See: https://codereview.chromium.org/899433005
 */
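/* A summary of the three phases below (added commentary, not part of the
 * original commit):
 *   1. Compute the set of blocks where the node is TOTALLY_USEFUL by
 *      backward propagation from the blocks that really use it.
 *   2. Partition the uses, grouping those that can share a clone hoisted
 *      to a common TOTALLY_USEFUL dominator.
 *   3. Materialize one clone per partition and rewire use lists, inputs
 *      and the CFG mapping.
 */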

static void _push_predecessors(ir_ctx *ctx, int32_t b, ir_bitset totally_useful, ir_bitqueue *worklist)
{
	ir_block *bb = &ctx->cfg_blocks[b];
	uint32_t *p, i, n = bb->predecessors_count;

	for (p = ctx->cfg_edges + bb->predecessors; n > 0; p++, n--) {
		i = *p;
		if (!ir_bitset_in(totally_useful, i)) {
			ir_bitqueue_add(worklist, i);
		}
	}
}

static bool _check_successors(ir_ctx *ctx, int32_t b, ir_bitset totally_useful)
{
	ir_block *bb = &ctx->cfg_blocks[b];
	uint32_t *p, i, n = bb->successors_count;

	for (p = ctx->cfg_edges + bb->successors; n > 0; p++, n--) {
		i = *p;
		if (!ir_bitset_in(totally_useful, i)) {
			return 0;
		}
	}
	return 1;
}
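
/* Added commentary: together these helpers implement the backward dataflow
 * of phase 1. A block becomes TOTALLY_USEFUL once every successor already
 * is, and its predecessors then become new worklist candidates. */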

static bool ir_split_partially_dead_node(ir_ctx *ctx, ir_ref ref, uint32_t b)
{
	ir_use_list *use_list;
	ir_insn *insn;
	ir_ref n, *p, use;
	uint32_t i;
	ir_bitqueue worklist;
	ir_bitset totally_useful = ir_bitset_malloc(ctx->cfg_blocks_count + 1);

	/* 1. Find a set of blocks where the node is TOTALLY_USEFUL (not PARTIALLY_DEAD)
	 * 1.1. Collect the blocks where the node is really USED.
	 */
	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
	IR_ASSERT(b > 0 && b <= ctx->cfg_blocks_count);

	use_list = &ctx->use_lists[ref];
	n = use_list->count;
	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
		use = *p;
		insn = &ctx->ir_base[use];
		if (insn->op == IR_PHI) {
			ir_ref *p = insn->ops + 2;                   /* PHI data inputs */
			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs    */
			ir_ref n = insn->inputs_count - 1;

			for (; n > 0; p++, q++, n--) {
				if (*p == ref) {
					i = ctx->cfg_map[*q];
					IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
					if (!ir_bitset_in(totally_useful, i)) {
						if (i == b) goto exit; /* node is totally-useful in the scheduled block */
						ir_bitset_incl(totally_useful, i);
						_push_predecessors(ctx, i, totally_useful, &worklist);
					}
				}
			}
		} else {
			i = ctx->cfg_map[use];
			if (!i) {
				continue;
			}
			IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
			if (!ir_bitset_in(totally_useful, i)) {
				ir_bitset_incl(totally_useful, i);
				if (i == b) goto exit; /* node is totally-useful in the scheduled block */
				_push_predecessors(ctx, i, totally_useful, &worklist);
			}
		}
	}

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
		bool first = 1;
		fprintf(stderr, "*** Split partially dead node d_%d scheduled to BB%d\n", ref, b);
		IR_BITSET_FOREACH(totally_useful, ir_bitset_len(ctx->cfg_blocks_count + 1), i) {
			if (first) {
				fprintf(stderr, "\td_%d is USED in [BB%d", ref, i);
				first = 0;
			} else {
				fprintf(stderr, ", BB%d", i);
			}
		} IR_BITSET_FOREACH_END();
		fprintf(stderr, "]\n");
	}
#endif

	/* 1.2. Iteratively check the predecessors of already found TOTALLY_USEFUL blocks and
	 *      add them into the TOTALLY_USEFUL set if all of their successors are already there.
	 */
	while ((i = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
		if (!ir_bitset_in(totally_useful, i)
		 && _check_successors(ctx, i, totally_useful)) {
			if (i == b) goto exit; /* node is TOTALLY_USEFUL in the scheduled block */
			ir_bitset_incl(totally_useful, i);
			_push_predecessors(ctx, i, totally_useful, &worklist);
		}
	}

	IR_ASSERT(!ir_bitset_in(totally_useful, b));

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
		bool first = 1;
		IR_BITSET_FOREACH(totally_useful, ir_bitset_len(ctx->cfg_blocks_count + 1), i) {
			if (first) {
				fprintf(stderr, "\td_%d is TOTALLY_USEFUL in [BB%d", ref, i);
				first = 0;
			} else {
				fprintf(stderr, ", BB%d", i);
			}
		} IR_BITSET_FOREACH_END();
		fprintf(stderr, "]\n");
	}
#endif

	/* 2. Split the USEs into partitions */
	use_list = &ctx->use_lists[ref];
	ir_hashtab hash;
	uint32_t j, clone, clones_count = 0, uses_count = 0;
	struct {
		ir_ref   ref;
		uint32_t block;
		uint32_t use_count;
		uint32_t use;
	} *clones = ir_mem_malloc(sizeof(*clones) * use_list->count);
	struct {
		ir_ref   ref;
		uint32_t block;
		uint32_t next;
	} *uses = ir_mem_malloc(sizeof(*uses) * use_list->count);

	ir_hashtab_init(&hash, use_list->count);
	n = use_list->count;
	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
		use = *p;
		insn = &ctx->ir_base[use];
		if (insn->op == IR_PHI) {
			ir_ref *p = insn->ops + 2;                   /* PHI data inputs */
			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs    */
			ir_ref n = insn->inputs_count - 1;

			/* PHIs must be processed once */
			if (ir_hashtab_find(&hash, -use) != (ir_ref)IR_INVALID_VAL) {
				continue;
			}
			ir_hashtab_add(&hash, -use, IR_NULL);
			for (; n > 0; p++, q++, n--) {
				if (*p == ref) {
					j = i = ctx->cfg_map[*q];
					while (ir_bitset_in(totally_useful, ctx->cfg_blocks[j].idom)) {
						j = ctx->cfg_blocks[j].idom;
					}
					clone = ir_hashtab_find(&hash, j);
					if (clone == IR_INVALID_VAL) {
						clone = clones_count++;
						ir_hashtab_add(&hash, j, clone);
						clones[clone].block = j;
						clones[clone].use_count = 0;
						clones[clone].use = (uint32_t)-1;
					}
					uses[uses_count].ref = use;
					uses[uses_count].block = i;
					uses[uses_count].next = clones[clone].use;
					clones[clone].use_count++;
					clones[clone].use = uses_count++;
				}
			}
		} else {
			j = i = ctx->cfg_map[use];
			IR_ASSERT(i > 0);
			while (ir_bitset_in(totally_useful, ctx->cfg_blocks[j].idom)) {
				j = ctx->cfg_blocks[j].idom;
			}
			clone = ir_hashtab_find(&hash, j);
			if (clone == IR_INVALID_VAL) {
				clone = clones_count++;
				ir_hashtab_add(&hash, j, clone);
				clones[clone].block = j;
				clones[clone].use_count = 0;
				clones[clone].use = -1;
			}
			uses[uses_count].ref = use;
			uses[uses_count].block = i;
			uses[uses_count].next = clones[clone].use;
			clones[clone].use_count++;
			clones[clone].use = uses_count++;
		}
	}

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
		for (i = 0; i < clones_count; i++) {
			uint32_t u = clones[i].use;

			fprintf(stderr, "\tCLONE #%d in BB%d USES(%d)=[d_%d/BB%d",
				i, clones[i].block, clones[i].use_count, uses[u].ref, uses[u].block);
			u = uses[u].next;
			while (u != (uint32_t)-1) {
				fprintf(stderr, ", d_%d/BB%d", uses[u].ref, uses[u].block);
				u = uses[u].next;
			}
			fprintf(stderr, "]\n");
		}
	}
#endif

	/* Create Clones */
	insn = &ctx->ir_base[ref];
	clones[0].ref = ref;
	{
		/* Copy the fields first: ir_emit() may reallocate ctx->ir_base
		 * and leave 'insn' dangling on later iterations. */
		uint32_t optx = insn->optx;
		ir_ref op1 = insn->op1, op2 = insn->op2, op3 = insn->op3;

		for (i = 1; i < clones_count; i++) {
			clones[i].ref = clone = ir_emit(ctx, optx, op1, op2, op3);
			if (op1 > 0) ir_use_list_add(ctx, op1, clone);
			if (op2 > 0) ir_use_list_add(ctx, op2, clone);
			if (op3 > 0) ir_use_list_add(ctx, op3, clone);
		}
	}

	/* Reconstruct IR: update DEF->USE lists, CFG mapping, etc. */
	ctx->use_lists = ir_mem_realloc(ctx->use_lists, ctx->insns_count * sizeof(ir_use_list));
	ctx->cfg_map = ir_mem_realloc(ctx->cfg_map, ctx->insns_count * sizeof(uint32_t));
	n = ctx->use_lists[ref].refs;
	for (i = 0; i < clones_count; i++) {
		clone = clones[i].ref;
		ctx->cfg_map[clone] = clones[i].block;
		ctx->use_lists[clone].count = clones[i].use_count;
		ctx->use_lists[clone].refs = n;

		uint32_t u = clones[i].use;
		while (u != (uint32_t)-1) {
			use = uses[u].ref;
			ctx->use_edges[n++] = use;
			u = uses[u].next;
			if (i > 0) {
				/* replace inputs */
				ir_insn *insn = &ctx->ir_base[use];
				ir_ref k, l = insn->inputs_count;

				for (k = 1; k <= l; k++) {
					if (ir_insn_op(insn, k) == ref) {
						if (insn->op == IR_PHI) {
							j = ctx->cfg_map[ir_insn_op(&ctx->ir_base[insn->op1], k - 1)];
							while (ir_bitset_in(totally_useful, ctx->cfg_blocks[j].idom)) {
								j = ctx->cfg_blocks[j].idom;
							}
							if (j != clones[i].block) {
								continue;
							}
						}
						ir_insn_set_op(insn, k, clone);
						break;
					}
				}
			}
		}
	}

	ir_mem_free(uses);
	ir_mem_free(clones);
	ir_hashtab_free(&hash);
	ir_mem_free(totally_useful);
	ir_bitqueue_free(&worklist);

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
		ir_check(ctx);
	}
#endif

	return 1;

exit:
	ir_mem_free(totally_useful);
	ir_bitqueue_free(&worklist);
	return 0;
}
#endif

static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
{
	ir_ref n, use;
@@ -122,6 +414,13 @@ static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
	IR_ASSERT(lca != 0 && "No Common Ancestor");
	b = lca;

#if IR_GCM_SPLIT
	if (ctx->use_lists[ref].count > 1
	 && ir_split_partially_dead_node(ctx, ref, b)) {
		return;
	}
#endif

	if (b != ctx->cfg_map[ref]) {
		ir_block *bb = &ctx->cfg_blocks[b];
		uint32_t loop_depth = bb->loop_depth;
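The heart of the partitioning above is the idom climb in step 2. A minimal sketch of that rule under simplified assumptions (clone_home, idom and totally_useful are hypothetical names and types, not the library's API):

#include <stdbool.h>
#include <stdint.h>

/* Hoist a use's clone up the dominator tree while the immediate dominator
 * is TOTALLY_USEFUL; uses whose climbs land in the same block share one
 * clone. */
static uint32_t clone_home(const uint32_t *idom, const bool *totally_useful,
                           uint32_t use_block)
{
	uint32_t j = use_block;
	while (totally_useful[idom[j]]) {
		j = idom[j];
	}
	return j;
}

The number of clones the pass creates equals the number of distinct blocks this climb can land in.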
3 changes: 3 additions & 0 deletions ir_main.c
@@ -86,6 +86,7 @@ static void help(const char *cmd)
#ifdef IR_DEBUG
" --debug-sccp - debug SCCP optimization pass\n"
" --debug-gcm - debug GCM optimization pass\n"
" --debug-gcm-split - debug floating node splitting\n"
" --debug-schedule - debug SCHEDULE optimization pass\n"
" --debug-ra - debug register allocator\n"
" --debug-regset <bit-mask> - restrict available register set\n"
@@ -1123,6 +1124,8 @@ int main(int argc, char **argv)
				flags |= IR_DEBUG_SCCP;
			} else if (strcmp(argv[i], "--debug-gcm") == 0) {
				flags |= IR_DEBUG_GCM;
+			} else if (strcmp(argv[i], "--debug-gcm-split") == 0) {
+				flags |= IR_DEBUG_GCM_SPLIT;
			} else if (strcmp(argv[i], "--debug-schedule") == 0) {
				flags |= IR_DEBUG_SCHEDULE;
			} else if (strcmp(argv[i], "--debug-ra") == 0) {
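The new switch composes with the existing debug flags; a hypothetical invocation (the binary name, the -S dump option and the input file are assumptions, not part of this diff):

./ir test.ir -S --debug-gcm --debug-gcm-split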
11 changes: 8 additions & 3 deletions tests/c/switch_002.irt
@@ -35,18 +35,23 @@ int32_t test(int32_t z)
{
	int32_t d_1 = z;
	int32_t d_2;
-	d_2 = d_1 - 2;
	switch (d_1) {
		case 0: goto bb2;
		case 1: goto bb3;
-		case 2: goto bb6;
-		default: goto bb6;
+		case 2: goto bb4;
+		default: goto bb5;
	}
bb2:
	d_2 = d_1 * 3;
	goto bb6;
bb3:
	d_2 = d_1 + 1;
	goto bb6;
+bb4:
+	d_2 = d_1 - 2;
+	goto bb6;
+bb5:
+	d_2 = d_1 - 2;
bb6:
	return d_2;
}
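
The updated expectation shows the pass at work: d_2 = d_1 - 2 used to be computed unconditionally before the switch; now the subtraction is cloned into the new bb4 and bb5, the only paths that consume that value.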
(The diff of the fifth changed file did not load.)
