diff --git a/src/x32-transposec/gen/x32-transposec-16x8-rvv.c b/src/x32-transposec/gen/x32-transposec-16x8-rvv.c
index 97c4f2ae32c..d6357f59b4b 100644
--- a/src/x32-transposec/gen/x32-transposec-16x8-rvv.c
+++ b/src/x32-transposec/gen/x32-transposec-16x8-rvv.c
@@ -16,13 +16,13 @@
 #include
 
 void xnn_x32_transposec_ukernel__16x8_rvv(
-    const uint32_t* input,
-    uint32_t* output,
-    size_t input_stride,
-    size_t output_stride,
-    size_t block_width,
-    size_t block_height,
-    const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+  const uint32_t* input,
+  uint32_t* output,
+  size_t input_stride,
+  size_t output_stride,
+  size_t block_width,
+  size_t block_height,
+  const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
 assert(output_stride >= block_height * sizeof(uint32_t));
 assert(input_stride >= block_width * sizeof(uint32_t));
@@ -50,293 +50,293 @@ void xnn_x32_transposec_ukernel__16x8_rvv(
 size_t bh = block_height;
 size_t vl = __riscv_vsetvl_e32m1(tile_height);
 for (; bh >= 16; bh -= 16) {
- if (block_width >= tile_width) {
- vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i0, input_stride, vl);
-
- vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0);
+ if (block_width >= tile_width) {
+ vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i0, input_stride, vl);
+
+ vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0);
+ __riscv_vse32_v_u32m1(o0, v_d0, vl);
+ vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1);
+ __riscv_vse32_v_u32m1(o1, v_d1, vl);
+ vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2);
+ __riscv_vse32_v_u32m1(o2, v_d2, vl);
+ vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3);
+ __riscv_vse32_v_u32m1(o3, v_d3, vl);
+ vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4);
+ __riscv_vse32_v_u32m1(o4, v_d4, vl);
+ vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5);
+ __riscv_vse32_v_u32m1(o5, v_d5, vl);
+ vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6);
+ __riscv_vse32_v_u32m1(o6, v_d6, vl);
+ vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7);
+ __riscv_vse32_v_u32m1(o7, v_d7, vl);
+
+ } else {
+ switch (block_width) {
+ case 7: {
+ vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i0, input_stride, vl);
+
+ vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0);
 __riscv_vse32_v_u32m1(o0, v_d0, vl);
- vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1);
+ vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1);
 __riscv_vse32_v_u32m1(o1, v_d1, vl);
- vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2);
+ vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2);
 __riscv_vse32_v_u32m1(o2, v_d2, vl);
- vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3);
+ vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3);
 __riscv_vse32_v_u32m1(o3, v_d3, vl);
- vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4);
+ vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4);
 __riscv_vse32_v_u32m1(o4, v_d4, vl);
- vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5);
+ vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5);
 __riscv_vse32_v_u32m1(o5, v_d5, vl);
- vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6);
+ vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6);
 __riscv_vse32_v_u32m1(o6, v_d6, vl);
- vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7);
- __riscv_vse32_v_u32m1(o7, v_d7, vl);
-
- } else {
- switch (block_width) {
- case 7: {
- vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i0, 
input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - break; - } - - case 6: { - vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - break; - } - - case 5: { - vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - break; - } - - case 4: { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - break; - } - - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } - } + break; + } + + case 6: { + vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i0, input_stride, vl); + + vuint32m1_t 
v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + break; + } + + case 5: { + vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + break; + } + + case 4: { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); - i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); - o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); - o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); - o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); - o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); - o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); - o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); - o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); - o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + break; + } + + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + break; + } + + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } + + default: + XNN_UNREACHABLE; + } + } + + i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); + o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); + o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); + o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); + o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); + o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); + o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); + o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); + o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); } if (bh != 0) { - const uint32_t* i = i0; 
- vl = __riscv_vsetvl_e32m1(bh); - if (block_width >= tile_width) { - vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); - __riscv_vse32_v_u32m1(o7, v_d7, vl); - } else { - switch(block_width) { - case 7: { - vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - break; - } - case 6: { - vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - break; - } - case 5: { - vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - break; - } - case 4: { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - 
__riscv_vse32_v_u32m1(o3, v_d3, vl); - break; - } - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } - } + const uint32_t* i = i0; + vl = __riscv_vsetvl_e32m1(bh); + if (block_width >= tile_width) { + vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); + __riscv_vse32_v_u32m1(o6, v_d6, vl); + vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); + __riscv_vse32_v_u32m1(o7, v_d7, vl); + } else { + switch(block_width) { + case 7: { + vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); + __riscv_vse32_v_u32m1(o6, v_d6, vl); + break; + } + case 6: { + vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i, input_stride, vl); - if (bh & 8) { - o7 += 8; - o6 += 8; - o5 += 8; - o4 += 8; - o3 += 8; - o2 += 8; - o1 += 8; - o0 += 8; - i = (uint32_t*) ((uintptr_t) i + input_stride * 8); - } - if (bh & 4) { - o7 += 4; - o6 += 4; - o5 += 4; - o4 += 4; - o3 += 4; - o2 += 4; - o1 += 4; - o0 += 4; - i = (uint32_t*) ((uintptr_t) i + input_stride * 4); - } - if (bh & 2) { - o7 += 2; - o6 += 2; - o5 += 2; - o4 += 2; - o3 += 2; - o2 += 2; - o1 += 2; - o0 += 2; - i = (uint32_t*) ((uintptr_t) i + input_stride * 2); + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = 
__riscv_vget_v_u32m1x6_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + break; + } + case 5: { + vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + break; + } + case 4: { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + break; + } + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + break; + } + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } + + default: + XNN_UNREACHABLE; } + } + + if (bh & 8) { + o7 += 8; + o6 += 8; + o5 += 8; + o4 += 8; + o3 += 8; + o2 += 8; + o1 += 8; + o0 += 8; + i = (uint32_t*) ((uintptr_t) i + input_stride * 8); + } + if (bh & 4) { + o7 += 4; + o6 += 4; + o5 += 4; + o4 += 4; + o3 += 4; + o2 += 4; + o1 += 4; + o0 += 4; + i = (uint32_t*) ((uintptr_t) i + input_stride * 4); + } + if (bh & 2) { + o7 += 2; + o6 += 2; + o5 += 2; + o4 += 2; + o3 += 2; + o2 += 2; + o1 += 2; + o0 += 2; + i = (uint32_t*) ((uintptr_t) i + input_stride * 2); + } } i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); diff --git a/src/x32-transposec/gen/x32-transposec-32x8-rvv.c b/src/x32-transposec/gen/x32-transposec-32x8-rvv.c index b44dae5e4c8..c18cab2802c 100644 --- a/src/x32-transposec/gen/x32-transposec-32x8-rvv.c +++ b/src/x32-transposec/gen/x32-transposec-32x8-rvv.c @@ -16,13 +16,13 @@ #include void xnn_x32_transposec_ukernel__32x8_rvv( - const uint32_t* input, - uint32_t* output, - size_t input_stride, - size_t output_stride, - size_t block_width, - size_t block_height, - const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t 
output_stride, + size_t block_width, + size_t block_height, + const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint32_t)); assert(input_stride >= block_width * sizeof(uint32_t)); @@ -50,304 +50,304 @@ void xnn_x32_transposec_ukernel__32x8_rvv( size_t bh = block_height; size_t vl = __riscv_vsetvl_e32m1(tile_height); for (; bh >= 32; bh -= 32) { - if (block_width >= tile_width) { - vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); + if (block_width >= tile_width) { + vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); + __riscv_vse32_v_u32m1(o6, v_d6, vl); + vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); + __riscv_vse32_v_u32m1(o7, v_d7, vl); + + } else { + switch (block_width) { + case 7: { + vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); __riscv_vse32_v_u32m1(o6, v_d6, vl); - vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); - __riscv_vse32_v_u32m1(o7, v_d7, vl); - - } else { - switch (block_width) { - case 7: { - vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = 
__riscv_vget_v_u32m1x7_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - break; - } - - case 6: { - vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - break; - } - - case 5: { - vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - break; - } - - case 4: { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - break; - } - - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } - } + break; + } + + case 6: { + vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + break; + } - i0 = (uint32_t*) ((uintptr_t) i0 + 
input_offset); - o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); - o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); - o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); - o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); - o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); - o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); - o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); - o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); + case 5: { + vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + break; + } + + case 4: { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + break; + } + + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + break; + } + + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } + + default: + XNN_UNREACHABLE; + } + } + + i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); + o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); + o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); + o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); + o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); + o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); + o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); + o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); + o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); } if (bh != 0) { - const uint32_t* i = i0; - vl = __riscv_vsetvl_e32m1(bh); - if (block_width >= tile_width) { - vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, 
v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); - __riscv_vse32_v_u32m1(o7, v_d7, vl); - } else { - switch(block_width) { - case 7: { - vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - break; - } - case 6: { - vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - break; - } - case 5: { - vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - break; - } - case 4: { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - break; - } - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - 
vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } - } + const uint32_t* i = i0; + vl = __riscv_vsetvl_e32m1(bh); + if (block_width >= tile_width) { + vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); + __riscv_vse32_v_u32m1(o6, v_d6, vl); + vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); + __riscv_vse32_v_u32m1(o7, v_d7, vl); + } else { + switch(block_width) { + case 7: { + vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); + __riscv_vse32_v_u32m1(o6, v_d6, vl); + break; + } + case 6: { + vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i, input_stride, vl); - if (bh & 16) { - o7 += 16; - o6 += 16; - o5 += 16; - o4 += 16; - o3 += 16; - o2 += 16; - o1 += 16; - o0 += 16; - i = (uint32_t*) ((uintptr_t) i + input_stride * 16); - } - if (bh & 8) { - o7 += 8; - o6 += 8; - o5 += 8; - o4 += 8; - o3 += 8; - o2 += 8; - o1 += 8; - o0 += 8; - i = (uint32_t*) ((uintptr_t) i + input_stride * 8); - } - if (bh & 4) { - o7 += 4; - o6 += 4; - o5 += 4; - o4 += 4; - o3 += 4; - o2 += 4; - o1 += 4; - o0 += 4; - i = (uint32_t*) ((uintptr_t) i + input_stride * 4); - } - if (bh & 2) { - o7 += 2; - o6 += 2; - o5 += 2; - o4 += 2; - o3 += 2; - o2 += 2; - o1 += 2; - o0 += 2; - i = (uint32_t*) ((uintptr_t) i + input_stride * 2); + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + break; + } + case 5: { + vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i, input_stride, vl); + + vuint32m1_t v_d0 = 
__riscv_vget_v_u32m1x5_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + break; + } + case 4: { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + break; + } + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + break; + } + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } + + default: + XNN_UNREACHABLE; } + } + + if (bh & 16) { + o7 += 16; + o6 += 16; + o5 += 16; + o4 += 16; + o3 += 16; + o2 += 16; + o1 += 16; + o0 += 16; + i = (uint32_t*) ((uintptr_t) i + input_stride * 16); + } + if (bh & 8) { + o7 += 8; + o6 += 8; + o5 += 8; + o4 += 8; + o3 += 8; + o2 += 8; + o1 += 8; + o0 += 8; + i = (uint32_t*) ((uintptr_t) i + input_stride * 8); + } + if (bh & 4) { + o7 += 4; + o6 += 4; + o5 += 4; + o4 += 4; + o3 += 4; + o2 += 4; + o1 += 4; + o0 += 4; + i = (uint32_t*) ((uintptr_t) i + input_stride * 4); + } + if (bh & 2) { + o7 += 2; + o6 += 2; + o5 += 2; + o4 += 2; + o3 += 2; + o2 += 2; + o1 += 2; + o0 += 2; + i = (uint32_t*) ((uintptr_t) i + input_stride * 2); + } } i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); diff --git a/src/x32-transposec/gen/x32-transposec-4x4-rvv.c b/src/x32-transposec/gen/x32-transposec-4x4-rvv.c index 66233354cf7..cfde11ee401 100644 --- a/src/x32-transposec/gen/x32-transposec-4x4-rvv.c +++ b/src/x32-transposec/gen/x32-transposec-4x4-rvv.c @@ -16,13 +16,13 @@ #include void xnn_x32_transposec_ukernel__4x4_rvv( - const uint32_t* input, - uint32_t* output, - size_t input_stride, - size_t output_stride, - size_t block_width, - size_t block_height, - const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t output_stride, + size_t block_width, + size_t block_height, + const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint32_t)); assert(input_stride >= block_width * sizeof(uint32_t)); @@ -46,115 +46,115 @@ void xnn_x32_transposec_ukernel__4x4_rvv( 
size_t bh = block_height; size_t vl = __riscv_vsetvl_e32m1(tile_height); for (; bh >= 4; bh -= 4) { - if (block_width >= tile_width) { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + if (block_width >= tile_width) { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + + } else { + switch (block_width) { + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - - } else { - switch (block_width) { - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } + break; + } + + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } + + default: + XNN_UNREACHABLE; } + } - i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); - o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); - o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); - o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); - o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); + i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); + o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); + o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); + o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); + o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); } if (bh != 0) { - const uint32_t* i = i0; - vl = __riscv_vsetvl_e32m1(bh); - if (block_width >= tile_width) { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl); - - vuint32m1_t v_d0 
= __riscv_vget_v_u32m1x4_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - } else { - switch(block_width) { - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } - } + const uint32_t* i = i0; + vl = __riscv_vsetvl_e32m1(bh); + if (block_width >= tile_width) { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + } else { + switch(block_width) { + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + break; + } + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } - if (bh & 2) { - o3 += 2; - o2 += 2; - o1 += 2; - o0 += 2; - i = (uint32_t*) ((uintptr_t) i + input_stride * 2); + default: + XNN_UNREACHABLE; } + } + + if (bh & 2) { + o3 += 2; + o2 += 2; + o1 += 2; + o0 += 2; + i = (uint32_t*) ((uintptr_t) i + input_stride * 2); + } } i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); diff --git a/src/x32-transposec/gen/x32-transposec-8x8-rvv.c b/src/x32-transposec/gen/x32-transposec-8x8-rvv.c index e8fa4cd2ee4..e8a9b26547c 100644 --- a/src/x32-transposec/gen/x32-transposec-8x8-rvv.c +++ b/src/x32-transposec/gen/x32-transposec-8x8-rvv.c @@ -16,13 +16,13 @@ #include void xnn_x32_transposec_ukernel__8x8_rvv( - const uint32_t* input, - uint32_t* output, - size_t input_stride, - size_t output_stride, - size_t block_width, - size_t block_height, - const union 
xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t output_stride, + size_t block_width, + size_t block_height, + const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint32_t)); assert(input_stride >= block_width * sizeof(uint32_t)); @@ -50,282 +50,282 @@ void xnn_x32_transposec_ukernel__8x8_rvv( size_t bh = block_height; size_t vl = __riscv_vsetvl_e32m1(tile_height); for (; bh >= 8; bh -= 8) { - if (block_width >= tile_width) { - vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); + if (block_width >= tile_width) { + vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); + __riscv_vse32_v_u32m1(o6, v_d6, vl); + vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); + __riscv_vse32_v_u32m1(o7, v_d7, vl); + + } else { + switch (block_width) { + case 7: { + vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5); + vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6); + vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); __riscv_vse32_v_u32m1(o6, v_d6, vl); - vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7); - __riscv_vse32_v_u32m1(o7, v_d7, vl); - - } else { - switch (block_width) { - case 7: { - vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4); - 
__riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6); - __riscv_vse32_v_u32m1(o6, v_d6, vl); - break; - } - - case 6: { - vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); - __riscv_vse32_v_u32m1(o5, v_d5, vl); - break; - } - - case 5: { - vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); - __riscv_vse32_v_u32m1(o4, v_d4, vl); - break; - } - - case 4: { - vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); - __riscv_vse32_v_u32m1(o3, v_d3, vl); - break; - } - - case 3: { - vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - break; - } - - case 2: { - vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - break; - } - - case 1: { - vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - break; - } - - default: - XNN_UNREACHABLE; - } - } + break; + } + + case 6: { + vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + 
vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5); + __riscv_vse32_v_u32m1(o5, v_d5, vl); + break; + } + + case 5: { + vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4); + __riscv_vse32_v_u32m1(o4, v_d4, vl); + break; + } + + case 4: { + vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i0, input_stride, vl); - i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); - o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); - o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); - o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); - o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); - o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); - o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); - o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); - o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3); + __riscv_vse32_v_u32m1(o3, v_d3, vl); + break; + } + + case 3: { + vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2); + __riscv_vse32_v_u32m1(o2, v_d2, vl); + break; + } + + case 2: { + vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i0, input_stride, vl); + + vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1); + __riscv_vse32_v_u32m1(o1, v_d1, vl); + break; + } + + case 1: { + vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i0, input_stride, vl); + __riscv_vse32_v_u32m1(o0, v_d0, vl); + break; + } + + default: + XNN_UNREACHABLE; + } + } + + i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); + o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); + o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); + o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); + o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); + o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); + o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); + o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); + o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); } if (bh != 0) { - const uint32_t* i = i0; - vl = __riscv_vsetvl_e32m1(bh); - if (block_width >= tile_width) { - vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i, input_stride, vl); - - vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0); - __riscv_vse32_v_u32m1(o0, v_d0, vl); - vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1); - __riscv_vse32_v_u32m1(o1, v_d1, vl); - vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2); - __riscv_vse32_v_u32m1(o2, v_d2, vl); - vuint32m1_t v_d3 = 
-          vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3);
-          __riscv_vse32_v_u32m1(o3, v_d3, vl);
-          vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4);
-          __riscv_vse32_v_u32m1(o4, v_d4, vl);
-          vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5);
-          __riscv_vse32_v_u32m1(o5, v_d5, vl);
-          vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6);
-          __riscv_vse32_v_u32m1(o6, v_d6, vl);
-          vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7);
-          __riscv_vse32_v_u32m1(o7, v_d7, vl);
-        } else {
-          switch(block_width) {
-            case 7: {
-              vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i, input_stride, vl);
-
-              vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1);
-              __riscv_vse32_v_u32m1(o1, v_d1, vl);
-              vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2);
-              __riscv_vse32_v_u32m1(o2, v_d2, vl);
-              vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3);
-              __riscv_vse32_v_u32m1(o3, v_d3, vl);
-              vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4);
-              __riscv_vse32_v_u32m1(o4, v_d4, vl);
-              vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5);
-              __riscv_vse32_v_u32m1(o5, v_d5, vl);
-              vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6);
-              __riscv_vse32_v_u32m1(o6, v_d6, vl);
-              break;
-            }
-            case 6: {
-              vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i, input_stride, vl);
-
-              vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1);
-              __riscv_vse32_v_u32m1(o1, v_d1, vl);
-              vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2);
-              __riscv_vse32_v_u32m1(o2, v_d2, vl);
-              vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3);
-              __riscv_vse32_v_u32m1(o3, v_d3, vl);
-              vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4);
-              __riscv_vse32_v_u32m1(o4, v_d4, vl);
-              vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5);
-              __riscv_vse32_v_u32m1(o5, v_d5, vl);
-              break;
-            }
-            case 5: {
-              vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i, input_stride, vl);
-
-              vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1);
-              __riscv_vse32_v_u32m1(o1, v_d1, vl);
-              vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2);
-              __riscv_vse32_v_u32m1(o2, v_d2, vl);
-              vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3);
-              __riscv_vse32_v_u32m1(o3, v_d3, vl);
-              vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4);
-              __riscv_vse32_v_u32m1(o4, v_d4, vl);
-              break;
-            }
-            case 4: {
-              vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl);
-
-              vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1);
-              __riscv_vse32_v_u32m1(o1, v_d1, vl);
-              vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2);
-              __riscv_vse32_v_u32m1(o2, v_d2, vl);
-              vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3);
-              __riscv_vse32_v_u32m1(o3, v_d3, vl);
-              break;
-            }
-            case 3: {
-              vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl);
-
-              vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1);
-              __riscv_vse32_v_u32m1(o1, v_d1, vl);
-              vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2);
-              __riscv_vse32_v_u32m1(o2, v_d2, vl);
-              break;
-            }
-            case 2: {
-              vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl);
-
-              vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1);
-              __riscv_vse32_v_u32m1(o1, v_d1, vl);
-              break;
-            }
-
-            case 1: {
-              vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl);
-              __riscv_vse32_v_u32m1(o0, v_d0, vl);
-              break;
-            }
-
-            default:
-              XNN_UNREACHABLE;
-          }
-        }
+    const uint32_t* i = i0;
+    vl = __riscv_vsetvl_e32m1(bh);
+    if (block_width >= tile_width) {
+      vuint32m1x8_t tuple = __riscv_vlsseg8e32_v_u32m1x8(i, input_stride, vl);
+
+      vuint32m1_t v_d0 = __riscv_vget_v_u32m1x8_u32m1(tuple, 0);
+      __riscv_vse32_v_u32m1(o0, v_d0, vl);
+      vuint32m1_t v_d1 = __riscv_vget_v_u32m1x8_u32m1(tuple, 1);
+      __riscv_vse32_v_u32m1(o1, v_d1, vl);
+      vuint32m1_t v_d2 = __riscv_vget_v_u32m1x8_u32m1(tuple, 2);
+      __riscv_vse32_v_u32m1(o2, v_d2, vl);
+      vuint32m1_t v_d3 = __riscv_vget_v_u32m1x8_u32m1(tuple, 3);
+      __riscv_vse32_v_u32m1(o3, v_d3, vl);
+      vuint32m1_t v_d4 = __riscv_vget_v_u32m1x8_u32m1(tuple, 4);
+      __riscv_vse32_v_u32m1(o4, v_d4, vl);
+      vuint32m1_t v_d5 = __riscv_vget_v_u32m1x8_u32m1(tuple, 5);
+      __riscv_vse32_v_u32m1(o5, v_d5, vl);
+      vuint32m1_t v_d6 = __riscv_vget_v_u32m1x8_u32m1(tuple, 6);
+      __riscv_vse32_v_u32m1(o6, v_d6, vl);
+      vuint32m1_t v_d7 = __riscv_vget_v_u32m1x8_u32m1(tuple, 7);
+      __riscv_vse32_v_u32m1(o7, v_d7, vl);
+    } else {
+      switch(block_width) {
+        case 7: {
+          vuint32m1x7_t tuple = __riscv_vlsseg7e32_v_u32m1x7(i, input_stride, vl);
+
+          vuint32m1_t v_d0 = __riscv_vget_v_u32m1x7_u32m1(tuple, 0);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          vuint32m1_t v_d1 = __riscv_vget_v_u32m1x7_u32m1(tuple, 1);
+          __riscv_vse32_v_u32m1(o1, v_d1, vl);
+          vuint32m1_t v_d2 = __riscv_vget_v_u32m1x7_u32m1(tuple, 2);
+          __riscv_vse32_v_u32m1(o2, v_d2, vl);
+          vuint32m1_t v_d3 = __riscv_vget_v_u32m1x7_u32m1(tuple, 3);
+          __riscv_vse32_v_u32m1(o3, v_d3, vl);
+          vuint32m1_t v_d4 = __riscv_vget_v_u32m1x7_u32m1(tuple, 4);
+          __riscv_vse32_v_u32m1(o4, v_d4, vl);
+          vuint32m1_t v_d5 = __riscv_vget_v_u32m1x7_u32m1(tuple, 5);
+          __riscv_vse32_v_u32m1(o5, v_d5, vl);
+          vuint32m1_t v_d6 = __riscv_vget_v_u32m1x7_u32m1(tuple, 6);
+          __riscv_vse32_v_u32m1(o6, v_d6, vl);
+          break;
+        }
+        case 6: {
+          vuint32m1x6_t tuple = __riscv_vlsseg6e32_v_u32m1x6(i, input_stride, vl);

-    if (bh & 4) {
-      o7 += 4;
-      o6 += 4;
-      o5 += 4;
-      o4 += 4;
-      o3 += 4;
-      o2 += 4;
-      o1 += 4;
-      o0 += 4;
-      i = (uint32_t*) ((uintptr_t) i + input_stride * 4);
-    }
-    if (bh & 2) {
-      o7 += 2;
-      o6 += 2;
-      o5 += 2;
-      o4 += 2;
-      o3 += 2;
-      o2 += 2;
-      o1 += 2;
-      o0 += 2;
-      i = (uint32_t*) ((uintptr_t) i + input_stride * 2);
+          vuint32m1_t v_d0 = __riscv_vget_v_u32m1x6_u32m1(tuple, 0);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          vuint32m1_t v_d1 = __riscv_vget_v_u32m1x6_u32m1(tuple, 1);
+          __riscv_vse32_v_u32m1(o1, v_d1, vl);
+          vuint32m1_t v_d2 = __riscv_vget_v_u32m1x6_u32m1(tuple, 2);
+          __riscv_vse32_v_u32m1(o2, v_d2, vl);
+          vuint32m1_t v_d3 = __riscv_vget_v_u32m1x6_u32m1(tuple, 3);
+          __riscv_vse32_v_u32m1(o3, v_d3, vl);
+          vuint32m1_t v_d4 = __riscv_vget_v_u32m1x6_u32m1(tuple, 4);
+          __riscv_vse32_v_u32m1(o4, v_d4, vl);
+          vuint32m1_t v_d5 = __riscv_vget_v_u32m1x6_u32m1(tuple, 5);
+          __riscv_vse32_v_u32m1(o5, v_d5, vl);
+          break;
+        }
+        case 5: {
+          vuint32m1x5_t tuple = __riscv_vlsseg5e32_v_u32m1x5(i, input_stride, vl);
+
+          vuint32m1_t v_d0 = __riscv_vget_v_u32m1x5_u32m1(tuple, 0);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          vuint32m1_t v_d1 = __riscv_vget_v_u32m1x5_u32m1(tuple, 1);
+          __riscv_vse32_v_u32m1(o1, v_d1, vl);
+          vuint32m1_t v_d2 = __riscv_vget_v_u32m1x5_u32m1(tuple, 2);
+          __riscv_vse32_v_u32m1(o2, v_d2, vl);
+          vuint32m1_t v_d3 = __riscv_vget_v_u32m1x5_u32m1(tuple, 3);
+          __riscv_vse32_v_u32m1(o3, v_d3, vl);
+          vuint32m1_t v_d4 = __riscv_vget_v_u32m1x5_u32m1(tuple, 4);
+          __riscv_vse32_v_u32m1(o4, v_d4, vl);
+          break;
+        }
+        case 4: {
+          vuint32m1x4_t tuple = __riscv_vlsseg4e32_v_u32m1x4(i, input_stride, vl);
+
+          vuint32m1_t v_d0 = __riscv_vget_v_u32m1x4_u32m1(tuple, 0);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          vuint32m1_t v_d1 = __riscv_vget_v_u32m1x4_u32m1(tuple, 1);
+          __riscv_vse32_v_u32m1(o1, v_d1, vl);
+          vuint32m1_t v_d2 = __riscv_vget_v_u32m1x4_u32m1(tuple, 2);
+          __riscv_vse32_v_u32m1(o2, v_d2, vl);
+          vuint32m1_t v_d3 = __riscv_vget_v_u32m1x4_u32m1(tuple, 3);
+          __riscv_vse32_v_u32m1(o3, v_d3, vl);
+          break;
+        }
+        case 3: {
+          vuint32m1x3_t tuple = __riscv_vlsseg3e32_v_u32m1x3(i, input_stride, vl);
+
+          vuint32m1_t v_d0 = __riscv_vget_v_u32m1x3_u32m1(tuple, 0);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          vuint32m1_t v_d1 = __riscv_vget_v_u32m1x3_u32m1(tuple, 1);
+          __riscv_vse32_v_u32m1(o1, v_d1, vl);
+          vuint32m1_t v_d2 = __riscv_vget_v_u32m1x3_u32m1(tuple, 2);
+          __riscv_vse32_v_u32m1(o2, v_d2, vl);
+          break;
+        }
+        case 2: {
+          vuint32m1x2_t tuple = __riscv_vlsseg2e32_v_u32m1x2(i, input_stride, vl);
+
+          vuint32m1_t v_d0 = __riscv_vget_v_u32m1x2_u32m1(tuple, 0);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          vuint32m1_t v_d1 = __riscv_vget_v_u32m1x2_u32m1(tuple, 1);
+          __riscv_vse32_v_u32m1(o1, v_d1, vl);
+          break;
+        }
+
+        case 1: {
+          vuint32m1_t v_d0 = __riscv_vlse32_v_u32m1(i, input_stride, vl);
+          __riscv_vse32_v_u32m1(o0, v_d0, vl);
+          break;
+        }
+
+        default:
+          XNN_UNREACHABLE;
       }
+    }
+
+    if (bh & 4) {
+      o7 += 4;
+      o6 += 4;
+      o5 += 4;
+      o4 += 4;
+      o3 += 4;
+      o2 += 4;
+      o1 += 4;
+      o0 += 4;
+      i = (uint32_t*) ((uintptr_t) i + input_stride * 4);
+    }
+    if (bh & 2) {
+      o7 += 2;
+      o6 += 2;
+      o5 += 2;
+      o4 += 2;
+      o3 += 2;
+      o2 += 2;
+      o1 += 2;
+      o0 += 2;
+      i = (uint32_t*) ((uintptr_t) i + input_stride * 2);
+    }
   }
   i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset);
diff --git a/src/x32-transposec/rvv.c.in b/src/x32-transposec/rvv.c.in
index 600610cda92..aeb9e47361f 100644
--- a/src/x32-transposec/rvv.c.in
+++ b/src/x32-transposec/rvv.c.in
@@ -19,13 +19,13 @@ $NUM_ITERS = int(math.log2(TILE_HEIGHT))
 #include <riscv_vector.h>

 void xnn_x${SIZE}_transposec_ukernel__${TILE_HEIGHT}x${TILE_WIDTH}_rvv(
-  const uint${SIZE}_t* input,
-  uint${SIZE}_t* output,
-  size_t input_stride,
-  size_t output_stride,
-  size_t block_width,
-  size_t block_height,
-  const union xnn_x${SIZE}_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const uint${SIZE}_t* input,
+    uint${SIZE}_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height,
+    const union xnn_x${SIZE}_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
   assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
@@ -48,79 +48,79 @@ void xnn_x${SIZE}_transposec_ukernel__${TILE_HEIGHT}x${TILE_WIDTH}_rvv(
   size_t bh = block_height;
   size_t vl = __riscv_vsetvl_e${SIZE}m${LMUL}(tile_height);
   for (; bh >= ${TILE_HEIGHT}; bh -= ${TILE_HEIGHT}) {
-      if (block_width >= tile_width) {
-        vuint${SIZE}m${LMUL}x${TILE_WIDTH}_t tuple = __riscv_vlsseg${TILE_WIDTH}e${SIZE}_v_u${SIZE}m${LMUL}x${TILE_WIDTH}(i0, input_stride, vl);
-
-        $for N in range(TILE_WIDTH):
-          vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${TILE_WIDTH}_u${SIZE}m${LMUL}(tuple, ${N});
-          __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
-
-      } else {
-        switch (block_width) {
-          $for M in reversed(range(2, TILE_WIDTH)):
-            case ${M}: {
-              vuint${SIZE}m${LMUL}x${M}_t tuple = __riscv_vlsseg${M}e${SIZE}_v_u${SIZE}m${LMUL}x${M}(i0, input_stride, vl);
-
-              $for N in range(M):
-                vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${M}_u${SIZE}m${LMUL}(tuple, ${N});
-                __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
-              break;
-            }\n
-          case 1: {
-            vuint32m${LMUL}_t v_d0 = __riscv_vlse32_v_u32m${LMUL}(i0, input_stride, vl);
-            __riscv_vse32_v_u32m${LMUL}(o0, v_d0, vl);
-            break;
-          }
-
-          default:
-            XNN_UNREACHABLE;
-        }
+    if (block_width >= tile_width) {
+      vuint${SIZE}m${LMUL}x${TILE_WIDTH}_t tuple = __riscv_vlsseg${TILE_WIDTH}e${SIZE}_v_u${SIZE}m${LMUL}x${TILE_WIDTH}(i0, input_stride, vl);
+
+      $for N in range(TILE_WIDTH):
+        vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${TILE_WIDTH}_u${SIZE}m${LMUL}(tuple, ${N});
+        __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
+
+    } else {
+      switch (block_width) {
+        $for M in reversed(range(2, TILE_WIDTH)):
+          case ${M}: {
+            vuint${SIZE}m${LMUL}x${M}_t tuple = __riscv_vlsseg${M}e${SIZE}_v_u${SIZE}m${LMUL}x${M}(i0, input_stride, vl);
+
+            $for N in range(M):
+              vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${M}_u${SIZE}m${LMUL}(tuple, ${N});
+              __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
+            break;
+          }\n
+        case 1: {
+          vuint32m${LMUL}_t v_d0 = __riscv_vlse32_v_u32m${LMUL}(i0, input_stride, vl);
+          __riscv_vse32_v_u32m${LMUL}(o0, v_d0, vl);
+          break;
+        }
+
+        default:
+          XNN_UNREACHABLE;
       }
+    }

-      i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_offset);
-      $for N in reversed(range(TILE_WIDTH)):
-        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
+    i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_offset);
+    $for N in reversed(range(TILE_WIDTH)):
+      o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
   }
   if (bh != 0) {
-      const uint${SIZE}_t* i = i0;
-      vl = __riscv_vsetvl_e${SIZE}m${LMUL}(bh);
-      if (block_width >= tile_width) {
-        vuint${SIZE}m${LMUL}x${TILE_WIDTH}_t tuple = __riscv_vlsseg${TILE_WIDTH}e${SIZE}_v_u${SIZE}m${LMUL}x${TILE_WIDTH}(i, input_stride, vl);
-
-        $for N in range(TILE_WIDTH):
-          vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${TILE_WIDTH}_u${SIZE}m${LMUL}(tuple, ${N});
-          __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
-      } else {
-        switch(block_width) {
-          $for M in reversed(range(2, TILE_WIDTH)):
-            case ${M}: {
-              vuint${SIZE}m${LMUL}x${M}_t tuple = __riscv_vlsseg${M}e${SIZE}_v_u${SIZE}m${LMUL}x${M}(i, input_stride, vl);
-
-              $for N in range(0, M):
-                vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${M}_u${SIZE}m${LMUL}(tuple, ${N});
-                __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
-              break;
-            }
-
-          case 1: {
-            vuint32m${LMUL}_t v_d0 = __riscv_vlse32_v_u32m${LMUL}(i, input_stride, vl);
-            __riscv_vse32_v_u32m${LMUL}(o0, v_d0, vl);
-            break;
-          }
-
-          default:
-            XNN_UNREACHABLE;
-        }
-      }
-
-      $for M in range(1, NUM_ITERS + 1):
-        $if (TILE_HEIGHT>>M) > 1:
-          if (bh & ${TILE_HEIGHT>>M}) {
-            $for N in reversed(range(TILE_WIDTH)):
-              o${N} += ${TILE_HEIGHT>>M};
-            i = (uint${SIZE}_t*) ((uintptr_t) i + input_stride * ${TILE_HEIGHT>>M});
+    const uint${SIZE}_t* i = i0;
+    vl = __riscv_vsetvl_e${SIZE}m${LMUL}(bh);
+    if (block_width >= tile_width) {
+      vuint${SIZE}m${LMUL}x${TILE_WIDTH}_t tuple = __riscv_vlsseg${TILE_WIDTH}e${SIZE}_v_u${SIZE}m${LMUL}x${TILE_WIDTH}(i, input_stride, vl);
+
+      $for N in range(TILE_WIDTH):
+        vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${TILE_WIDTH}_u${SIZE}m${LMUL}(tuple, ${N});
+        __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
+    } else {
+      switch(block_width) {
+        $for M in reversed(range(2, TILE_WIDTH)):
+          case ${M}: {
+            vuint${SIZE}m${LMUL}x${M}_t tuple = __riscv_vlsseg${M}e${SIZE}_v_u${SIZE}m${LMUL}x${M}(i, input_stride, vl);
+
+            $for N in range(0, M):
+              vuint${SIZE}m${LMUL}_t v_d${N} = __riscv_vget_v_u${SIZE}m${LMUL}x${M}_u${SIZE}m${LMUL}(tuple, ${N});
+              __riscv_vse${SIZE}_v_u${SIZE}m${LMUL}(o${N}, v_d${N}, vl);
+            break;
+          }
+
+        case 1: {
+          vuint32m${LMUL}_t v_d0 = __riscv_vlse32_v_u32m${LMUL}(i, input_stride, vl);
+          __riscv_vse32_v_u32m${LMUL}(o0, v_d0, vl);
+          break;
+        }
+
+        default:
+          XNN_UNREACHABLE;
+      }
+    }
+
+    $for M in range(1, NUM_ITERS + 1):
+      $if (TILE_HEIGHT>>M) > 1:
+        if (bh & ${TILE_HEIGHT>>M}) {
+          $for N in reversed(range(TILE_WIDTH)):
+            o${N} += ${TILE_HEIGHT>>M};
+          i = (uint${SIZE}_t*) ((uintptr_t) i + input_stride * ${TILE_HEIGHT>>M});
+        }
   }
   i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);