Skip to content

Commit

Permalink
convert: add avx512 version of rfc4175_422be10_to_yuv420p8 (#713)
Browse files Browse the repository at this point in the history
Signed-off-by: Frank Du <[email protected]>
  • Loading branch information
frankdjx authored Jan 19, 2024
1 parent 71f8789 commit feb046b
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
* pipeline: add block get mode support, see `ST20P_TX_FLAG_BLOCK_GET`/`ST20P_RX_FLAG_BLOCK_GET`/`ST22P_TX_FLAG_BLOCK_GET`/`ST22P_RX_FLAG_BLOCK_GET`.
* rx/timing_parser: add support to export the timing_parser to app, see `app/sample/rx_st20p_timing_parser_sample.c` for usage.
* st40: add interlaced support.
* cvt: add st20_rfc4175_422be10_to_yuv422p8 with avx512
* cvt: add st20_rfc4175_422be10_to_yuv420p8 with avx512

## Changelog for 23.12

Expand Down
16 changes: 16 additions & 0 deletions doc/convert.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ For detailed API usage, please refer to [st_convert_api.h](../include/st_convert
| rfc4175_422le10 | v210 | &#x2705; | | &#x2705; | &#x2705; |
| rfc4175_422le10 | rfc4175_422be10 | &#x2705; | | &#x2705; | &#x2705; |
| rfc4175_422le10 | yuv422p10le | &#x2705; | | | |
| rfc4175_422be10 | yuv422p8 | &#x2705; | | &#x2705; | |
| rfc4175_422be10 | yuv420p8 | &#x2705; | | &#x2705; | |
| yuv422p10le | rfc4175_422be10 | &#x2705; | | &#x2705; | |
| yuv422p10le | rfc4175_422le10 | &#x2705; | | | |
| v210 | rfc4175_422be10 | &#x2705; | | &#x2705; | &#x2705; |
Expand Down Expand Up @@ -323,6 +325,20 @@ Y210 (1 pixel group)
| 0x4 | 0x5 | 0x6 | 0x7 |
```

### yuv422p8

Color space: YUV<br>
Sample: 422<br>
Packed/planar: planar<br>
Depth: 8<br>

### yuv420p8

Color space: YUV<br>
Sample: 420<br>
Packed/planar: planar<br>
Depth: 8<br>

### yuv422p12le

Color space: YUV<br>
Expand Down
120 changes: 120 additions & 0 deletions lib/src/st2110/st_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,126 @@ int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* p
return 0;
}

/**
 * Unpack 16 consecutive rfc4175_422be10 pixel groups into four 128-bit vectors.
 *
 * Each output vector is assembled from 4 pixel groups (two masked 10-byte loads
 * of 2 pixel groups each) and is arranged as "uuuu vvvv yyyy yyyy": the low
 * 64 bits hold the chroma bytes, the high 64 bits hold the luma bytes.
 *
 * @param pg
 *   First pixel group to read, 16 groups are consumed.
 * @param shuffle_mask
 *   Vector loaded from be10_to_le8_shuffle0_tbl_128.
 * @param sllv_mask
 *   Vector loaded from be10_to_le8_sllv_tbl_128.
 * @param sllv_shuffle_mask
 *   Vector loaded from be10_to_le8_shuffle1_tbl_128.
 * @param uyvy2uvyy_mask
 *   Vector loaded from p8_uyvy2uvyy_mask.
 * @param uvyy
 *   Output, one vector per 4 pixel groups.
 * @return
 *   pg advanced past the 16 consumed pixel groups.
 */
static inline struct st20_rfc4175_422_10_pg2_be* be10_p8_unpack_16pg(
    struct st20_rfc4175_422_10_pg2_be* pg, __m128i shuffle_mask, __m128i sllv_mask,
    __m128i sllv_shuffle_mask, __m128i uyvy2uvyy_mask, __m128i uvyy[4]) {
  const __mmask16 k = 0x3FF; /* each __m128i with 2 pg group, 10 bytes */

  for (int step = 0; step < 4; step++) {
    __m128i uyvy_half[2];

    for (int half = 0; half < 2; half++) {
      __m128i input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
      pg += 2;
      __m128i shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
      __m128i sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
      /* uyvy uyvy .... .... */
      uyvy_half[half] = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
    }

    /* uyvy uyvy uyvy uyvy */
    __m128i uyvy = _mm_unpacklo_epi64(uyvy_half[0], uyvy_half[1]);
    /* uuuu vvvv yyyy yyyy */
    uvyy[step] = _mm_shuffle_epi8(uyvy, uyvy2uvyy_mask);
  }

  return pg;
}

/**
 * Store the 32 luma bytes held in the high halves of four "uvyy" vectors.
 *
 * @param uvyy
 *   The four vectors produced by be10_p8_unpack_16pg.
 * @param y
 *   Destination, 32 bytes are written with unaligned stores.
 */
static inline void be10_p8_store_y32(const __m128i uvyy[4], uint8_t* y) {
  __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
  _mm_storeu_si128((__m128i*)y, y0y1y2y3);
  __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
  _mm_storeu_si128((__m128i*)(y + 16), y4y5y6y7);
}

/**
 * Convert rfc4175_422be10 pixel groups to planar 8-bit 4:2:0 using AVX512.
 *
 * Lines are processed in pairs. For the first line of a pair both luma and
 * chroma are written; for the second line only luma is written and its chroma
 * samples are dropped. If h is odd the trailing line is not converted because
 * only h / 2 pairs are processed.
 *
 * @param pg
 *   Source pixel groups, w / 2 groups per line.
 * @param y
 *   Destination luma plane, 2 bytes are written per pixel group.
 * @param b
 *   Destination Cb plane, 1 byte per pixel group of every first line.
 * @param r
 *   Destination Cr plane, 1 byte per pixel group of every first line.
 * @param w
 *   Width in pixels.
 * @param h
 *   Height in lines.
 * @return
 *   Always 0.
 */
int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
                                            uint8_t* y, uint8_t* b, uint8_t* r,
                                            uint32_t w, uint32_t h) {
  __m128i shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle0_tbl_128);
  __m128i sllv_mask = _mm_loadu_si128((__m128i*)be10_to_le8_sllv_tbl_128);
  __m128i sllv_shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle1_tbl_128);
  __m128i uyvy2uvyy_mask = _mm_loadu_si128((__m128i*)p8_uyvy2uvyy_mask);

  uint32_t line_pg_cnt = w / 2; /* one pixel group carries two pixels */

  for (uint32_t i = 0; i < (h / 2); i++) { /* 2 lines each loop */
    uint32_t pg_cnt;

    /* first line: y, b and r, 16 pixel groups per SIMD iteration */
    for (pg_cnt = line_pg_cnt; pg_cnt >= 16; pg_cnt -= 16) {
      __m128i uvyy[4];

      pg = be10_p8_unpack_16pg(pg, shuffle_mask, sllv_mask, sllv_shuffle_mask,
                               uyvy2uvyy_mask, uvyy);

      /* merge all u v y from u0v0y0y1, u1v1y2y3, u2v2y4y5, u3v3y6y7 */
      /* merge y */
      be10_p8_store_y32(uvyy, y);
      y += 32;
      /* merge b and r */
      __m128i u0v0u1v1 = _mm_unpacklo_epi64(uvyy[0], uvyy[1]);
      __m128i u2v2u3v3 = _mm_unpacklo_epi64(uvyy[2], uvyy[3]);
      __m128i u0u2v0v2 = _mm_unpacklo_epi32(u0v0u1v1, u2v2u3v3);
      __m128i u1u3v1v3 = _mm_unpackhi_epi32(u0v0u1v1, u2v2u3v3);
      __m128i u0u1u2u3 = _mm_unpacklo_epi32(u0u2v0v2, u1u3v1v3);
      _mm_storeu_si128((__m128i*)b, u0u1u2u3);
      b += 16;
      __m128i v0v1v2v3 = _mm_unpackhi_epi32(u0u2v0v2, u1u3v1v3);
      _mm_storeu_si128((__m128i*)r, v0v1v2v3);
      r += 16;
    }
    /* scalar remainder of the first line, fewer than 16 pixel groups */
    for (; pg_cnt > 0; pg_cnt--) {
      *b++ = pg->Cb00;
      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
      *r++ = (pg->Cr00 << 4) | (pg->Cr00_ >> 2);
      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
      pg++;
    }

    /* second line, no u and v */
    for (pg_cnt = line_pg_cnt; pg_cnt >= 16; pg_cnt -= 16) {
      __m128i uvyy[4];

      pg = be10_p8_unpack_16pg(pg, shuffle_mask, sllv_mask, sllv_shuffle_mask,
                               uyvy2uvyy_mask, uvyy);

      /* only the luma halves are kept */
      be10_p8_store_y32(uvyy, y);
      y += 32;
    }
    /* scalar remainder of the second line */
    for (; pg_cnt > 0; pg_cnt--) {
      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
      pg++;
    }
  }

  return 0;
}

/* begin st20_rfc4175_422le10_to_v210_avx512 */
static uint8_t le10_to_v210_shuffle_r_tbl_128[16] = {
0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
Expand Down
6 changes: 5 additions & 1 deletion lib/src/st2110/st_avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,11 @@ int st20_rfc4175_422be12_to_yuv422p12le_avx512_dma(
struct mtl_dma_lender_dev* dma, struct st20_rfc4175_422_12_pg2_be* pg_be,
mtl_iova_t pg_be_iova, uint16_t* y, uint16_t* b, uint16_t* r, uint32_t w, uint32_t h);

int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg_10,
/*
 * AVX512 variant of the rfc4175_422be10 to yuv422p8 conversion.
 * NOTE(review): the implementation is not visible from this declaration;
 * presumably y/b/r are the destination planes and w/h the frame size in
 * pixels/lines - confirm against the definition in st_avx512.c.
 */
int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
uint8_t* y, uint8_t* b, uint8_t* r,
uint32_t w, uint32_t h);

/*
 * AVX512 variant of the rfc4175_422be10 to yuv420p8 conversion.
 * Reads w / 2 pixel groups per line from pg and processes h / 2 line pairs:
 * luma of both lines goes to y, while b (Cb) and r (Cr) only receive the
 * chroma of the first line of each pair. Returns 0.
 */
int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
uint8_t* y, uint8_t* b, uint8_t* r,
uint32_t w, uint32_t h);

Expand Down
9 changes: 9 additions & 0 deletions lib/src/st2110/st_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -1396,6 +1396,15 @@ int st20_rfc4175_422be10_to_yuv420p8_simd(struct st20_rfc4175_422_10_pg2_be* pg,
MTL_MAY_UNUSED(level);
MTL_MAY_UNUSED(ret);

#ifdef MTL_HAS_AVX512
if ((level >= MTL_SIMD_LEVEL_AVX512) && (cpu_level >= MTL_SIMD_LEVEL_AVX512)) {
dbg("%s, avx512 ways\n", __func__);
ret = st20_rfc4175_422be10_to_yuv420p8_avx512(pg, y, b, r, w, h);
if (ret == 0) return 0;
dbg("%s, avx512 ways failed\n", __func__);
}
#endif

/* the last option */
return st20_rfc4175_422be10_to_yuv420p8_scalar(pg, y, b, r, w, h);
}
Expand Down
36 changes: 36 additions & 0 deletions tests/src/cvt_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,42 @@ TEST(Cvt, rfc4175_422be10_to_yuv422p8_avx512_vbmi) {
}
}

/*
 * Convert the same random rfc4175_422be10 frame to yuv420p8 twice, once with
 * MTL_SIMD_LEVEL_NONE and once with MTL_SIMD_LEVEL_AVX512, and expect both
 * output frames to be byte-identical.
 *
 * w and h are the frame width and height in pixels/lines.
 */
static void test_cvt_rfc4175_422be10_to_yuv420p8(int w, int h) {
  int ret;
  /* every size and plane offset is derived in size_t, so large frames cannot
   * overflow int arithmetic when the plane pointers are computed */
  size_t y_plane_size = (size_t)w * h;
  size_t fb_pg2_size_10 = y_plane_size * 5 / 2;
  size_t fb_yuv420p8_size = y_plane_size * 3 / 2;
  size_t b_offset = y_plane_size;         /* b plane follows the y plane */
  size_t r_offset = y_plane_size * 5 / 4; /* r plane follows the quarter-size b plane */

  struct st20_rfc4175_422_10_pg2_be* pg_10 =
      (struct st20_rfc4175_422_10_pg2_be*)st_test_zmalloc(fb_pg2_size_10);
  uint8_t* p8 = (uint8_t*)st_test_zmalloc(fb_yuv420p8_size);
  uint8_t* p8_2 = (uint8_t*)st_test_zmalloc(fb_yuv420p8_size);

  if (!pg_10 || !p8 || !p8_2) {
    /* allocation failure: flag the test as failed and release what we got */
    EXPECT_EQ(0, 1);
    if (pg_10) st_test_free(pg_10);
    if (p8) st_test_free(p8);
    if (p8_2) st_test_free(p8_2);
    return;
  }

  st_test_rand_data((uint8_t*)pg_10, fb_pg2_size_10, 0);

  /* reference result without SIMD */
  ret = st20_rfc4175_422be10_to_yuv420p8_simd(pg_10, p8, p8 + b_offset, p8 + r_offset, w,
                                              h, MTL_SIMD_LEVEL_NONE);
  EXPECT_EQ(0, ret);

  /* result with the AVX512 level requested */
  ret = st20_rfc4175_422be10_to_yuv420p8_simd(pg_10, p8_2, p8_2 + b_offset,
                                              p8_2 + r_offset, w, h,
                                              MTL_SIMD_LEVEL_AVX512);
  EXPECT_EQ(0, ret);

  EXPECT_EQ(0, memcmp(p8, p8_2, fb_yuv420p8_size));

  st_test_free(pg_10);
  st_test_free(p8);
  st_test_free(p8_2);
}

/* Scalar vs AVX512 comparison for rfc4175_422be10 to yuv420p8. */
TEST(Cvt, rfc4175_422be10_to_yuv420p8) {
  /* 960 pixel groups per line, a multiple of 16 */
  test_cvt_rfc4175_422be10_to_yuv420p8(1920, 1080);
  /* 361 pixel groups per line, not a multiple of 16, so any per-line
   * remainder handling is covered as well */
  test_cvt_rfc4175_422be10_to_yuv420p8(722, 112);
}

static void test_cvt_rfc4175_422le10_to_v210(int w, int h, enum mtl_simd_level cvt_level,
enum mtl_simd_level back_level) {
int ret;
Expand Down

0 comments on commit feb046b

Please sign in to comment.