Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

convert: add avx512 version of rfc4175_422be10_to_yuv420p8 #713

Merged
merged 1 commit into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
* pipeline: add block get mode support, see `ST20P_TX_FLAG_BLOCK_GET`/`ST20P_RX_FLAG_BLOCK_GET`/`ST22P_TX_FLAG_BLOCK_GET`/`ST22P_RX_FLAG_BLOCK_GET`.
* rx/timing_parser: add support to export the timing_parser to app, see `app/sample/rx_st20p_timing_parser_sample.c` for usage.
* st40: add interlaced support.
* cvt: add st20_rfc4175_422be10_to_yuv422p8 with avx512
* cvt: add st20_rfc4175_422be10_to_yuv420p8 with avx512

## Changelog for 23.12

Expand Down
16 changes: 16 additions & 0 deletions doc/convert.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ For detailed API usage, please refer to [st_convert_api.h](../include/st_convert
| rfc4175_422le10 | v210 | ✅ | | ✅ | ✅ |
| rfc4175_422le10 | rfc4175_422be10 | ✅ | | ✅ | ✅ |
| rfc4175_422le10 | yuv422p10le | ✅ | | | |
| rfc4175_422be10 | yuv422p8 | ✅ | | ✅ | |
| rfc4175_422be10 | yuv420p8 | ✅ | | ✅ | |
| yuv422p10le | rfc4175_422be10 | ✅ | | ✅ | |
| yuv422p10le | rfc4175_422le10 | ✅ | | | |
| v210 | rfc4175_422be10 | ✅ | | ✅ | ✅ |
Expand Down Expand Up @@ -323,6 +325,20 @@ Y210 (1 pixel group)
| 0x4 | 0x5 | 0x6 | 0x7 |
```

### yuv422p8

Color space: YUV<br>
Sample: 422<br>
Packed/planar: planar<br>
Depth: 8<br>

### yuv420p8

Color space: YUV<br>
Sample: 420<br>
Packed/planar: planar<br>
Depth: 8<br>

### yuv422p12le

Color space: YUV<br>
Expand Down
120 changes: 120 additions & 0 deletions lib/src/st2110/st_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,126 @@ int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* p
return 0;
}

/*
 * Convert four consecutive rfc4175 422 BE10 pixel groups, starting at pg, into one
 * 128-bit vector of 8-bit samples.
 *
 * Two masked loads of 10 bytes (2 pixel groups) each are reduced to 8 bits per
 * sample with the shuffle/shift tables supplied by the caller, then merged and
 * regrouped with uyvy2uvyy_mask. Following the lane comments below, the result is
 * laid out as "uuuu vvvv yyyy yyyy": 4 Cb bytes, 4 Cr bytes, 8 Y bytes.
 *
 * The caller is responsible for advancing pg by 4 afterwards.
 */
static inline __m128i be10_4pg_to_uvyy8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
                                               __m128i shuffle_mask, __m128i sllv_mask,
                                               __m128i sllv_shuffle_mask,
                                               __m128i uyvy2uvyy_mask) {
  const __mmask16 k = 0x3FF; /* each __m128i with 2 pg group, 10 bytes */

  /* pixel groups 0 and 1 */
  __m128i input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
  __m128i shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
  __m128i sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
  /* uyvy uyvy .... .... */
  __m128i uyvy_t1 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);

  /* pixel groups 2 and 3 */
  input = _mm_maskz_loadu_epi8(k, (__m128i*)(pg + 2));
  shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
  sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
  /* uyvy uyvy .... .... */
  __m128i uyvy_t2 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);

  /* uyvy uyvy uyvy uyvy */
  __m128i uyvy = _mm_unpacklo_epi64(uyvy_t1, uyvy_t2);
  /* uuuu vvvv yyyy yyyy */
  return _mm_shuffle_epi8(uyvy, uyvy2uvyy_mask);
}

/*
 * Convert a rfc4175 422 BE10 frame to planar 8-bit 4:2:0 with AVX512 masked loads.
 *
 * pg: source pixel groups, w / 2 groups per line, lines stored back to back.
 * y:  luma plane, receives 2 bytes per pixel group for every processed line.
 * b:  Cb plane, receives 1 byte per pixel group, from the first line of each
 *     line pair only.
 * r:  Cr plane, same rule as b.
 * w:  width in pixels.
 * h:  height in lines. Lines are processed in pairs (h / 2 iterations), so a
 *     trailing odd line is not converted.
 *
 * The chroma of the second line of every pair is discarded, not averaged.
 * Groups are handled 16 at a time with SIMD; the remainder of each line is
 * handled by the scalar loop.
 *
 * Always returns 0.
 */
int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
                                            uint8_t* y, uint8_t* b, uint8_t* r,
                                            uint32_t w, uint32_t h) {
  __m128i shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle0_tbl_128);
  __m128i sllv_mask = _mm_loadu_si128((__m128i*)be10_to_le8_sllv_tbl_128);
  __m128i sllv_shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle1_tbl_128);
  __m128i uyvy2uvyy_mask = _mm_loadu_si128((__m128i*)p8_uyvy2uvyy_mask);

  uint32_t line_pg_cnt = w / 2; /* two pgs in one convert */
  uint32_t pg_cnt;

  for (uint32_t i = 0; i < (h / 2); i++) { /* 2 lines each loop */
    /* first line */
    pg_cnt = line_pg_cnt;
    while (pg_cnt >= 16) {
      __m128i uvyy[4];

      for (int step = 0; step < 4; step++) {
        uvyy[step] = be10_4pg_to_uvyy8_avx512(pg, shuffle_mask, sllv_mask,
                                              sllv_shuffle_mask, uyvy2uvyy_mask);
        pg += 4;
      }

      /* merge all u v y from u0v0y0y1, u1v1y2y3, u2v2y4y5, u3v3y6y7 */
      /* merge y */
      __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
      _mm_storeu_si128((__m128i*)y, y0y1y2y3);
      y += 16;
      __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
      _mm_storeu_si128((__m128i*)y, y4y5y6y7);
      y += 16;
      /* merge b and r */
      __m128i u0v0u1v1 = _mm_unpacklo_epi64(uvyy[0], uvyy[1]);
      __m128i u2v2u3v3 = _mm_unpacklo_epi64(uvyy[2], uvyy[3]);
      __m128i u0u2v0v2 = _mm_unpacklo_epi32(u0v0u1v1, u2v2u3v3);
      __m128i u1u3v1v3 = _mm_unpackhi_epi32(u0v0u1v1, u2v2u3v3);
      __m128i u0u1u2u3 = _mm_unpacklo_epi32(u0u2v0v2, u1u3v1v3);
      _mm_storeu_si128((__m128i*)b, u0u1u2u3);
      b += 16;
      __m128i v0v1v2v3 = _mm_unpackhi_epi32(u0u2v0v2, u1u3v1v3);
      _mm_storeu_si128((__m128i*)r, v0v1v2v3);
      r += 16;

      pg_cnt -= 16;
    }
    /* scalar remainder: keep the top 8 bits of every 10-bit sample */
    while (pg_cnt > 0) {
      *b++ = pg->Cb00;
      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
      *r++ = (pg->Cr00 << 4) | (pg->Cr00_ >> 2);
      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
      pg++;

      pg_cnt--;
    }

    /* second line, no u and v */
    pg_cnt = line_pg_cnt;
    while (pg_cnt >= 16) {
      __m128i uvyy[4];

      for (int step = 0; step < 4; step++) {
        uvyy[step] = be10_4pg_to_uvyy8_avx512(pg, shuffle_mask, sllv_mask,
                                              sllv_shuffle_mask, uyvy2uvyy_mask);
        pg += 4;
      }

      /* only the y halves (upper 64 bits) are stored, chroma is dropped */
      __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
      _mm_storeu_si128((__m128i*)y, y0y1y2y3);
      y += 16;
      __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
      _mm_storeu_si128((__m128i*)y, y4y5y6y7);
      y += 16;

      pg_cnt -= 16;
    }
    while (pg_cnt > 0) {
      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
      pg++;

      pg_cnt--;
    }
  }

  return 0;
}

/* begin st20_rfc4175_422le10_to_v210_avx512 */
static uint8_t le10_to_v210_shuffle_r_tbl_128[16] = {
0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
Expand Down
6 changes: 5 additions & 1 deletion lib/src/st2110/st_avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,11 @@ int st20_rfc4175_422be12_to_yuv422p12le_avx512_dma(
struct mtl_dma_lender_dev* dma, struct st20_rfc4175_422_12_pg2_be* pg_be,
mtl_iova_t pg_be_iova, uint16_t* y, uint16_t* b, uint16_t* r, uint32_t w, uint32_t h);

int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg_10,
int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
uint8_t* y, uint8_t* b, uint8_t* r,
uint32_t w, uint32_t h);

/*
 * AVX512 conversion from rfc4175 422 BE10 pixel groups to planar yuv420p8.
 * pg: source pixel groups; y/b/r: destination Y, Cb and Cr planes;
 * w/h: frame width and height in pixels/lines.
 * Returns 0 on success.
 */
int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
uint8_t* y, uint8_t* b, uint8_t* r,
uint32_t w, uint32_t h);

Expand Down
9 changes: 9 additions & 0 deletions lib/src/st2110/st_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -1396,6 +1396,15 @@ int st20_rfc4175_422be10_to_yuv420p8_simd(struct st20_rfc4175_422_10_pg2_be* pg,
MTL_MAY_UNUSED(level);
MTL_MAY_UNUSED(ret);

#ifdef MTL_HAS_AVX512
if ((level >= MTL_SIMD_LEVEL_AVX512) && (cpu_level >= MTL_SIMD_LEVEL_AVX512)) {
dbg("%s, avx512 ways\n", __func__);
ret = st20_rfc4175_422be10_to_yuv420p8_avx512(pg, y, b, r, w, h);
if (ret == 0) return 0;
dbg("%s, avx512 ways failed\n", __func__);
}
#endif

/* the last option */
return st20_rfc4175_422be10_to_yuv420p8_scalar(pg, y, b, r, w, h);
}
Expand Down
36 changes: 36 additions & 0 deletions tests/src/cvt_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,42 @@ TEST(Cvt, rfc4175_422be10_to_yuv422p8_avx512_vbmi) {
}
}

/*
 * Convert one random w x h rfc4175_422be10 frame to yuv420p8 twice, once at
 * MTL_SIMD_LEVEL_NONE and once at MTL_SIMD_LEVEL_AVX512, and expect both
 * output buffers to be byte-identical.
 */
static void test_cvt_rfc4175_422be10_to_yuv420p8(int w, int h) {
  const size_t src_size = (size_t)w * h * 5 / 2; /* 5 bytes per 2 pixels */
  const size_t dst_size = (size_t)w * h * 3 / 2; /* Y plane + quarter-size Cb and Cr */
  /* plane offsets inside one yuv420p8 buffer */
  const int b_offset = w * h;
  const int r_offset = w * h * 5 / 4;

  struct st20_rfc4175_422_10_pg2_be* src =
      (struct st20_rfc4175_422_10_pg2_be*)st_test_zmalloc(src_size);
  uint8_t* ref_out = (uint8_t*)st_test_zmalloc(dst_size);
  uint8_t* simd_out = (uint8_t*)st_test_zmalloc(dst_size);

  if (src && ref_out && simd_out) {
    int ret;

    st_test_rand_data((uint8_t*)src, src_size, 0);

    /* reference result */
    ret = st20_rfc4175_422be10_to_yuv420p8_simd(src, ref_out, ref_out + b_offset,
                                                ref_out + r_offset, w, h,
                                                MTL_SIMD_LEVEL_NONE);
    EXPECT_EQ(0, ret);

    /* result under test */
    ret = st20_rfc4175_422be10_to_yuv420p8_simd(src, simd_out, simd_out + b_offset,
                                                simd_out + r_offset, w, h,
                                                MTL_SIMD_LEVEL_AVX512);
    EXPECT_EQ(0, ret);

    EXPECT_EQ(0, memcmp(ref_out, simd_out, dst_size));
  } else {
    /* allocation failure: flag the test as failed */
    EXPECT_EQ(0, 1);
  }

  if (src) st_test_free(src);
  if (ref_out) st_test_free(ref_out);
  if (simd_out) st_test_free(simd_out);
}

/* Compare the AVX512 yuv420p8 conversion against the MTL_SIMD_LEVEL_NONE result. */
TEST(Cvt, rfc4175_422be10_to_yuv420p8) {
  /* 960 pixel groups per line, a multiple of the 16-group SIMD batch */
  test_cvt_rfc4175_422be10_to_yuv420p8(1920, 1080);
  /* 361 pixel groups per line: 22 full batches plus 9 groups for the remainder loop */
  test_cvt_rfc4175_422be10_to_yuv420p8(722, 222);
}

static void test_cvt_rfc4175_422le10_to_v210(int w, int h, enum mtl_simd_level cvt_level,
enum mtl_simd_level back_level) {
int ret;
Expand Down