diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6dddd36a6..180bf36b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
 * pipeline: add block get mode support, see `ST20P_TX_FLAG_BLOCK_GET`/`ST20P_RX_FLAG_BLOCK_GET`/`ST22P_TX_FLAG_BLOCK_GET`/`ST22P_RX_FLAG_BLOCK_GET`.
 * rx/timing_parser: add support to export the timing_parser to app, see `app/sample/rx_st20p_timing_parser_sample.c` for usage.
 * st40: add interlaced support.
+* cvt: add st20_rfc4175_422be10_to_yuv422p8 with avx512
+* cvt: add st20_rfc4175_422be10_to_yuv420p8 with avx512
 
 ## Changelog for 23.12
 
diff --git a/doc/convert.md b/doc/convert.md
index 53ea97bba..610c466d6 100644
--- a/doc/convert.md
+++ b/doc/convert.md
@@ -36,6 +36,8 @@ For detailed API usage, please refer to [st_convert_api.h](../include/st_convert
 | rfc4175_422le10 | v210 | ✅ | | ✅ | ✅ |
 | rfc4175_422le10 | rfc4175_422be10 | ✅ | | ✅ | ✅ |
 | rfc4175_422le10 | yuv422p10le | ✅ | | | |
+| rfc4175_422be10 | yuv422p8 | ✅ | | ✅ | |
+| rfc4175_422be10 | yuv420p8 | ✅ | | ✅ | |
 | yuv422p10le | rfc4175_422be10 | ✅ | | ✅ | |
 | yuv422p10le | rfc4175_422le10 | ✅ | | | |
 | v210 | rfc4175_422be10 | ✅ | | ✅ | ✅ |
@@ -323,6 +325,20 @@ Y210 (1 pixel group)
 | 0x4 | 0x5 | 0x6 | 0x7 |
 ```
 
+### yuv422p8
+
+Color space: YUV
+Sample: 422
+Packed/planar: planar
+Depth: 8
+
+### yuv420p8
+
+Color space: YUV
+Sample: 420
+Packed/planar: planar
+Depth: 8
+
 ### yuv422p12le
 
 Color space: YUV
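[Reviewer note, not part of the patch] The new yuv420p8 destination documented above is the usual three-plane layout: a full-resolution 8-bit Y plane followed by quarter-size Cb and Cr planes, so a single buffer of w * h * 3 / 2 bytes is sliced into three pointers. Below is a minimal caller sketch using the st20_rfc4175_422be10_to_yuv420p8_simd() entry point touched in st_convert.c further down; the header include and the helper function name are illustrative assumptions, not something this patch adds.

#include <stdint.h>
#include <st_convert_api.h> /* assumed include; use the MTL convert header that declares the _simd converters */

/* Illustrative helper: convert one RFC4175 422BE10 frame into a single
 * yuv420p8 buffer of w * h * 3 / 2 bytes. */
static int frame_be10_to_yuv420p8(struct st20_rfc4175_422_10_pg2_be* src,
                                  uint8_t* dst, uint32_t w, uint32_t h) {
  uint8_t* y = dst;                         /* w * h luma samples */
  uint8_t* b = dst + (size_t)w * h;         /* (w/2) * (h/2) Cb samples */
  uint8_t* r = dst + (size_t)w * h * 5 / 4; /* (w/2) * (h/2) Cr samples */
  /* Request the AVX512 path; st_convert.c falls back to the scalar
   * implementation when the build or the CPU does not support it. */
  return st20_rfc4175_422be10_to_yuv420p8_simd(src, y, b, r, w, h,
                                               MTL_SIMD_LEVEL_AVX512);
}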
diff --git a/lib/src/st2110/st_avx512.c b/lib/src/st2110/st_avx512.c
index cf78640d6..5856ee2db 100644
--- a/lib/src/st2110/st_avx512.c
+++ b/lib/src/st2110/st_avx512.c
@@ -808,6 +808,126 @@ int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* p
   return 0;
 }
 
+int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
+                                            uint8_t* y, uint8_t* b, uint8_t* r,
+                                            uint32_t w, uint32_t h) {
+  __m128i shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle0_tbl_128);
+  __m128i sllv_mask = _mm_loadu_si128((__m128i*)be10_to_le8_sllv_tbl_128);
+  __m128i sllv_shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle1_tbl_128);
+  __m128i uyvy2uvyy_mask = _mm_loadu_si128((__m128i*)p8_uyvy2uvyy_mask);
+  __mmask16 k = 0x3FF; /* each __m128i with 2 pg group, 10 bytes */
+
+  uint32_t line_pg_cnt = w / 2; /* two pgs in one convert */
+  uint32_t pg_cnt;
+
+  for (uint32_t i = 0; i < (h / 2); i++) { /* 2 lines each loop */
+    /* first line */
+    pg_cnt = line_pg_cnt;
+    while (pg_cnt >= 16) {
+      __m128i uvyy[4];
+
+      for (int step = 0; step < 4; step++) {
+        __m128i input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+        pg += 2;
+        __m128i shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+        __m128i sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+        /* uyvy uyvy .... .... */
+        __m128i uyvy_t1 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+        input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+        pg += 2;
+        shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+        sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+        /* uyvy uyvy .... .... */
+        __m128i uyvy_t2 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+        /* uyvy uyvy uyvy uyvy */
+        __m128i uyvy = _mm_unpacklo_epi64(uyvy_t1, uyvy_t2);
+        /* uuuu vvvv yyyy yyyy */
+        uvyy[step] = _mm_shuffle_epi8(uyvy, uyvy2uvyy_mask);
+      }
+
+      /* merge all u v y from u0v0y0y1, u1v1y2y3, u2v2y4y5, u3v3y6y7*/
+      /* merge y */
+      __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
+      _mm_storeu_si128((__m128i*)y, y0y1y2y3);
+      y += 16;
+      __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
+      _mm_storeu_si128((__m128i*)y, y4y5y6y7);
+      y += 16;
+      /* merge b and r */
+      __m128i u0v0u1v1 = _mm_unpacklo_epi64(uvyy[0], uvyy[1]);
+      __m128i u2v2u3v3 = _mm_unpacklo_epi64(uvyy[2], uvyy[3]);
+      __m128i u0u2v0v2 = _mm_unpacklo_epi32(u0v0u1v1, u2v2u3v3);
+      __m128i u1u3v1v3 = _mm_unpackhi_epi32(u0v0u1v1, u2v2u3v3);
+      __m128i u0u1u2u3 = _mm_unpacklo_epi32(u0u2v0v2, u1u3v1v3);
+      _mm_storeu_si128((__m128i*)b, u0u1u2u3);
+      b += 16;
+      __m128i v0v1v2v3 = _mm_unpackhi_epi32(u0u2v0v2, u1u3v1v3);
+      _mm_storeu_si128((__m128i*)r, v0v1v2v3);
+      r += 16;
+
+      pg_cnt -= 16;
+    }
+    while (pg_cnt > 0) {
+      *b++ = pg->Cb00;
+      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
+      *r++ = (pg->Cr00 << 4) | (pg->Cr00_ >> 2);
+      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
+      pg++;
+
+      pg_cnt--;
+    }
+
+    /* second line, no u and v */
+    pg_cnt = line_pg_cnt;
+    while (pg_cnt >= 16) {
+      __m128i uvyy[4];
+
+      for (int step = 0; step < 4; step++) {
+        __m128i input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+        pg += 2;
+        __m128i shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+        __m128i sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+        /* uyvy uyvy .... .... */
+        __m128i uyvy_t1 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+        input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+        pg += 2;
+        shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+        sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+        /* uyvy uyvy .... .... */
+        __m128i uyvy_t2 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+        /* uyvy uyvy uyvy uyvy */
+        __m128i uyvy = _mm_unpacklo_epi64(uyvy_t1, uyvy_t2);
+        /* uuuu vvvv yyyy yyyy */
+        uvyy[step] = _mm_shuffle_epi8(uyvy, uyvy2uvyy_mask);
+      }
+
+      /* merge all u v y from u0v0y0y1, u1v1y2y3, u2v2y4y5, u3v3y6y7*/
+      /* merge y */
+      __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
+      _mm_storeu_si128((__m128i*)y, y0y1y2y3);
+      y += 16;
+      __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
+      _mm_storeu_si128((__m128i*)y, y4y5y6y7);
+      y += 16;
+
+      pg_cnt -= 16;
+    }
+    while (pg_cnt > 0) {
+      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
+      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
+      pg++;
+
+      pg_cnt--;
+    }
+  }
+
+  return 0;
+}
+
 /* begin st20_rfc4175_422le10_to_v210_avx512 */
 static uint8_t le10_to_v210_shuffle_r_tbl_128[16] = {
     0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
diff --git a/lib/src/st2110/st_avx512.h b/lib/src/st2110/st_avx512.h
index 0b4ee86eb..05cffc96d 100644
--- a/lib/src/st2110/st_avx512.h
+++ b/lib/src/st2110/st_avx512.h
@@ -111,7 +111,11 @@ int st20_rfc4175_422be12_to_yuv422p12le_avx512_dma(
     struct mtl_dma_lender_dev* dma, struct st20_rfc4175_422_12_pg2_be* pg_be,
     mtl_iova_t pg_be_iova, uint16_t* y, uint16_t* b, uint16_t* r, uint32_t w, uint32_t h);
 
-int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg_10,
+int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
+                                            uint8_t* y, uint8_t* b, uint8_t* r,
+                                            uint32_t w, uint32_t h);
+
+int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
                                             uint8_t* y, uint8_t* b, uint8_t* r,
                                             uint32_t w, uint32_t h);
 
diff --git a/lib/src/st2110/st_convert.c b/lib/src/st2110/st_convert.c
index 784d86723..a56d7d009 100644
--- a/lib/src/st2110/st_convert.c
+++ b/lib/src/st2110/st_convert.c
@@ -1396,6 +1396,15 @@ int st20_rfc4175_422be10_to_yuv420p8_simd(struct st20_rfc4175_422_10_pg2_be* pg,
   MTL_MAY_UNUSED(level);
   MTL_MAY_UNUSED(ret);
 
+#ifdef MTL_HAS_AVX512
+  if ((level >= MTL_SIMD_LEVEL_AVX512) && (cpu_level >= MTL_SIMD_LEVEL_AVX512)) {
+    dbg("%s, avx512 ways\n", __func__);
+    ret = st20_rfc4175_422be10_to_yuv420p8_avx512(pg, y, b, r, w, h);
+    if (ret == 0) return 0;
+    dbg("%s, avx512 ways failed\n", __func__);
+  }
+#endif
+
   /* the last option */
   return st20_rfc4175_422be10_to_yuv420p8_scalar(pg, y, b, r, w, h);
 }
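[Reviewer note, not part of the patch] The scalar tail loops in the new kernel make the intended math explicit: each 5-byte pixel group carries big-endian 10-bit Cb, Y0, Cr, Y1 samples, the 8-bit outputs are simply the top 8 bits of each, and on the second line of every pair only the two luma samples are kept, which is the vertical 4:2:2 to 4:2:0 chroma decimation. A standalone sketch of that logic follows, written purely as a reading aid; it reuses the bit-field names of struct st20_rfc4175_422_10_pg2_be from the public st20 headers and is not the library's _scalar implementation.

#include <stdint.h>
#include <st20_api.h> /* assumed include for struct st20_rfc4175_422_10_pg2_be */

/* Reference conversion: truncate every 10-bit sample to its top 8 bits and
 * take chroma from even lines only. */
static void be10_to_yuv420p8_ref(const struct st20_rfc4175_422_10_pg2_be* pg,
                                 uint8_t* y, uint8_t* b, uint8_t* r,
                                 uint32_t w, uint32_t h) {
  for (uint32_t line = 0; line < h; line++) {
    for (uint32_t i = 0; i < w / 2; i++, pg++) { /* one pg = 2 pixels */
      if ((line % 2) == 0) { /* even line: keep Cb/Cr */
        *b++ = pg->Cb00;                           /* Cb bits 9..2 */
        *r++ = (pg->Cr00 << 4) | (pg->Cr00_ >> 2); /* Cr bits 9..2 */
      }
      *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2); /* Y0 bits 9..2 */
      *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2); /* Y1 bits 9..2 */
    }
  }
}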
diff --git a/tests/src/cvt_test.cpp b/tests/src/cvt_test.cpp
index a8d172c6c..7ee5a84b7 100644
--- a/tests/src/cvt_test.cpp
+++ b/tests/src/cvt_test.cpp
@@ -1208,6 +1208,42 @@ TEST(Cvt, rfc4175_422be10_to_yuv422p8_avx512_vbmi) {
   }
 }
 
+static void test_cvt_rfc4175_422be10_to_yuv420p8(int w, int h) {
+  int ret;
+  size_t fb_pg2_size_10 = (size_t)w * h * 5 / 2;
+  size_t fb_yuv420p8_size = (size_t)w * h * 3 / 2;
+  struct st20_rfc4175_422_10_pg2_be* pg_10 =
+      (struct st20_rfc4175_422_10_pg2_be*)st_test_zmalloc(fb_pg2_size_10);
+  uint8_t* p8 = (uint8_t*)st_test_zmalloc(fb_yuv420p8_size);
+  uint8_t* p8_2 = (uint8_t*)st_test_zmalloc(fb_yuv420p8_size);
+
+  if (!pg_10 || !p8 || !p8_2) {
+    EXPECT_EQ(0, 1);
+    if (pg_10) st_test_free(pg_10);
+    if (p8) st_test_free(p8);
+    if (p8_2) st_test_free(p8_2);
+    return;
+  }
+
+  st_test_rand_data((uint8_t*)pg_10, fb_pg2_size_10, 0);
+  ret = st20_rfc4175_422be10_to_yuv420p8_simd(pg_10, p8, p8 + w * h, p8 + w * h * 5 / 4,
+                                              w, h, MTL_SIMD_LEVEL_NONE);
+  EXPECT_EQ(0, ret);
+  ret = st20_rfc4175_422be10_to_yuv420p8_simd(
+      pg_10, p8_2, p8_2 + w * h, p8_2 + w * h * 5 / 4, w, h, MTL_SIMD_LEVEL_AVX512);
+  EXPECT_EQ(0, ret);
+
+  EXPECT_EQ(0, memcmp(p8, p8_2, fb_yuv420p8_size));
+
+  st_test_free(pg_10);
+  st_test_free(p8);
+  st_test_free(p8_2);
+}
+
+TEST(Cvt, rfc4175_422be10_to_yuv420p8) {
+  test_cvt_rfc4175_422be10_to_yuv420p8(1920, 1080);
+}
+
 static void test_cvt_rfc4175_422le10_to_v210(int w, int h, enum mtl_simd_level cvt_level,
                                              enum mtl_simd_level back_level) {
   int ret;
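[Reviewer note, not part of the patch] The new GTest case checks the AVX512 output byte-for-byte against the scalar path at 1920x1080. Since 1920 is a multiple of the kernel's 32-pixel block (16 pixel groups per iteration), that case never reaches the while (pg_cnt > 0) scalar tails; a sketch of an extra case that would exercise them, with the test name and resolution being illustrative only:

TEST(Cvt, rfc4175_422be10_to_yuv420p8_tail) {
  /* 1922 pixels per line -> 961 pixel groups, and 961 % 16 != 0, so the
   * scalar tail of each line runs in addition to the AVX512 main loop. */
  test_cvt_rfc4175_422be10_to_yuv420p8(1920 + 2, 1080);
}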