diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6dddd36a6..180bf36b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
* pipeline: add block get mode support, see `ST20P_TX_FLAG_BLOCK_GET`/`ST20P_RX_FLAG_BLOCK_GET`/`ST22P_TX_FLAG_BLOCK_GET`/`ST22P_RX_FLAG_BLOCK_GET`.
* rx/timing_parser: add support to export the timing_parser to app, see `app/sample/rx_st20p_timing_parser_sample.c` for usage.
* st40: add interlaced support.
+* cvt: add `st20_rfc4175_422be10_to_yuv422p8` with avx512 support.
+* cvt: add `st20_rfc4175_422be10_to_yuv420p8` with avx512 support.
## Changelog for 23.12
diff --git a/doc/convert.md b/doc/convert.md
index 53ea97bba..610c466d6 100644
--- a/doc/convert.md
+++ b/doc/convert.md
@@ -36,6 +36,8 @@ For detailed API usage, please refer to [st_convert_api.h](../include/st_convert
| rfc4175_422le10 | v210 | ✅ | | ✅ | ✅ |
| rfc4175_422le10 | rfc4175_422be10 | ✅ | | ✅ | ✅ |
| rfc4175_422le10 | yuv422p10le | ✅ | | | |
+| rfc4175_422be10 | yuv422p8 | ✅ | | ✅ | |
+| rfc4175_422be10 | yuv420p8 | ✅ | | ✅ | |
| yuv422p10le | rfc4175_422be10 | ✅ | | ✅ | |
| yuv422p10le | rfc4175_422le10 | ✅ | | | |
| v210 | rfc4175_422be10 | ✅ | | ✅ | ✅ |
@@ -323,6 +325,20 @@ Y210 (1 pixel group)
| 0x4 | 0x5 | 0x6 | 0x7 |
```
+### yuv422p8
+
+Color space: YUV
+Sample: 422
+Packed/planar: planar
+Depth: 8
+
+### yuv420p8
+
+Color space: YUV
+Sample: 420
+Packed/planar: planar
+Depth: 8
+
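+Below is a minimal usage sketch for filling this layout from RFC4175 BE10 pixel groups (a sketch only: `pg_be`, `w`, `h` and the allocation are placeholders, and the plane offsets mirror the ones used in `tests/src/cvt_test.cpp`):
+
+```c
+/* one w x h frame: Y plane is w*h bytes, U and V planes are w*h/4 bytes each */
+uint8_t* frame = (uint8_t*)malloc((size_t)w * h * 3 / 2);
+uint8_t* y = frame;                 /* full resolution luma */
+uint8_t* u = frame + (size_t)w * h; /* (w/2) x (h/2) Cb samples */
+uint8_t* v = u + (size_t)w * h / 4; /* (w/2) x (h/2) Cr samples */
+int ret = st20_rfc4175_422be10_to_yuv420p8_simd(pg_be, y, u, v, w, h,
+                                                MTL_SIMD_LEVEL_AVX512);
+```
+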
### yuv422p12le
Color space: YUV
diff --git a/lib/src/st2110/st_avx512.c b/lib/src/st2110/st_avx512.c
index cf78640d6..5856ee2db 100644
--- a/lib/src/st2110/st_avx512.c
+++ b/lib/src/st2110/st_avx512.c
@@ -808,6 +808,126 @@ int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* p
return 0;
}
+int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
+ uint8_t* y, uint8_t* b, uint8_t* r,
+ uint32_t w, uint32_t h) {
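+  /*
+   * Walk the frame two lines per outer iteration. Each SIMD loop iteration
+   * unpacks 16 pgs (32 pixels) from BE10 into 8-bit samples: the first line
+   * keeps luma and chroma, the second line keeps luma only, which produces
+   * the 4:2:0 vertical chroma subsampling. Scalar tails handle widths that
+   * are not a multiple of 32 pixels.
+   */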
+ __m128i shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle0_tbl_128);
+ __m128i sllv_mask = _mm_loadu_si128((__m128i*)be10_to_le8_sllv_tbl_128);
+ __m128i sllv_shuffle_mask = _mm_loadu_si128((__m128i*)be10_to_le8_shuffle1_tbl_128);
+ __m128i uyvy2uvyy_mask = _mm_loadu_si128((__m128i*)p8_uyvy2uvyy_mask);
+  __mmask16 k = 0x3FF; /* each masked load covers 2 pgs, 10 bytes */
+
+  uint32_t line_pg_cnt = w / 2; /* each pg holds 2 pixels */
+ uint32_t pg_cnt;
+
+ for (uint32_t i = 0; i < (h / 2); i++) { /* 2 lines each loop */
+    /* first line: convert luma and chroma */
+ pg_cnt = line_pg_cnt;
+ while (pg_cnt >= 16) {
+ __m128i uvyy[4];
+
+ for (int step = 0; step < 4; step++) {
+ __m128i input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+ pg += 2;
+ __m128i shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+ __m128i sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+ /* uyvy uyvy .... .... */
+ __m128i uyvy_t1 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+ input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+ pg += 2;
+ shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+ sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+ /* uyvy uyvy .... .... */
+ __m128i uyvy_t2 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+ /* uyvy uyvy uyvy uyvy */
+ __m128i uyvy = _mm_unpacklo_epi64(uyvy_t1, uyvy_t2);
+ /* uuuu vvvv yyyy yyyy */
+ uvyy[step] = _mm_shuffle_epi8(uyvy, uyvy2uvyy_mask);
+ }
+
+      /* merge all u v y from u0v0y0y1, u1v1y2y3, u2v2y4y5, u3v3y6y7 */
+ /* merge y */
+ __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
+ _mm_storeu_si128((__m128i*)y, y0y1y2y3);
+ y += 16;
+ __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
+ _mm_storeu_si128((__m128i*)y, y4y5y6y7);
+ y += 16;
+ /* merge b and r */
+ __m128i u0v0u1v1 = _mm_unpacklo_epi64(uvyy[0], uvyy[1]);
+ __m128i u2v2u3v3 = _mm_unpacklo_epi64(uvyy[2], uvyy[3]);
+ __m128i u0u2v0v2 = _mm_unpacklo_epi32(u0v0u1v1, u2v2u3v3);
+ __m128i u1u3v1v3 = _mm_unpackhi_epi32(u0v0u1v1, u2v2u3v3);
+ __m128i u0u1u2u3 = _mm_unpacklo_epi32(u0u2v0v2, u1u3v1v3);
+ _mm_storeu_si128((__m128i*)b, u0u1u2u3);
+ b += 16;
+ __m128i v0v1v2v3 = _mm_unpackhi_epi32(u0u2v0v2, u1u3v1v3);
+ _mm_storeu_si128((__m128i*)r, v0v1v2v3);
+ r += 16;
+
+ pg_cnt -= 16;
+ }
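+    /* scalar tail: convert any pgs left when the width is not a multiple of 32 pixels */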
+ while (pg_cnt > 0) {
+ *b++ = pg->Cb00;
+ *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
+ *r++ = (pg->Cr00 << 4) | (pg->Cr00_ >> 2);
+ *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
+ pg++;
+
+ pg_cnt--;
+ }
+
+    /* second line: convert luma only, chroma is dropped for the 4:2:0 vertical subsampling */
+ pg_cnt = line_pg_cnt;
+ while (pg_cnt >= 16) {
+ __m128i uvyy[4];
+
+ for (int step = 0; step < 4; step++) {
+ __m128i input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+ pg += 2;
+ __m128i shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+ __m128i sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+ /* uyvy uyvy .... .... */
+ __m128i uyvy_t1 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+ input = _mm_maskz_loadu_epi8(k, (__m128i*)pg);
+ pg += 2;
+ shuffle_result = _mm_shuffle_epi8(input, shuffle_mask);
+ sllv_result = _mm_sllv_epi16(shuffle_result, sllv_mask);
+ /* uyvy uyvy .... .... */
+ __m128i uyvy_t2 = _mm_shuffle_epi8(sllv_result, sllv_shuffle_mask);
+
+ /* uyvy uyvy uyvy uyvy */
+ __m128i uyvy = _mm_unpacklo_epi64(uyvy_t1, uyvy_t2);
+ /* uuuu vvvv yyyy yyyy */
+ uvyy[step] = _mm_shuffle_epi8(uyvy, uyvy2uvyy_mask);
+ }
+
+      /* merge only y from u0v0y0y1, u1v1y2y3, u2v2y4y5, u3v3y6y7 */
+ __m128i y0y1y2y3 = _mm_unpackhi_epi64(uvyy[0], uvyy[1]);
+ _mm_storeu_si128((__m128i*)y, y0y1y2y3);
+ y += 16;
+ __m128i y4y5y6y7 = _mm_unpackhi_epi64(uvyy[2], uvyy[3]);
+ _mm_storeu_si128((__m128i*)y, y4y5y6y7);
+ y += 16;
+
+ pg_cnt -= 16;
+ }
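+    /* scalar tail: remaining pgs on the second line, luma only */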
+ while (pg_cnt > 0) {
+ *y++ = (pg->Y00 << 2) | (pg->Y00_ >> 2);
+ *y++ = (pg->Y01 << 6) | (pg->Y01_ >> 2);
+ pg++;
+
+ pg_cnt--;
+ }
+ }
+
+ return 0;
+}
+
/* begin st20_rfc4175_422le10_to_v210_avx512 */
static uint8_t le10_to_v210_shuffle_r_tbl_128[16] = {
0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
diff --git a/lib/src/st2110/st_avx512.h b/lib/src/st2110/st_avx512.h
index 0b4ee86eb..05cffc96d 100644
--- a/lib/src/st2110/st_avx512.h
+++ b/lib/src/st2110/st_avx512.h
@@ -111,7 +111,11 @@ int st20_rfc4175_422be12_to_yuv422p12le_avx512_dma(
struct mtl_dma_lender_dev* dma, struct st20_rfc4175_422_12_pg2_be* pg_be,
mtl_iova_t pg_be_iova, uint16_t* y, uint16_t* b, uint16_t* r, uint32_t w, uint32_t h);
-int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg_10,
+int st20_rfc4175_422be10_to_yuv422p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
+ uint8_t* y, uint8_t* b, uint8_t* r,
+ uint32_t w, uint32_t h);
+
+int st20_rfc4175_422be10_to_yuv420p8_avx512(struct st20_rfc4175_422_10_pg2_be* pg,
uint8_t* y, uint8_t* b, uint8_t* r,
uint32_t w, uint32_t h);
diff --git a/lib/src/st2110/st_convert.c b/lib/src/st2110/st_convert.c
index 784d86723..a56d7d009 100644
--- a/lib/src/st2110/st_convert.c
+++ b/lib/src/st2110/st_convert.c
@@ -1396,6 +1396,15 @@ int st20_rfc4175_422be10_to_yuv420p8_simd(struct st20_rfc4175_422_10_pg2_be* pg,
MTL_MAY_UNUSED(level);
MTL_MAY_UNUSED(ret);
+#ifdef MTL_HAS_AVX512
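+  /* prefer the avx512 path when both the requested level and the running CPU support it */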
+ if ((level >= MTL_SIMD_LEVEL_AVX512) && (cpu_level >= MTL_SIMD_LEVEL_AVX512)) {
+ dbg("%s, avx512 ways\n", __func__);
+ ret = st20_rfc4175_422be10_to_yuv420p8_avx512(pg, y, b, r, w, h);
+ if (ret == 0) return 0;
+ dbg("%s, avx512 ways failed\n", __func__);
+ }
+#endif
+
/* the last option */
return st20_rfc4175_422be10_to_yuv420p8_scalar(pg, y, b, r, w, h);
}
diff --git a/tests/src/cvt_test.cpp b/tests/src/cvt_test.cpp
index a8d172c6c..7ee5a84b7 100644
--- a/tests/src/cvt_test.cpp
+++ b/tests/src/cvt_test.cpp
@@ -1208,6 +1208,42 @@ TEST(Cvt, rfc4175_422be10_to_yuv422p8_avx512_vbmi) {
}
}
+static void test_cvt_rfc4175_422be10_to_yuv420p8(int w, int h) {
+ int ret;
+ size_t fb_pg2_size_10 = (size_t)w * h * 5 / 2;
+ size_t fb_yuv420p8_size = (size_t)w * h * 3 / 2;
+ struct st20_rfc4175_422_10_pg2_be* pg_10 =
+ (struct st20_rfc4175_422_10_pg2_be*)st_test_zmalloc(fb_pg2_size_10);
+ uint8_t* p8 = (uint8_t*)st_test_zmalloc(fb_yuv420p8_size);
+ uint8_t* p8_2 = (uint8_t*)st_test_zmalloc(fb_yuv420p8_size);
+
+ if (!pg_10 || !p8 || !p8_2) {
+ EXPECT_EQ(0, 1);
+ if (pg_10) st_test_free(pg_10);
+ if (p8) st_test_free(p8);
+ if (p8_2) st_test_free(p8_2);
+ return;
+ }
+
+ st_test_rand_data((uint8_t*)pg_10, fb_pg2_size_10, 0);
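+  /* planar 4:2:0 layout within one buffer: Y at offset 0, U at w*h, V at w*h*5/4 */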
+ ret = st20_rfc4175_422be10_to_yuv420p8_simd(pg_10, p8, p8 + w * h, p8 + w * h * 5 / 4,
+ w, h, MTL_SIMD_LEVEL_NONE);
+ EXPECT_EQ(0, ret);
+ ret = st20_rfc4175_422be10_to_yuv420p8_simd(
+ pg_10, p8_2, p8_2 + w * h, p8_2 + w * h * 5 / 4, w, h, MTL_SIMD_LEVEL_AVX512);
+ EXPECT_EQ(0, ret);
+
+ EXPECT_EQ(0, memcmp(p8, p8_2, fb_yuv420p8_size));
+
+ st_test_free(pg_10);
+ st_test_free(p8);
+ st_test_free(p8_2);
+}
+
+TEST(Cvt, rfc4175_422be10_to_yuv420p8) {
+ test_cvt_rfc4175_422be10_to_yuv420p8(1920, 1080);
+}
+
static void test_cvt_rfc4175_422le10_to_v210(int w, int h, enum mtl_simd_level cvt_level,
enum mtl_simd_level back_level) {
int ret;