From b1ac9d37168e2154164e7f1167e011b0ee332e73 Mon Sep 17 00:00:00 2001 From: ttrigui <44172344+ttrigui@users.noreply.github.com> Date: Mon, 25 Sep 2023 06:27:43 -0700 Subject: [PATCH] AVX512 Release (#8) * Add CSF avx512 version * Add SAST avx512 version * Add SSIM avx512 version * Add ADM Decouple avx512 version * Add DWT2 avx512 version * Add VIF avx512 version * Add some AVX2 changes, some bug fix and remove unused variables Co-authored-by: PierreIntel --- Makefile | 2 +- .../third_party/funque/integer_funque.c | 55 +- .../third_party/funque/x86/hbd_resizer_avx2.c | 276 +- .../funque/x86/hbd_resizer_avx512.c | 583 +++ .../funque/x86/integer_funque_adm_avx512.c | 1035 +++++ .../funque/x86/integer_funque_adm_avx512.h | 36 + .../funque/x86/integer_funque_filters_avx2.c | 2240 +++++++---- .../x86/integer_funque_filters_avx512.c | 3389 +++++++++++++++++ .../x86/integer_funque_filters_avx512.h | 27 + .../funque/x86/integer_funque_ssim_avx512.c | 884 +++++ .../funque/x86/integer_funque_ssim_avx512.h | 21 + .../funque/x86/integer_funque_vif_avx2.c | 2 +- .../funque/x86/integer_funque_vif_avx2.h | 13 +- .../funque/x86/integer_funque_vif_avx512.c | 634 +++ .../funque/x86/integer_funque_vif_avx512.h | 691 ++++ .../third_party/funque/x86/resizer_avx2.c | 275 +- .../third_party/funque/x86/resizer_avx512.c | 521 +++ .../third_party/funque/x86/resizer_avx512.h | 31 + libvmaf/src/meson.build | 11 + 19 files changed, 9657 insertions(+), 1069 deletions(-) create mode 100644 libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.c create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.h create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.c create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.h create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.c create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.h create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx512.c create mode 100644 libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx512.h create mode 100644 libvmaf/src/feature/third_party/funque/x86/resizer_avx512.c create mode 100644 libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h diff --git a/Makefile b/Makefile index 468e5b2fa..bd27d4eb7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ all: - meson setup libvmaf/build libvmaf --buildtype release -Denable_float=true -Denable_float_funque=true -Denable_integer_funque=true && \ + meson setup libvmaf/build libvmaf --buildtype release -Denable_float=true -Denable_float_funque=true -Denable_integer_funque=true -Denable_avx512=true && \ ninja -vC libvmaf/build cd python && python3 setup.py build_ext --build-lib . 
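The new kernels are gated twice: the -Denable_avx512=true meson option added above is expected to define HAVE_AVX512 at build time (via the meson.build change listed in the diffstat), and init() in integer_funque.c (next hunk) only swaps in the *_avx512 function pointers when the CPU reports AVX-512 support at runtime. A minimal standalone sketch of that two-level dispatch, assuming illustrative flag values and a reduced module struct (only the VMAF_X86_CPU_FLAG_AVX512 and integer_spatial_filter_avx512 names are taken from the hunk below, everything else is hypothetical):

#include <stdio.h>

/* Illustrative stand-ins for the real libvmaf types and flag values. */
enum { VMAF_X86_CPU_FLAG_AVX2 = 1 << 0, VMAF_X86_CPU_FLAG_AVX512 = 1 << 1 };
typedef struct { void (*integer_spatial_filter)(void); } funque_modules;

static void integer_spatial_filter_c(void)      { puts("scalar path"); }
static void integer_spatial_filter_avx2(void)   { puts("AVX2 path"); }
#if HAVE_AVX512
static void integer_spatial_filter_avx512(void) { puts("AVX-512 path"); }
#endif

static void pick_modules(funque_modules *m, unsigned cpu_flags)
{
    m->integer_spatial_filter = integer_spatial_filter_c;
    if (cpu_flags & VMAF_X86_CPU_FLAG_AVX2)
        m->integer_spatial_filter = integer_spatial_filter_avx2;
#if HAVE_AVX512
    /* Highest supported ISA wins, mirroring the init() changes below. */
    if (cpu_flags & VMAF_X86_CPU_FLAG_AVX512)
        m->integer_spatial_filter = integer_spatial_filter_avx512;
#endif
}

int main(void)
{
    funque_modules m;
    pick_modules(&m, VMAF_X86_CPU_FLAG_AVX2 | VMAF_X86_CPU_FLAG_AVX512);
    m.integer_spatial_filter();   /* prints "AVX2 path" unless built with -DHAVE_AVX512=1 */
    return 0;
}
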
diff --git a/libvmaf/src/feature/third_party/funque/integer_funque.c b/libvmaf/src/feature/third_party/funque/integer_funque.c index 2e5764fb3..27a4f0a9f 100644 --- a/libvmaf/src/feature/third_party/funque/integer_funque.c +++ b/libvmaf/src/feature/third_party/funque/integer_funque.c @@ -60,6 +60,13 @@ #include "x86/integer_funque_adm_avx2.h" #include "x86/integer_funque_motion_avx2.h" #include "x86/resizer_avx2.h" +#if HAVE_AVX512 +#include "x86/integer_funque_filters_avx512.h" +#include "x86/resizer_avx512.h" +#include "x86/integer_funque_ssim_avx512.h" +#include "x86/integer_funque_adm_avx512.h" +#include "x86/integer_funque_vif_avx512.h" +#endif #endif #include "cpu.h" @@ -344,6 +351,18 @@ static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, s->resize_module.resizer_step = step_avx2; s->resize_module.hbd_resizer_step = hbd_step_avx2; } +#if HAVE_AVX512 + if (flags & VMAF_X86_CPU_FLAG_AVX512) { + s->modules.integer_spatial_filter = integer_spatial_filter_avx512; + s->resize_module.resizer_step = step_avx512; + s->resize_module.hbd_resizer_step = hbd_step_avx512; + s->modules.integer_compute_ssim_funque = integer_compute_ssim_funque_avx512; + s->modules.integer_funque_adm_decouple = integer_adm_decouple_avx512; + s->modules.integer_funque_dwt2 = integer_funque_dwt2_avx512; + s->modules.integer_funque_vifdwt2_band0 = integer_funque_vifdwt2_band0_avx512; + s->modules.integer_compute_vif_funque = integer_compute_vif_funque_avx512; + } +#endif #endif funque_log_generate(s->log_18); @@ -447,7 +466,7 @@ static int extract(VmafFeatureExtractor *fex, if (err) return err; - err = s->modules.integer_compute_ssim_funque(&s->i_ref_dwt2out, &s->i_dist_dwt2out, &ssim_score, 1, 0.01, 0.03, pending_div_factor, s->adm_div_lookup); + err = s->modules.integer_compute_ssim_funque(&s->i_ref_dwt2out, &s->i_dist_dwt2out, &ssim_score[0], 1, 0.01, 0.03, pending_div_factor, s->adm_div_lookup); if (err) return err; @@ -500,23 +519,23 @@ static int extract(VmafFeatureExtractor *fex, s->feature_name_dict, "FUNQUE_integer_feature_vif_scale0_score", vif_score[0], index); - if (s->vif_levels > 1) { - err |= vmaf_feature_collector_append_with_dict(feature_collector, - s->feature_name_dict, "FUNQUE_integer_feature_vif_scale1_score", - vif_score[1], index); - - if (s->vif_levels > 2) { - err |= vmaf_feature_collector_append_with_dict(feature_collector, - s->feature_name_dict, "FUNQUE_integer_feature_vif_scale2_score", - vif_score[2], index); - - if (s->vif_levels > 3) { - err |= vmaf_feature_collector_append_with_dict(feature_collector, - s->feature_name_dict, "FUNQUE_integer_feature_vif_scale3_score", - vif_score[3], index); - } - } - } + // if (s->vif_levels > 1) { + // err |= vmaf_feature_collector_append_with_dict(feature_collector, + // s->feature_name_dict, "FUNQUE_integer_feature_vif_scale1_score", + // vif_score[1], index); + + // if (s->vif_levels > 2) { + // err |= vmaf_feature_collector_append_with_dict(feature_collector, + // s->feature_name_dict, "FUNQUE_integer_feature_vif_scale2_score", + // vif_score[2], index); + + // if (s->vif_levels > 3) { + // err |= vmaf_feature_collector_append_with_dict(feature_collector, + // s->feature_name_dict, "FUNQUE_integer_feature_vif_scale3_score", + // vif_score[3], index); + // } + // } + // } err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "FUNQUE_integer_feature_adm_score", diff --git a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c 
index eb7a6230c..9b2613594 100644 --- a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c +++ b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx2.c @@ -73,6 +73,10 @@ void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, int swidth, int dwidth, int cn, int xmin, int xmax) #endif { + __m256i coef0_256 = _mm256_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); + __m256i coef2_256 = _mm256_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); + __m256i zero_256 = _mm256_setzero_si256(); + int xmax_16 = xmax - (xmax % 16); int xmax_8 = xmax - (xmax % 8); int xmax_4 = xmax - (xmax % 4); @@ -120,31 +124,28 @@ void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, { int sx = xofs[dx]; // sx - 2, 4, 6, 8.... #endif - __m256i coef0 = _mm256_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); - __m256i coef2 = _mm256_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); - __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); __m256i val0_16 = _mm256_loadu_si256((__m256i*)(S + sx + 15)); __m256i val2_16 = _mm256_loadu_si256((__m256i*)(S + sx + 17)); - __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, _mm256_setzero_si256()); - __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, _mm256_setzero_si256()); - __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, _mm256_setzero_si256()); - __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, _mm256_setzero_si256()); - __m256i val0_16_lo = _mm256_unpacklo_epi16(val0_16, _mm256_setzero_si256()); - __m256i val0_16_hi = _mm256_unpackhi_epi16(val0_16, _mm256_setzero_si256()); - __m256i val2_16_lo = _mm256_unpacklo_epi16(val2_16, _mm256_setzero_si256()); - __m256i val2_16_hi = _mm256_unpackhi_epi16(val2_16, _mm256_setzero_si256()); - - __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0); - __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0); - __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2); - __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2); - __m256i mul0_16_lo = _mm256_mullo_epi32(val0_16_lo, coef0); - __m256i mul0_16_hi = _mm256_mullo_epi32(val0_16_hi, coef0); - __m256i mul2_16_lo = _mm256_mullo_epi32(val2_16_lo, coef2); - __m256i mul2_16_hi = _mm256_mullo_epi32(val2_16_hi, coef2); + __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, zero_256); + __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, zero_256); + __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, zero_256); + __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, zero_256); + __m256i val0_16_lo = _mm256_unpacklo_epi16(val0_16, zero_256); + __m256i val0_16_hi = _mm256_unpackhi_epi16(val0_16, zero_256); + __m256i val2_16_lo = _mm256_unpacklo_epi16(val2_16, zero_256); + __m256i val2_16_hi = _mm256_unpackhi_epi16(val2_16, zero_256); + + __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0_256); + __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0_256); + __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2_256); + __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2_256); + __m256i mul0_16_lo = _mm256_mullo_epi32(val0_16_lo, coef0_256); + __m256i mul0_16_hi = _mm256_mullo_epi32(val0_16_hi, coef0_256); + __m256i mul2_16_lo = _mm256_mullo_epi32(val2_16_lo, coef2_256); + __m256i mul2_16_hi = _mm256_mullo_epi32(val2_16_hi, coef2_256); __m256i hadd0_0 = _mm256_hadd_epi32(mul0_0_lo, mul0_0_hi); __m256i 
hadd2_0 = _mm256_hadd_epi32(mul2_0_lo, mul2_0_hi); @@ -162,21 +163,18 @@ void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, { int sx = dx * 2; - __m256i coef0 = _mm256_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); - __m256i coef2 = _mm256_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); - __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); __m256i val2_0 = _mm256_loadu_si256((__m256i*)(S + sx + 1)); - __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, _mm256_setzero_si256()); - __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, _mm256_setzero_si256()); - __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, _mm256_setzero_si256()); - __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, _mm256_setzero_si256()); + __m256i val0_0_lo = _mm256_unpacklo_epi16(val0_0, zero_256); + __m256i val0_0_hi = _mm256_unpackhi_epi16(val0_0, zero_256); + __m256i val2_0_lo = _mm256_unpacklo_epi16(val2_0, zero_256); + __m256i val2_0_hi = _mm256_unpackhi_epi16(val2_0, zero_256); - __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0); - __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0); - __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2); - __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2); + __m256i mul0_0_lo = _mm256_mullo_epi32(val0_0_lo, coef0_256); + __m256i mul0_0_hi = _mm256_mullo_epi32(val0_0_hi, coef0_256); + __m256i mul2_0_lo = _mm256_mullo_epi32(val2_0_lo, coef2_256); + __m256i mul2_0_hi = _mm256_mullo_epi32(val2_0_hi, coef2_256); __m256i hadd0_0 = _mm256_hadd_epi32(mul0_0_lo, mul0_0_hi); __m256i hadd2_0 = _mm256_hadd_epi32(mul2_0_lo, mul2_0_hi); @@ -189,13 +187,10 @@ void hbd_hresize_avx2(const unsigned short **src, int **dst, int count, { int sx = dx * 2; - __m256i coef0 = _mm256_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); - __m256i coef2 = _mm256_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); - __m256i val0_0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(S + sx - 1))); __m256i val2_0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(S + sx + 1))); - __m256i mul0_0 = _mm256_mullo_epi32(val0_0, coef0); - __m256i mul2_0 = _mm256_mullo_epi32(val2_0, coef2); + __m256i mul0_0 = _mm256_mullo_epi32(val0_0, coef0_256); + __m256i mul2_0 = _mm256_mullo_epi32(val2_0, coef2_256); __m256i hadd0 = _mm256_hadd_epi32(mul0_0, mul0_0); __m256i hadd2 = _mm256_hadd_epi32(mul2_0, mul2_0); @@ -238,20 +233,25 @@ void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, i const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; int bits = 22; - __m256i delta = _mm256_set1_epi64x(1 << (bits - 1)); - __m256i max_char = _mm256_set1_epi64x(((1 << bitdepth) - 1)); - __m128i max_char_128 = _mm_set1_epi32(((1 << bitdepth) - 1)); + + __m256i delta_256 = _mm256_set1_epi64x(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi64x(((1 << bitdepth) - 1)); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i perm_256 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i max_char_128 = _mm_set1_epi64x(((1 << bitdepth) - 1)); __m128i delta_128 = _mm_set1_epi64x(1 << (bits - 1)); - __m256i coef0 = _mm256_set1_epi32(beta[0]); - __m256i coef1 = _mm256_set1_epi32(beta[1]); __m128i coef0_128 = _mm_set1_epi32(beta[0]); __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 
= _mm_setzero_si128(); int width_16 = width - (width % 16); int width_8 = width - (width % 8); int width_4 = width - (width % 4); - int x; - for (x = 0; x < width_16; x+=16) + int x = 0; + for (; x < width_16; x+=16) { __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); @@ -263,37 +263,25 @@ void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, i __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); - __m256i src0_4 = _mm256_permutevar8x32_epi32(src0_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src1_4 = _mm256_permutevar8x32_epi32(src1_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src2_4 = _mm256_permutevar8x32_epi32(src2_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src3_4 = _mm256_permutevar8x32_epi32(src3_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - - __m256i src0_12 = _mm256_permutevar8x32_epi32(src0_8, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src1_12 = _mm256_permutevar8x32_epi32(src1_8, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src2_12 = _mm256_permutevar8x32_epi32(src2_8, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src3_12 = _mm256_permutevar8x32_epi32(src3_8, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - - // 0 2 4 6 - __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0); - __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1); - __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1); - __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0); - - // 1 3 5 7 - __m256i mul0_4 = _mm256_mul_epi32(src0_4, coef0); - __m256i mul1_4 = _mm256_mul_epi32(src1_4, coef1); - __m256i mul2_4 = _mm256_mul_epi32(src2_4, coef1); - __m256i mul3_4 = _mm256_mul_epi32(src3_4, coef0); - - __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0); - __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1); - __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1); - __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0); - - __m256i mul0_12 = _mm256_mul_epi32(src0_12, coef0); - __m256i mul1_12 = _mm256_mul_epi32(src1_12, coef1); - __m256i mul2_12 = _mm256_mul_epi32(src2_12, coef1); - __m256i mul3_12 = _mm256_mul_epi32(src3_12, coef0); + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srli_si256(src0_0, 4), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srli_si256(src1_0, 4), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srli_si256(src2_0, 4), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srli_si256(src3_0, 4), coef0_256); + + __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0_256); + + __m256i mul0_12 = _mm256_mul_epi32(_mm256_srli_si256(src0_8, 4), coef0_256); + __m256i mul1_12 = _mm256_mul_epi32(_mm256_srli_si256(src1_8, 4), coef1_256); + __m256i mul2_12 = _mm256_mul_epi32(_mm256_srli_si256(src2_8, 4), coef1_256); + __m256i mul3_12 = _mm256_mul_epi32(_mm256_srli_si256(src3_8, 4), coef0_256); __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); @@ -309,50 +297,31 @@ void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, i __m256i accum_0123_8 = 
_mm256_add_epi64(accum_01_8, accum_23_8); __m256i accum_0123_12 = _mm256_add_epi64(accum_01_12, accum_23_12); - accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta); - accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta); - accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta); - accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta); + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta_256); + accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta_256); shift22_64b_signExt(accum_0123_0, accum_0123_0); shift22_64b_signExt(accum_0123_4, accum_0123_4); shift22_64b_signExt(accum_0123_8, accum_0123_8); shift22_64b_signExt(accum_0123_12, accum_0123_12); - __m256i lt_0 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), accum_0123_0); - __m256i lt_4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), accum_0123_4); - __m256i lt_8 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), accum_0123_8); - __m256i lt_12 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), accum_0123_12); - - accum_0123_0 = _mm256_andnot_si256(lt_0, accum_0123_0); - accum_0123_4 = _mm256_andnot_si256(lt_4, accum_0123_4); - accum_0123_8 = _mm256_andnot_si256(lt_8, accum_0123_8); - accum_0123_12 = _mm256_andnot_si256(lt_12, accum_0123_12); - - __m256i gt_255_0 = _mm256_cmpgt_epi32(accum_0123_0, max_char); - __m256i gt_255_4 = _mm256_cmpgt_epi32(accum_0123_4, max_char); - __m256i gt_255_8 = _mm256_cmpgt_epi32(accum_0123_8, max_char); - __m256i gt_255_12 = _mm256_cmpgt_epi32(accum_0123_12, max_char); + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi32(accum_0123_4, zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_12 = _mm256_max_epi32(accum_0123_12, zero_256); - __m256i add_255_0 = _mm256_and_si256(gt_255_0, max_char); - __m256i add_255_4 = _mm256_and_si256(gt_255_4, max_char); - __m256i add_255_8 = _mm256_and_si256(gt_255_8, max_char); - __m256i add_255_12 = _mm256_and_si256(gt_255_12, max_char); - - accum_0123_0 = _mm256_andnot_si256(gt_255_0, accum_0123_0); - accum_0123_4 = _mm256_andnot_si256(gt_255_4, accum_0123_4); - accum_0123_8 = _mm256_andnot_si256(gt_255_8, accum_0123_8); - accum_0123_12 = _mm256_andnot_si256(gt_255_12, accum_0123_12); - - accum_0123_0 = _mm256_add_epi32(accum_0123_0, add_255_0); - accum_0123_4 = _mm256_add_epi32(accum_0123_4, add_255_4); - accum_0123_8 = _mm256_add_epi32(accum_0123_8, add_255_8); - accum_0123_12 = _mm256_add_epi32(accum_0123_12, add_255_12); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi32(accum_0123_4, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + accum_0123_12 = _mm256_min_epi32(accum_0123_12, max_char_256); accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); accum_0123_8 = _mm256_or_si256(accum_0123_8, _mm256_slli_epi32(accum_0123_12, 16)); accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi64(accum_0123_8, 32)); - accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm_256); _mm256_storeu_si256((__m256i*)(dst + x), accum_0123_0); } @@ -363,85 +332,56 @@ void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, i __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); - __m256i src0_4 = 
_mm256_permutevar8x32_epi32(src0_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src1_4 = _mm256_permutevar8x32_epi32(src1_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src2_4 = _mm256_permutevar8x32_epi32(src2_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); - __m256i src3_4 = _mm256_permutevar8x32_epi32(src3_0, _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1)); + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); - // 0 2 4 6 - __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0); - __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1); - __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1); - __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0); - - // 1 3 5 7 - __m256i mul0_4 = _mm256_mul_epi32(src0_4, coef0); - __m256i mul1_4 = _mm256_mul_epi32(src1_4, coef1); - __m256i mul2_4 = _mm256_mul_epi32(src2_4, coef1); - __m256i mul3_4 = _mm256_mul_epi32(src3_4, coef0); + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srli_si256(src0_0, 4), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srli_si256(src1_0, 4), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srli_si256(src2_0, 4), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srli_si256(src3_0, 4), coef0_256); __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); - __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); - accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta); - accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta); + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); shift22_64b_signExt(accum_0123_0, accum_0123_0); shift22_64b_signExt(accum_0123_4, accum_0123_4); - __m256i lt_0 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), accum_0123_0); - __m256i lt_4 = _mm256_cmpgt_epi64(_mm256_setzero_si256(), accum_0123_4); - - accum_0123_0 = _mm256_andnot_si256(lt_0, accum_0123_0); - accum_0123_4 = _mm256_andnot_si256(lt_4, accum_0123_4); - - __m256i gt_255_0 = _mm256_cmpgt_epi32(accum_0123_0, max_char); - __m256i gt_255_4 = _mm256_cmpgt_epi32(accum_0123_4, max_char); + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi32(accum_0123_4, zero_256); - __m256i add_255_0 = _mm256_and_si256(gt_255_0, max_char); - __m256i add_255_4 = _mm256_and_si256(gt_255_4, max_char); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi32(accum_0123_4, max_char_256); - accum_0123_0 = _mm256_andnot_si256(gt_255_0, accum_0123_0); - accum_0123_4 = _mm256_andnot_si256(gt_255_4, accum_0123_4); - - accum_0123_0 = _mm256_add_epi32(accum_0123_0, add_255_0); - accum_0123_4 = _mm256_add_epi32(accum_0123_4, add_255_4); - - // 0 1 x x 2 3 x x 4 5 x x 6 7 x x - accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); - __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0))); + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, perm_256)); _mm_storeu_si128((__m128i*)(dst + x), accum); } for (; x < width_4; x+=4) { - 
// 0 1 2 3 __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); - __m128i src0_4 = _mm_srli_si128(src0_0, 4); - __m128i src1_4 = _mm_srli_si128(src1_0, 4); - __m128i src2_4 = _mm_srli_si128(src2_0, 4); - __m128i src3_4 = _mm_srli_si128(src3_0, 4); - - // 0 2 __m128i mul0_0 = _mm_mul_epi32(src0_0, coef0_128); __m128i mul1_0 = _mm_mul_epi32(src1_0, coef1_128); __m128i mul2_0 = _mm_mul_epi32(src2_0, coef1_128); __m128i mul3_0 = _mm_mul_epi32(src3_0, coef0_128); - // 1 3 - __m128i mul0_4 = _mm_mul_epi32(src0_4, coef0_128); - __m128i mul1_4 = _mm_mul_epi32(src1_4, coef1_128); - __m128i mul2_4 = _mm_mul_epi32(src2_4, coef1_128); - __m128i mul3_4 = _mm_mul_epi32(src3_4, coef0_128); + __m128i mul0_4 = _mm_mul_epi32(_mm_srli_si128(src0_0, 4), coef0_128); + __m128i mul1_4 = _mm_mul_epi32(_mm_srli_si128(src1_0, 4), coef1_128); + __m128i mul2_4 = _mm_mul_epi32(_mm_srli_si128(src2_0, 4), coef1_128); + __m128i mul3_4 = _mm_mul_epi32(_mm_srli_si128(src3_0, 4), coef0_128); __m128i accum_01_0 = _mm_add_epi64(mul0_0, mul1_0); __m128i accum_23_0 = _mm_add_epi64(mul2_0, mul3_0); @@ -449,33 +389,19 @@ void hbd_vresize_avx2(const int **src, unsigned short *dst, const short *beta, i __m128i accum_23_4 = _mm_add_epi64(mul2_4, mul3_4); __m128i accum_0123_0 = _mm_add_epi64(accum_01_0, accum_23_0); __m128i accum_0123_4 = _mm_add_epi64(accum_01_4, accum_23_4); - accum_0123_0 = _mm_add_epi64(accum_0123_0, delta_128); accum_0123_4 = _mm_add_epi64(accum_0123_4, delta_128); shift22_64b_signExt_128(accum_0123_0, accum_0123_0); shift22_64b_signExt_128(accum_0123_4, accum_0123_4); - __m128i lt_0 = _mm_cmpgt_epi64(_mm_setzero_si128(), accum_0123_0); - __m128i lt_4 = _mm_cmpgt_epi64(_mm_setzero_si128(), accum_0123_4); - - accum_0123_0 = _mm_andnot_si128(lt_0, accum_0123_0); - accum_0123_4 = _mm_andnot_si128(lt_4, accum_0123_4); - - __m128i gt_255_0 = _mm_cmpgt_epi32(accum_0123_0, max_char_128); - __m128i gt_255_4 = _mm_cmpgt_epi32(accum_0123_4, max_char_128); - - __m128i add_255_0 = _mm_and_si128(gt_255_0, max_char_128); - __m128i add_255_4 = _mm_and_si128(gt_255_4, max_char_128); - - accum_0123_0 = _mm_andnot_si128(gt_255_0, accum_0123_0); - accum_0123_4 = _mm_andnot_si128(gt_255_4, accum_0123_4); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_4 = _mm_max_epi32(accum_0123_4, zero_128); - accum_0123_0 = _mm_add_epi32(accum_0123_0, add_255_0); - accum_0123_4 = _mm_add_epi32(accum_0123_4, add_255_4); + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + accum_0123_4 = _mm_min_epi32(accum_0123_4, max_char_128); - // 0 1 x x 2 3 x x accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_slli_epi32(accum_0123_4, 16)); - // 0 1 2 3 x x x x accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_srli_si128(accum_0123_0, 4)); _mm_storel_epi64((__m128i*)(dst + x), accum_0123_0); diff --git a/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c new file mode 100644 index 000000000..47f406564 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/hbd_resizer_avx512.c @@ -0,0 +1,583 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include + +#include "resizer_avx512.h" + + +#define shift22_64b_signExt_512(a, r)\ +{ \ + r = _mm512_add_epi64( _mm512_srli_epi64(a, 22) , _mm512_and_si512(a, _mm512_set1_epi64(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_256(a, r)\ +{ \ + r = _mm256_add_epi64( _mm256_srli_epi64(a, 22) , _mm256_and_si256(a, _mm256_set1_epi64x(0xFFFFFC0000000000)));\ +} + +#define shift22_64b_signExt_128(a, r)\ +{ \ + r = _mm_add_epi64( _mm_srli_epi64(a, 22) , _mm_and_si128(a, _mm_set1_epi64x(0xFFFFFC0000000000)));\ +} + +const int HBD_INTER_RESIZE_COEF_SCALE_avx512 = 2048; +static const int HBD_MAX_ESIZE_avx512 = 16; + +#define CLIP3(X, MIN, MAX) ((X < MIN) ? MIN : (X > MAX) ? MAX \ + : X) +#define MAX(LEFT, RIGHT) (LEFT > RIGHT ? LEFT : RIGHT) +#define MIN(LEFT, RIGHT) (LEFT < RIGHT ? LEFT : RIGHT) + +// enabled by default for funque since resize factor is always 0.5, disabled otherwise +//#define OPTIMISED_COEFF 1 + +//#define USE_C_VRESIZE 0 + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hbd_hresize_avx512(const unsigned short **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hbd_hresize_avx512(const unsigned short **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + __m512i idx_extract_ab_512 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_extract_cd_512 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i coef0_512 = _mm512_set_epi32(alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0], alpha[1], alpha[0]); + __m512i coef2_512 = _mm512_set_epi32(alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2], alpha[3], alpha[2]); + + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + for (int k = 0; k < count; k++) + { + const unsigned short *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; 
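+                // Each iteration of this block produces 32 horizontal outputs: the 16-bit
+                // source pixels are widened to 32 bits, multiplied by the interleaved
+                // 4-tap coefficients (coef0_512 holds alpha[0]/alpha[1], coef2_512 holds
+                // alpha[2]/alpha[3]), and the partial sums are regrouped with
+                // _mm512_permutex2var_epi32 before the final add.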
+#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... +#endif + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + __m512i val32 = _mm512_loadu_si512((__m512i*)(S + sx - 1 + 32)); + __m512i val34 = _mm512_loadu_si512((__m512i*)(S + sx + 1 + 32)); + + __m512i val0_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val0)); + __m512i val0_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val0, 1)); + __m512i val2_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val2)); + __m512i val2_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val2, 1)); + __m512i val32_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val32)); + __m512i val32_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val32, 1)); + __m512i val34_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val34)); + __m512i val34_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val34, 1)); + + __m512i mul0_lo = _mm512_mullo_epi32(val0_lo, coef0_512); + __m512i mul0_hi = _mm512_mullo_epi32(val0_hi, coef0_512); + __m512i mul2_lo = _mm512_mullo_epi32(val2_lo, coef2_512); + __m512i mul2_hi = _mm512_mullo_epi32(val2_hi, coef2_512); + + __m512i mul32_lo = _mm512_mullo_epi32(val32_lo, coef0_512); + __m512i mul32_hi = _mm512_mullo_epi32(val32_hi, coef0_512); + __m512i mul34_lo = _mm512_mullo_epi32(val34_lo, coef2_512); + __m512i mul34_hi = _mm512_mullo_epi32(val34_hi, coef2_512); + + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0_lo, mul2_lo); + __m512i ac_bd_0_hi = _mm512_add_epi32(mul0_hi, mul2_hi); + __m512i ac_bd_32_lo = _mm512_add_epi32(mul32_lo, mul34_lo); + __m512i ac_bd_32_hi = _mm512_add_epi32(mul32_hi, mul34_hi); + + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_hi); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_hi); + __m512i ac_32 = _mm512_permutex2var_epi32(ac_bd_32_lo, idx_extract_ab_512, ac_bd_32_hi); + __m512i bd_32 = _mm512_permutex2var_epi32(ac_bd_32_lo, idx_extract_cd_512, ac_bd_32_hi); + + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + __m512i res_32 = _mm512_add_epi32(ac_32, bd_32); + + _mm512_storeu_si512((__m512i*)(D + dx), res_0); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_32); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + + __m512i val0_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val0)); + __m512i val0_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val0, 1)); + __m512i val2_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(val2)); + __m512i val2_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(val2, 1)); + + __m512i mul0_lo = _mm512_mullo_epi32(val0_lo, coef0_512); + __m512i mul0_hi = _mm512_mullo_epi32(val0_hi, coef0_512); + __m512i mul2_lo = _mm512_mullo_epi32(val2_lo, coef2_512); + __m512i mul2_hi = _mm512_mullo_epi32(val2_hi, coef2_512); + + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0_lo, mul2_lo); + __m512i ac_bd_0_hi = _mm512_add_epi32(mul0_hi, mul2_hi); + + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_hi); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_hi); + + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + + _mm512_storeu_si512((__m512i*)(D + dx), res_0); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + __m256i val0_0 = _mm256_loadu_si256((__m256i*)(S + sx - 1)); + __m256i val2_0 = 
_mm256_loadu_si256((__m256i*)(S + sx + 1)); + __m512i val0 = _mm512_cvtepu16_epi32(val0_0); + __m512i val2 = _mm512_cvtepu16_epi32(val2_0); + + __m512i mul0 = _mm512_mullo_epi32(val0, coef0_512); + __m512i mul2 = _mm512_mullo_epi32(val2, coef2_512); + __m512i ac_bd_0_lo = _mm512_add_epi32(mul0, mul2); + __m512i ac_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_ab_512, ac_bd_0_lo); + __m512i bd_0 = _mm512_permutex2var_epi32(ac_bd_0_lo, idx_extract_cd_512, ac_bd_0_lo); + __m512i res_0 = _mm512_add_epi32(ac_0, bd_0); + + _mm256_storeu_si256((__m256i*)(D + dx), _mm512_castsi512_si256(res_0)); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +unsigned short hbd_castOp_avx512(int64_t val, int bitdepth) +{ + int bits = 22; + int SHIFT = bits; + int DELTA = (1 << (bits - 1)); + return CLIP3((val + DELTA) >> SHIFT, 0, ((1 << bitdepth) - 1)); +} + +static int hbd_clip_avx512(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b - 1) : a; +} + +void hbd_vresize_avx512(const int **src, unsigned short *dst, const short *beta, int width, int bitdepth) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m512i delta_512 = _mm512_set1_epi64(1 << (bits - 1)); + __m512i max_char_512 = _mm512_set1_epi64(((1 << bitdepth) - 1)); + __m512i coef0_512 = _mm512_set1_epi32(beta[0]); + __m512i coef1_512 = _mm512_set1_epi32(beta[1]); + __m512i perm_512 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i delta_256 = _mm256_set1_epi64x(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi64x(((1 << bitdepth) - 1)); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i perm_256 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i max_char_128 = _mm_set1_epi64x(((1 << bitdepth) - 1)); + __m128i delta_128 = _mm_set1_epi64x(1 << (bits - 1)); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i zero_128 = _mm_setzero_si128(); + + int width_32 = width - (width % 32); + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_32; x+=32) + { + __m512i src0_0 = _mm512_loadu_si512((__m512i*)(S0 + x)); + __m512i src1_0 = _mm512_loadu_si512((__m512i*)(S1 + x)); + __m512i src2_0 = _mm512_loadu_si512((__m512i*)(S2 + x)); + __m512i src3_0 = _mm512_loadu_si512((__m512i*)(S3 + x)); + + __m512i src0_16 = _mm512_loadu_si512((__m512i*)(S0 + x + 16)); + __m512i src1_16 = _mm512_loadu_si512((__m512i*)(S1 + x + 16)); + __m512i src2_16 = _mm512_loadu_si512((__m512i*)(S2 + x + 16)); + __m512i src3_16 = _mm512_loadu_si512((__m512i*)(S3 + x + 16)); + + __m512i mul0_0 = _mm512_mul_epi32(src0_0, coef0_512); + __m512i mul1_0 = _mm512_mul_epi32(src1_0, coef1_512); + __m512i mul2_0 = _mm512_mul_epi32(src2_0, coef1_512); + __m512i mul3_0 = _mm512_mul_epi32(src3_0, coef0_512); + + __m512i mul0_4 = _mm512_mul_epi32(_mm512_srai_epi64(src0_0, 32), coef0_512); + __m512i mul1_4 = _mm512_mul_epi32(_mm512_srai_epi64(src1_0, 32), coef1_512); + __m512i mul2_4 = _mm512_mul_epi32(_mm512_srai_epi64(src2_0, 32), coef1_512); + __m512i mul3_4 = 
_mm512_mul_epi32(_mm512_srai_epi64(src3_0, 32), coef0_512); + + __m512i mul0_8 = _mm512_mul_epi32(src0_16, coef0_512); + __m512i mul1_8 = _mm512_mul_epi32(src1_16, coef1_512); + __m512i mul2_8 = _mm512_mul_epi32(src2_16, coef1_512); + __m512i mul3_8 = _mm512_mul_epi32(src3_16, coef0_512); + + __m512i mul0_12 = _mm512_mul_epi32(_mm512_srai_epi64(src0_16, 32), coef0_512); + __m512i mul1_12 = _mm512_mul_epi32(_mm512_srai_epi64(src1_16, 32), coef1_512); + __m512i mul2_12 = _mm512_mul_epi32(_mm512_srai_epi64(src2_16, 32), coef1_512); + __m512i mul3_12 = _mm512_mul_epi32(_mm512_srai_epi64(src3_16, 32), coef0_512); + + __m512i accum_01_0 = _mm512_add_epi64(mul0_0, mul1_0); + __m512i accum_23_0 = _mm512_add_epi64(mul2_0, mul3_0); + __m512i accum_01_4 = _mm512_add_epi64(mul0_4, mul1_4); + __m512i accum_23_4 = _mm512_add_epi64(mul2_4, mul3_4); + __m512i accum_01_8 = _mm512_add_epi64(mul0_8, mul1_8); + __m512i accum_23_8 = _mm512_add_epi64(mul2_8, mul3_8); + __m512i accum_01_12 = _mm512_add_epi64(mul0_12, mul1_12); + __m512i accum_23_12 = _mm512_add_epi64(mul2_12, mul3_12); + + __m512i accum_0123_0 = _mm512_add_epi64(accum_01_0, accum_23_0); + __m512i accum_0123_4 = _mm512_add_epi64(accum_01_4, accum_23_4); + __m512i accum_0123_8 = _mm512_add_epi64(accum_01_8, accum_23_8); + __m512i accum_0123_12 = _mm512_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm512_add_epi64(accum_0123_0, delta_512); + accum_0123_4 = _mm512_add_epi64(accum_0123_4, delta_512); + accum_0123_8 = _mm512_add_epi64(accum_0123_8, delta_512); + accum_0123_12 = _mm512_add_epi64(accum_0123_12, delta_512); + + shift22_64b_signExt_512(accum_0123_0, accum_0123_0); + shift22_64b_signExt_512(accum_0123_4, accum_0123_4); + shift22_64b_signExt_512(accum_0123_8, accum_0123_8); + shift22_64b_signExt_512(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm512_max_epi64(accum_0123_0, zero_512); + accum_0123_4 = _mm512_max_epi64(accum_0123_4, zero_512); + accum_0123_8 = _mm512_max_epi64(accum_0123_8, zero_512); + accum_0123_12 = _mm512_max_epi64(accum_0123_12, zero_512); + + accum_0123_0 = _mm512_min_epi64(accum_0123_0, max_char_512); + accum_0123_4 = _mm512_min_epi64(accum_0123_4, max_char_512); + accum_0123_8 = _mm512_min_epi64(accum_0123_8, max_char_512); + accum_0123_12 = _mm512_min_epi64(accum_0123_12, max_char_512); + + accum_0123_0 = _mm512_or_si512(accum_0123_0, _mm512_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm512_or_si512(accum_0123_8, _mm512_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm512_or_si512(accum_0123_0, _mm512_slli_epi64(accum_0123_8, 32)); + accum_0123_0 = _mm512_permutexvar_epi32(perm_512, accum_0123_0); + + _mm512_storeu_si512((__m512i*)(dst + x), accum_0123_0); + } + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srai_epi64(src0_0, 32), coef0_256); + __m256i mul1_4 
= _mm256_mul_epi32(_mm256_srai_epi64(src1_0, 32), coef1_256); + __m256i mul2_4 = _mm256_mul_epi32(_mm256_srai_epi64(src2_0, 32), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srai_epi64(src3_0, 32), coef0_256); + + __m256i mul0_8 = _mm256_mul_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mul_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mul_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mul_epi32(src3_8, coef0_256); + + __m256i mul0_12 = _mm256_mul_epi32(_mm256_srai_epi64(src0_8, 32), coef0_256); + __m256i mul1_12 = _mm256_mul_epi32(_mm256_srai_epi64(src1_8, 32), coef1_256); + __m256i mul2_12 = _mm256_mul_epi32(_mm256_srai_epi64(src2_8, 32), coef1_256); + __m256i mul3_12 = _mm256_mul_epi32(_mm256_srai_epi64(src3_8, 32), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + __m256i accum_01_8 = _mm256_add_epi64(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi64(mul2_8, mul3_8); + __m256i accum_01_12 = _mm256_add_epi64(mul0_12, mul1_12); + __m256i accum_23_12 = _mm256_add_epi64(mul2_12, mul3_12); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + __m256i accum_0123_8 = _mm256_add_epi64(accum_01_8, accum_23_8); + __m256i accum_0123_12 = _mm256_add_epi64(accum_01_12, accum_23_12); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + accum_0123_8 = _mm256_add_epi64(accum_0123_8, delta_256); + accum_0123_12 = _mm256_add_epi64(accum_0123_12, delta_256); + + shift22_64b_signExt_256(accum_0123_0, accum_0123_0); + shift22_64b_signExt_256(accum_0123_4, accum_0123_4); + shift22_64b_signExt_256(accum_0123_8, accum_0123_8); + shift22_64b_signExt_256(accum_0123_12, accum_0123_12); + + accum_0123_0 = _mm256_max_epi64(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi64(accum_0123_4, zero_256); + accum_0123_8 = _mm256_max_epi64(accum_0123_8, zero_256); + accum_0123_12 = _mm256_max_epi64(accum_0123_12, zero_256); + + accum_0123_0 = _mm256_min_epi64(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi64(accum_0123_4, max_char_256); + accum_0123_8 = _mm256_min_epi64(accum_0123_8, max_char_256); + accum_0123_12 = _mm256_min_epi64(accum_0123_12, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + accum_0123_8 = _mm256_or_si256(accum_0123_8, _mm256_slli_epi32(accum_0123_12, 16)); + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi64(accum_0123_8, 32)); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm_256); + + _mm256_storeu_si256((__m256i*)(dst + x), accum_0123_0); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mul_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mul_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mul_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mul_epi32(src3_0, coef0_256); + + __m256i mul0_4 = _mm256_mul_epi32(_mm256_srai_epi64(src0_0, 32), coef0_256); + __m256i mul1_4 = _mm256_mul_epi32(_mm256_srai_epi64(src1_0, 32), coef1_256); + __m256i mul2_4 = 
_mm256_mul_epi32(_mm256_srai_epi64(src2_0, 32), coef1_256); + __m256i mul3_4 = _mm256_mul_epi32(_mm256_srai_epi64(src3_0, 32), coef0_256); + + __m256i accum_01_0 = _mm256_add_epi64(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi64(mul2_0, mul3_0); + __m256i accum_01_4 = _mm256_add_epi64(mul0_4, mul1_4); + __m256i accum_23_4 = _mm256_add_epi64(mul2_4, mul3_4); + + __m256i accum_0123_0 = _mm256_add_epi64(accum_01_0, accum_23_0); + __m256i accum_0123_4 = _mm256_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm256_add_epi64(accum_0123_0, delta_256); + accum_0123_4 = _mm256_add_epi64(accum_0123_4, delta_256); + + shift22_64b_signExt_256(accum_0123_0, accum_0123_0); + shift22_64b_signExt_256(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm256_max_epi64(accum_0123_0, zero_256); + accum_0123_4 = _mm256_max_epi64(accum_0123_4, zero_256); + accum_0123_0 = _mm256_min_epi64(accum_0123_0, max_char_256); + accum_0123_4 = _mm256_min_epi64(accum_0123_4, max_char_256); + + accum_0123_0 = _mm256_or_si256(accum_0123_0, _mm256_slli_epi32(accum_0123_4, 16)); + __m128i accum = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(accum_0123_0, perm_256)); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mul_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mul_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mul_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mul_epi32(src3_0, coef0_128); + + __m128i mul0_4 = _mm_mul_epi32(_mm_srli_si128(src0_0, 4), coef0_128); + __m128i mul1_4 = _mm_mul_epi32(_mm_srli_si128(src1_0, 4), coef1_128); + __m128i mul2_4 = _mm_mul_epi32(_mm_srli_si128(src2_0, 4), coef1_128); + __m128i mul3_4 = _mm_mul_epi32(_mm_srli_si128(src3_0, 4), coef0_128); + + __m128i accum_01_0 = _mm_add_epi64(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi64(mul2_0, mul3_0); + __m128i accum_01_4 = _mm_add_epi64(mul0_4, mul1_4); + __m128i accum_23_4 = _mm_add_epi64(mul2_4, mul3_4); + __m128i accum_0123_0 = _mm_add_epi64(accum_01_0, accum_23_0); + __m128i accum_0123_4 = _mm_add_epi64(accum_01_4, accum_23_4); + + accum_0123_0 = _mm_add_epi64(accum_0123_0, delta_128); + accum_0123_4 = _mm_add_epi64(accum_0123_4, delta_128); + + shift22_64b_signExt_128(accum_0123_0, accum_0123_0); + shift22_64b_signExt_128(accum_0123_4, accum_0123_4); + + accum_0123_0 = _mm_max_epi64(accum_0123_0, zero_128); + accum_0123_4 = _mm_max_epi64(accum_0123_4, zero_128); + accum_0123_0 = _mm_min_epi64(accum_0123_0, max_char_128); + accum_0123_4 = _mm_min_epi64(accum_0123_4, max_char_128); + + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_slli_epi32(accum_0123_4, 16)); + accum_0123_0 = _mm_or_si128(accum_0123_0, _mm_srli_si128(accum_0123_0, 4)); + + _mm_storel_epi64((__m128i*)(dst + x), accum_0123_0); + } + for (; x < width; x++) + dst[x] = hbd_castOp_avx512((int64_t)S0[x] * b0 + (int64_t)S1[x] * b1 + (int64_t)S2[x] * b2 + (int64_t)S3[x] * b3, bitdepth); +} + +#if OPTIMISED_COEFF +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#else +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const 
short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned short *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[HBD_MAX_ESIZE_avx512]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = hbd_clip_avx512(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < HBD_MAX_ESIZE_avx512 && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + // printf("%d ", dy); + +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hbd_hresize_avx512((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx512((const int **)rows, (_dst + dwidth * dy), _beta, dwidth, bitdepth); +#else + if (k0 < ksize) + { + hbd_hresize_avx512((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + hbd_vresize_avx512((const int **)rows, (_dst + dwidth * dy), beta, dwidth, bitdepth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.c b/libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.c new file mode 100644 index 000000000..4ef692e35 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.c @@ -0,0 +1,1035 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "../integer_funque_adm.h" +#include "integer_funque_adm_avx512.h" +#include "integer_funque_adm_avx2.h" +#include "mem.h" +#include "../adm_tools.h" +#include "../integer_funque_filters.h" +#include + +#define cvt_1_16x16_to_2_32x8_512(a_16x16, r_32x8_lo, r_32x8_hi) \ +{ \ + r_32x8_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(a_16x16)); \ + r_32x8_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(a_16x16, 1)); \ +} + +#define cvt_1_16x16_to_2_32x8_256(a_16x16, r_32x8_lo, r_32x8_hi) \ +{ \ + r_32x8_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_16x16)); \ + r_32x8_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_16x16, 1)); \ +} + +#define cvt_1_16x8_to_2_32x4_256(a_16x16, r_32x8_lo, r_32x8_hi) \ +{ \ + r_32x8_lo = _mm_cvtepi16_epi32(a_16x16); \ + r_32x8_hi = _mm_cvtepi16_epi32(_mm_shuffle_epi32(a_16x16, 0x0E)); \ +} + +#define shift15_64b_signExt_512(a, r)\ +{ \ + r = _mm512_add_epi64( _mm512_srli_epi64(a, 15) , _mm512_and_si512(a, _mm512_set1_epi64(0xFFFE000000000000)));\ +} + +#define shift15_64b_signExt_256(a, r)\ +{ \ + r = _mm256_add_epi64( _mm256_srli_epi64(a, 15) , _mm256_and_si256(a, _mm256_set1_epi64x(0xFFFE000000000000)));\ +} + +#define shift15_64b_signExt_128(a, r)\ +{ \ + r = _mm_add_epi64( _mm_srli_epi64(a, 15) , _mm_and_si128(a, _mm_set1_epi64x(0xFFFE000000000000)));\ +} + +void integer_adm_decouple_avx512(i_dwt2buffers ref, i_dwt2buffers dist, + i_dwt2buffers i_dlm_rest, adm_i32_dtype *i_dlm_add, + int32_t *adm_div_lookup, float border_size, double *adm_score_den) +{ + // const float cos_1deg_sq = COS_1DEG_SQ; + + size_t width = ref.width; + size_t height = ref.height; + int i, j, k, index, addIndex,restIndex; + + adm_i16_dtype tmp_val; + int angle_flag; + + adm_i32_dtype ot_dp, o_mag_sq, t_mag_sq; + int border_h = (border_size * height); + int border_w = (border_size * width); + + double den_sum[3] = {0}; + int64_t den_row_sum[3] = {0}; + int64_t col0_ref_cube[3] = {0}; + int loop_h, loop_w, dlm_width; + int extra_sample_h = 0, extra_sample_w = 0; + + adm_i64_dtype den_cube[3] = {0}; + + /** + DLM has the configurability of computing the metric only for the + centre region. currently border_size defines the percentage of pixels to be avoided + from all sides so that size of centre region is defined. 
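+    For example, border_size = 0.1 skips 10% of the rows and 10% of the columns on
+    every side, so roughly the centre 80% x 80% of each band is scored (before the
+    extra_sample adjustment applied below).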
+ */ +#if ADM_REFLECT_PAD + extra_sample_w = 0; + extra_sample_h = 0; +#else + extra_sample_w = 1; + extra_sample_h = 1; + +#endif + + border_h -= extra_sample_h; + border_w -= extra_sample_w; + +#if !ADM_REFLECT_PAD + //If reflect pad is disabled & if border_size is 0, process 1 row,col pixels lesser + border_h = MAX(1,border_h); + border_w = MAX(1,border_w); +#endif + + loop_h = height - border_h; + loop_w = width - border_w; +#if ADM_REFLECT_PAD + int dlm_height = height - (border_h << 1); +#endif + dlm_width = width - (border_w << 1); + + //The width of i_dlm_add buffer will be extra only if padding is enabled + int dlm_add_w = dlm_width + (ADM_REFLECT_PAD << 1); + + int loop_w_32 = loop_w - ((loop_w - border_w) % 32); + int loop_w_16 = loop_w - ((loop_w - border_w) % 16); + int loop_w_8 = loop_w - ((loop_w - border_w) % 8); + + __m512i perm_64_to_32_512 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i packs_32_512 = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + __m512i perm_for_64b_mul_512 = _mm512_set_epi32(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); + __m512i add_16384_512 = _mm512_set1_epi64(16384); + __m512i add_32768_512 = _mm512_set1_epi64(32768); + __m512i add_32768_32b_512 = _mm512_set1_epi32(32768); + __m512i add_16384_32b_512 = _mm512_set1_epi32(16384); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i perm_64_to_32_256 = _mm256_set_epi32(14, 6, 12, 4, 10, 2, 8, 0); + __m256i perm_for_64b_mul_256 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + __m256i add_16384_256 = _mm256_set1_epi64x(16384); + __m256i add_32768_256 = _mm256_set1_epi64x(32768); + __m256i add_32768_32b_256 = _mm256_set1_epi32(32768); + __m256i add_16384_32b_256 = _mm256_set1_epi32(16384); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i perm_64_to_32_128 = _mm_set_epi32(6, 2, 4, 0); + __m128i add_16384_128 = _mm_set1_epi64x(16384); + __m128i add_32768_128 = _mm_set1_epi64x(32768); + __m128i add_32768_32b_128 = _mm_set1_epi32(32768); + __m128i add_16384_32b_128 = _mm_set1_epi32(16384); + __m128i zero_128 = _mm_setzero_si128(); + + for (i = border_h; i < loop_h; i++) + { + if(extra_sample_w) + { + for(k=1; k<4; k++) + { + int16_t ref_abs = abs(ref.bands[k][i*width + border_w]); + col0_ref_cube[k-1] = (int64_t) ref_abs * ref_abs * ref_abs; + } + } + j = border_w; + for (; j < loop_w_32; j+=32) + { + index = i * width + j; + //If padding is enabled the computation of i_dlm_add will be from 1,1 & later padded + addIndex = (i + ADM_REFLECT_PAD - border_h) * (dlm_add_w) + j + ADM_REFLECT_PAD - border_w; + restIndex = (i - border_h) * (dlm_width) + j - border_w; + + __m512i ref_b1_512 = _mm512_loadu_si512((__m512i*)(ref.bands[1] + index)); + __m512i dis_b1_512 = _mm512_loadu_si512((__m512i*)(dist.bands[1] + index)); + __m512i ref_b2_512 = _mm512_loadu_si512((__m512i*)(ref.bands[2] + index)); + __m512i dis_b2_512 = _mm512_loadu_si512((__m512i*)(dist.bands[2] + index)); + + __m512i ref_b1b2_lo = _mm512_unpacklo_epi16(ref_b1_512, ref_b2_512); + __m512i ref_b1b2_hi = _mm512_unpackhi_epi16(ref_b1_512, ref_b2_512); + __m512i dis_b1b2_lo = _mm512_unpacklo_epi16(dis_b1_512, dis_b2_512); + __m512i dis_b1b2_hi = _mm512_unpackhi_epi16(dis_b1_512, dis_b2_512); + + __m512i ot_dp_lo = _mm512_madd_epi16(ref_b1b2_lo, dis_b1b2_lo); + __m512i ot_dp_hi = _mm512_madd_epi16(ref_b1b2_hi, dis_b1b2_hi); + + __m512i o_mag_sq_lo = _mm512_madd_epi16(ref_b1b2_lo, ref_b1b2_lo); + __m512i o_mag_sq_hi = _mm512_madd_epi16(ref_b1b2_hi, ref_b1b2_hi); + + __m512i t_mag_sq_lo = 
_mm512_madd_epi16(dis_b1b2_lo, dis_b1b2_lo); + __m512i t_mag_sq_hi = _mm512_madd_epi16(dis_b1b2_hi, dis_b1b2_hi); + + ot_dp_lo = _mm512_max_epi32(ot_dp_lo, zero_512); + ot_dp_hi = _mm512_max_epi32(ot_dp_hi, zero_512); + + __m512i ot_dp_lo_0 = _mm512_mul_epi32(ot_dp_lo, ot_dp_lo); + __m512i ot_dp_lo_1 = _mm512_mul_epi32(_mm512_srai_epi64(ot_dp_lo, 32), _mm512_srai_epi64(ot_dp_lo, 32)); + __m512i ot_dp_hi_0 = _mm512_mul_epi32(ot_dp_hi, ot_dp_hi); + __m512i ot_dp_hi_1 = _mm512_mul_epi32(_mm512_srai_epi64(ot_dp_hi, 32), _mm512_srai_epi64(ot_dp_hi, 32)); + + __m512i ot_mag_sq_lo_0 = _mm512_mul_epi32(o_mag_sq_lo, t_mag_sq_lo); + __m512i ot_mag_sq_lo_1 = _mm512_mul_epi32(_mm512_srai_epi64(o_mag_sq_lo, 32), _mm512_srai_epi64(t_mag_sq_lo, 32)); + __m512i ot_mag_sq_hi_0 = _mm512_mul_epi32(o_mag_sq_hi, t_mag_sq_hi); + __m512i ot_mag_sq_hi_1 = _mm512_mul_epi32(_mm512_srai_epi64(o_mag_sq_hi, 32), _mm512_srai_epi64(t_mag_sq_hi, 32)); + + __mmask32 angle_mask32 = 0; + for(int a = 0; a < 8; a+=2) + { + int a0 = ((adm_i64_dtype)ot_dp_lo_0[a] >= COS_1DEG_SQ * (adm_i64_dtype)ot_mag_sq_lo_0[a]) << a*4; + int a2 = (ot_dp_lo_0[a + 1] >= COS_1DEG_SQ * ot_mag_sq_lo_0[a + 1]) << (a*4 + 2); + int a1 = (ot_dp_lo_1[a] >= COS_1DEG_SQ * ot_mag_sq_lo_1[a]) << (a*4 + 1); + int a3 = (ot_dp_lo_1[a + 1] >= COS_1DEG_SQ * ot_mag_sq_lo_1[a + 1]) << (a*4 + 3); + int a4 = (ot_dp_hi_0[a] >= COS_1DEG_SQ * ot_mag_sq_hi_0[a]) << (a*4 + 4); + int a6 = (ot_dp_hi_0[a + 1] >= COS_1DEG_SQ * ot_mag_sq_hi_0[a + 1]) << (a*4 + 6); + int a5 = (ot_dp_hi_1[a] >= COS_1DEG_SQ * ot_mag_sq_hi_1[a]) << (a*4 + 5); + int a7 = (ot_dp_hi_1[a + 1] >= COS_1DEG_SQ * ot_mag_sq_hi_1[a + 1]) << (a*4 + 7); + angle_mask32 += a0 + a2 + a1 + a3 + a4 + a6 + a5 + a7; + } + + __m512i dis_b3_512 = _mm512_loadu_si512((__m512i*)(dist.bands[3] + index)); + __m512i ref_b3_512 = _mm512_loadu_si512((__m512i*)(ref.bands[3] + index)); + + __m512i ref_b1_lo, ref_b1_hi, ref_b2_lo, ref_b2_hi, ref_b3_lo, ref_b3_hi; + cvt_1_16x16_to_2_32x8_512(ref_b1_512, ref_b1_lo, ref_b1_hi); + cvt_1_16x16_to_2_32x8_512(ref_b2_512, ref_b2_lo, ref_b2_hi); + cvt_1_16x16_to_2_32x8_512(ref_b3_512, ref_b3_lo, ref_b3_hi); + + __m512i adm_div_b1_lo, adm_div_b1_hi, adm_div_b2_lo, adm_div_b2_hi, adm_div_b3_lo, adm_div_b3_hi; + + adm_div_b1_lo = _mm512_i32gather_epi32(_mm512_add_epi32(ref_b1_lo, add_32768_32b_512), adm_div_lookup, 4); + adm_div_b1_hi = _mm512_i32gather_epi32(_mm512_add_epi32(ref_b1_hi, add_32768_32b_512), adm_div_lookup, 4); + adm_div_b2_lo = _mm512_i32gather_epi32(_mm512_add_epi32(ref_b2_lo, add_32768_32b_512), adm_div_lookup, 4); + adm_div_b2_hi = _mm512_i32gather_epi32(_mm512_add_epi32(ref_b2_hi, add_32768_32b_512), adm_div_lookup, 4); + adm_div_b3_lo = _mm512_i32gather_epi32(_mm512_add_epi32(ref_b3_lo, add_32768_32b_512), adm_div_lookup, 4); + adm_div_b3_hi = _mm512_i32gather_epi32(_mm512_add_epi32(ref_b3_hi, add_32768_32b_512), adm_div_lookup, 4); + + __m512i dis_b1_lo, dis_b1_hi, dis_b2_lo, dis_b2_hi, dis_b3_lo, dis_b3_hi; + cvt_1_16x16_to_2_32x8_512(dis_b1_512, dis_b1_lo, dis_b1_hi); + cvt_1_16x16_to_2_32x8_512(dis_b2_512, dis_b2_lo, dis_b2_hi); + cvt_1_16x16_to_2_32x8_512(dis_b3_512, dis_b3_lo, dis_b3_hi); + + __m512i adm_b1_dis_lo0 = _mm512_mul_epi32(adm_div_b1_lo, dis_b1_lo); + __m512i adm_b1_dis_lo1 = _mm512_mul_epi32(_mm512_srli_epi64(adm_div_b1_lo, 32), _mm512_srli_epi64(dis_b1_lo, 32)); + __m512i adm_b1_dis_hi8 = _mm512_mul_epi32(adm_div_b1_hi, dis_b1_hi); + __m512i adm_b1_dis_hi9 = _mm512_mul_epi32(_mm512_srli_epi64(adm_div_b1_hi, 32), _mm512_srli_epi64(dis_b1_hi, 32)); + 
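An aside on the multiply pattern used throughout this loop: _mm512_mul_epi32 only multiplies the low (even) signed 32-bit lane of each 64-bit element, so the code forms full 32x32-to-64-bit products for every lane by issuing a second multiply after shifting the odd lanes down by 32 bits. A minimal self-contained sketch of the idea; the helper name is illustrative and not part of the patch:

    #include <immintrin.h>

    /* Multiply corresponding signed 32-bit lanes of a and b into 64-bit products:
     * 'even' receives the products of lanes 0,2,4,..., 'odd' the products of lanes 1,3,5,... */
    static inline void mul_i32_lanes_to_i64(__m512i a, __m512i b,
                                            __m512i *even, __m512i *odd)
    {
        *even = _mm512_mul_epi32(a, b);                    /* low 32 bits of each 64-bit lane */
        *odd  = _mm512_mul_epi32(_mm512_srli_epi64(a, 32), /* move odd lanes into the low slots */
                                 _mm512_srli_epi64(b, 32));
    }

The add of 16384 and the shift15_64b_signExt step applied next implement a rounded arithmetic right shift by 15 on each 64-bit product, matching the scalar (adm_div_lookup[ref + 32768] * dist + 16384) >> 15 computation in the remainder loop at the end of this function.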
+ __m512i adm_b2_dis_lo0 = _mm512_mul_epi32(adm_div_b2_lo, dis_b2_lo); + __m512i adm_b2_dis_lo1 = _mm512_mul_epi32(_mm512_srli_epi64(adm_div_b2_lo, 32), _mm512_srli_epi64(dis_b2_lo, 32)); + __m512i adm_b2_dis_hi8 = _mm512_mul_epi32(adm_div_b2_hi, dis_b2_hi); + __m512i adm_b2_dis_hi9 = _mm512_mul_epi32(_mm512_srli_epi64(adm_div_b2_hi, 32), _mm512_srli_epi64(dis_b2_hi, 32)); + + __m512i adm_b3_dis_lo0 = _mm512_mul_epi32(adm_div_b3_lo, dis_b3_lo); + __m512i adm_b3_dis_lo1 = _mm512_mul_epi32(_mm512_srli_epi64(adm_div_b3_lo, 32), _mm512_srli_epi64(dis_b3_lo, 32)); + __m512i adm_b3_dis_hi8 = _mm512_mul_epi32(adm_div_b3_hi, dis_b3_hi); + __m512i adm_b3_dis_hi9 = _mm512_mul_epi32(_mm512_srli_epi64(adm_div_b3_hi, 32), _mm512_srli_epi64(dis_b3_hi, 32)); + + adm_b1_dis_lo0 = _mm512_add_epi64(adm_b1_dis_lo0, add_16384_512); + adm_b1_dis_lo1 = _mm512_add_epi64(adm_b1_dis_lo1, add_16384_512); + adm_b1_dis_hi8 = _mm512_add_epi64(adm_b1_dis_hi8, add_16384_512); + adm_b1_dis_hi9 = _mm512_add_epi64(adm_b1_dis_hi9, add_16384_512); + adm_b2_dis_lo0 = _mm512_add_epi64(adm_b2_dis_lo0, add_16384_512); + adm_b2_dis_lo1 = _mm512_add_epi64(adm_b2_dis_lo1, add_16384_512); + adm_b2_dis_hi8 = _mm512_add_epi64(adm_b2_dis_hi8, add_16384_512); + adm_b2_dis_hi9 = _mm512_add_epi64(adm_b2_dis_hi9, add_16384_512); + adm_b3_dis_lo0 = _mm512_add_epi64(adm_b3_dis_lo0, add_16384_512); + adm_b3_dis_lo1 = _mm512_add_epi64(adm_b3_dis_lo1, add_16384_512); + adm_b3_dis_hi8 = _mm512_add_epi64(adm_b3_dis_hi8, add_16384_512); + adm_b3_dis_hi9 = _mm512_add_epi64(adm_b3_dis_hi9, add_16384_512); + + shift15_64b_signExt_512(adm_b1_dis_lo0, adm_b1_dis_lo0); + shift15_64b_signExt_512(adm_b1_dis_lo1, adm_b1_dis_lo1); + shift15_64b_signExt_512(adm_b1_dis_hi8, adm_b1_dis_hi8); + shift15_64b_signExt_512(adm_b1_dis_hi9, adm_b1_dis_hi9); + shift15_64b_signExt_512(adm_b2_dis_lo0, adm_b2_dis_lo0); + shift15_64b_signExt_512(adm_b2_dis_lo1, adm_b2_dis_lo1); + shift15_64b_signExt_512(adm_b2_dis_hi8, adm_b2_dis_hi8); + shift15_64b_signExt_512(adm_b2_dis_hi9, adm_b2_dis_hi9); + shift15_64b_signExt_512(adm_b3_dis_lo0, adm_b3_dis_lo0); + shift15_64b_signExt_512(adm_b3_dis_lo1, adm_b3_dis_lo1); + shift15_64b_signExt_512(adm_b3_dis_hi8, adm_b3_dis_hi8); + shift15_64b_signExt_512(adm_b3_dis_hi9, adm_b3_dis_hi9); + + __mmask16 eqz_b1_lo = _mm512_cmpeq_epi32_mask(ref_b1_lo, zero_512); + __mmask16 eqz_b1_hi = _mm512_cmpeq_epi32_mask(ref_b1_hi, zero_512); + __mmask16 eqz_b2_lo = _mm512_cmpeq_epi32_mask(ref_b2_lo, zero_512); + __mmask16 eqz_b2_hi = _mm512_cmpeq_epi32_mask(ref_b2_hi, zero_512); + __mmask16 eqz_b3_lo = _mm512_cmpeq_epi32_mask(ref_b3_lo, zero_512); + __mmask16 eqz_b3_hi = _mm512_cmpeq_epi32_mask(ref_b3_hi, zero_512); + + __m512i adm_b1_dis_lo = _mm512_permutex2var_epi32(adm_b1_dis_lo0, perm_64_to_32_512, adm_b1_dis_lo1); + __m512i adm_b1_dis_hi = _mm512_permutex2var_epi32(adm_b1_dis_hi8, perm_64_to_32_512, adm_b1_dis_hi9); + __m512i adm_b2_dis_lo = _mm512_permutex2var_epi32(adm_b2_dis_lo0, perm_64_to_32_512, adm_b2_dis_lo1); + __m512i adm_b2_dis_hi = _mm512_permutex2var_epi32(adm_b2_dis_hi8, perm_64_to_32_512, adm_b2_dis_hi9); + __m512i adm_b3_dis_lo = _mm512_permutex2var_epi32(adm_b3_dis_lo0, perm_64_to_32_512, adm_b3_dis_lo1); + __m512i adm_b3_dis_hi = _mm512_permutex2var_epi32(adm_b3_dis_hi8, perm_64_to_32_512, adm_b3_dis_hi9); + + __m512i tmp_k_b1_lo = _mm512_mask_blend_epi32(eqz_b1_lo, adm_b1_dis_lo, add_32768_512); + __m512i tmp_k_b1_hi = _mm512_mask_blend_epi32(eqz_b1_hi, adm_b1_dis_hi, add_32768_512); + __m512i tmp_k_b2_lo = 
_mm512_mask_blend_epi32(eqz_b2_lo, adm_b2_dis_lo, add_32768_512); + __m512i tmp_k_b2_hi = _mm512_mask_blend_epi32(eqz_b2_hi, adm_b2_dis_hi, add_32768_512); + __m512i tmp_k_b3_lo = _mm512_mask_blend_epi32(eqz_b3_lo, adm_b3_dis_lo, add_32768_512); + __m512i tmp_k_b3_hi = _mm512_mask_blend_epi32(eqz_b3_hi, adm_b3_dis_hi, add_32768_512); + + tmp_k_b1_lo = _mm512_max_epi32(tmp_k_b1_lo, zero_512); + tmp_k_b1_hi = _mm512_max_epi32(tmp_k_b1_hi, zero_512); + tmp_k_b2_lo = _mm512_max_epi32(tmp_k_b2_lo, zero_512); + tmp_k_b2_hi = _mm512_max_epi32(tmp_k_b2_hi, zero_512); + tmp_k_b3_lo = _mm512_max_epi32(tmp_k_b3_lo, zero_512); + tmp_k_b3_hi = _mm512_max_epi32(tmp_k_b3_hi, zero_512); + + tmp_k_b1_lo = _mm512_min_epi32(tmp_k_b1_lo, add_32768_32b_512); + tmp_k_b1_hi = _mm512_min_epi32(tmp_k_b1_hi, add_32768_32b_512); + tmp_k_b2_lo = _mm512_min_epi32(tmp_k_b2_lo, add_32768_32b_512); + tmp_k_b2_hi = _mm512_min_epi32(tmp_k_b2_hi, add_32768_32b_512); + tmp_k_b3_lo = _mm512_min_epi32(tmp_k_b3_lo, add_32768_32b_512); + tmp_k_b3_hi = _mm512_min_epi32(tmp_k_b3_hi, add_32768_32b_512); + + __m512i tmp_val_b1_lo = _mm512_mullo_epi32(tmp_k_b1_lo, ref_b1_lo); + __m512i tmp_val_b1_hi = _mm512_mullo_epi32(tmp_k_b1_hi, ref_b1_hi); + __m512i tmp_val_b2_lo = _mm512_mullo_epi32(tmp_k_b2_lo, ref_b2_lo); + __m512i tmp_val_b2_hi = _mm512_mullo_epi32(tmp_k_b2_hi, ref_b2_hi); + __m512i tmp_val_b3_lo = _mm512_mullo_epi32(tmp_k_b3_lo, ref_b3_lo); + __m512i tmp_val_b3_hi = _mm512_mullo_epi32(tmp_k_b3_hi, ref_b3_hi); + + tmp_val_b1_lo = _mm512_add_epi32(tmp_val_b1_lo, add_16384_32b_512); + tmp_val_b1_hi = _mm512_add_epi32(tmp_val_b1_hi, add_16384_32b_512); + tmp_val_b2_lo = _mm512_add_epi32(tmp_val_b2_lo, add_16384_32b_512); + tmp_val_b3_lo = _mm512_add_epi32(tmp_val_b3_lo, add_16384_32b_512); + tmp_val_b2_hi = _mm512_add_epi32(tmp_val_b2_hi, add_16384_32b_512); + tmp_val_b3_hi = _mm512_add_epi32(tmp_val_b3_hi, add_16384_32b_512); + + tmp_val_b1_lo = _mm512_srai_epi32(tmp_val_b1_lo, 15); + tmp_val_b1_hi = _mm512_srai_epi32(tmp_val_b1_hi, 15); + tmp_val_b2_lo = _mm512_srai_epi32(tmp_val_b2_lo, 15); + tmp_val_b2_hi = _mm512_srai_epi32(tmp_val_b2_hi, 15); + tmp_val_b3_lo = _mm512_srai_epi32(tmp_val_b3_lo, 15); + tmp_val_b3_hi = _mm512_srai_epi32(tmp_val_b3_hi, 15); + + __m512i tmp_val_b1 = _mm512_packs_epi32(tmp_val_b1_lo, tmp_val_b1_hi); + __m512i tmp_val_b2 = _mm512_packs_epi32(tmp_val_b2_lo, tmp_val_b2_hi); + __m512i tmp_val_b3 = _mm512_packs_epi32(tmp_val_b3_lo, tmp_val_b3_hi); + + tmp_val_b1 = _mm512_permutexvar_epi64(packs_32_512, tmp_val_b1); + tmp_val_b2 = _mm512_permutexvar_epi64(packs_32_512, tmp_val_b2); + tmp_val_b3 = _mm512_permutexvar_epi64(packs_32_512, tmp_val_b3); + + __m512i dlm_rest_b1_512 = _mm512_mask_blend_epi16(angle_mask32, tmp_val_b1, dis_b1_512); + __m512i dlm_rest_b2_512 = _mm512_mask_blend_epi16(angle_mask32, tmp_val_b2, dis_b2_512); + __m512i dlm_rest_b3_512 = _mm512_mask_blend_epi16(angle_mask32, tmp_val_b3, dis_b3_512); + + __m512i dist_m_dlm_rest_b1 = _mm512_abs_epi16(_mm512_sub_epi16(dis_b1_512, dlm_rest_b1_512)); + __m512i dist_m_dlm_rest_b2 = _mm512_abs_epi16(_mm512_sub_epi16(dis_b2_512, dlm_rest_b2_512)); + __m512i dlm_add_512 = _mm512_adds_epu16(dist_m_dlm_rest_b1, dist_m_dlm_rest_b2); + __m512i dist_m_dlm_rest_b3 = _mm512_abs_epi16(_mm512_sub_epi16(dis_b3_512, dlm_rest_b3_512)); + dlm_add_512 = _mm512_adds_epu16(dlm_add_512, dist_m_dlm_rest_b3); + + _mm512_storeu_si512((__m512i*)(i_dlm_rest.bands[1] + restIndex), dlm_rest_b1_512); + _mm512_storeu_si512((__m512i*)(i_dlm_rest.bands[2] + restIndex), 
dlm_rest_b2_512); + _mm512_storeu_si512((__m512i*)(i_dlm_rest.bands[3] + restIndex), dlm_rest_b3_512); + + __m512i dlm_add_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(dlm_add_512)); + __m512i dlm_add_hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(dlm_add_512, 1)); + + ref_b1_lo = _mm512_abs_epi32(ref_b1_lo); + ref_b1_hi = _mm512_abs_epi32(ref_b1_hi); + ref_b2_lo = _mm512_abs_epi32(ref_b2_lo); + ref_b2_hi = _mm512_abs_epi32(ref_b2_hi); + ref_b3_lo = _mm512_abs_epi32(ref_b3_lo); + ref_b3_hi = _mm512_abs_epi32(ref_b3_hi); + + _mm512_storeu_si512((__m512i*)(i_dlm_add + addIndex), dlm_add_lo); + _mm512_storeu_si512((__m512i*)(i_dlm_add + addIndex + 16), dlm_add_hi); + + __m512i ref_b_ref_b1_lo = _mm512_mullo_epi32(ref_b1_lo, ref_b1_lo); + __m512i ref_b_ref_b1_hi = _mm512_mullo_epi32(ref_b1_hi, ref_b1_hi); + __m512i ref_b_ref_b2_lo = _mm512_mullo_epi32(ref_b2_lo, ref_b2_lo); + __m512i ref_b_ref_b2_hi = _mm512_mullo_epi32(ref_b2_hi, ref_b2_hi); + __m512i ref_b_ref_b3_lo = _mm512_mullo_epi32(ref_b3_lo, ref_b3_lo); + __m512i ref_b_ref_b3_hi = _mm512_mullo_epi32(ref_b3_hi, ref_b3_hi); + + ref_b_ref_b1_lo = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b_ref_b1_lo); + ref_b1_lo = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b1_lo); + ref_b_ref_b1_hi = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b_ref_b1_hi); + ref_b1_hi = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b1_hi); + ref_b_ref_b2_lo = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b_ref_b2_lo); + ref_b2_lo = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b2_lo); + ref_b_ref_b2_hi = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b_ref_b2_hi); + ref_b2_hi = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b2_hi); + ref_b_ref_b3_lo = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b_ref_b3_lo); + ref_b3_lo = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b3_lo); + ref_b_ref_b3_hi = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b_ref_b3_hi); + ref_b3_hi = _mm512_permutexvar_epi32(perm_for_64b_mul_512, ref_b3_hi); + + __m512i ref_b_ref_b1_lo0 = _mm512_mul_epi32(ref_b_ref_b1_lo, ref_b1_lo); + __m512i ref_b_ref_b1_lo1 = _mm512_mul_epi32(_mm512_srli_epi64(ref_b_ref_b1_lo, 32), _mm512_srli_epi64(ref_b1_lo, 32)); + __m512i ref_b_ref_b1_hi0 = _mm512_mul_epi32(ref_b_ref_b1_hi, ref_b1_hi); + __m512i ref_b_ref_b1_hi1 = _mm512_mul_epi32(_mm512_srli_epi64(ref_b_ref_b1_hi, 32), _mm512_srli_epi64(ref_b1_hi, 32)); + __m512i ref_b_ref_b2_lo0 = _mm512_mul_epi32(ref_b_ref_b2_lo, ref_b2_lo); + __m512i ref_b_ref_b2_lo1 = _mm512_mul_epi32(_mm512_srli_epi64(ref_b_ref_b2_lo, 32), _mm512_srli_epi64(ref_b2_lo, 32)); + __m512i ref_b_ref_b2_hi0 = _mm512_mul_epi32(ref_b_ref_b2_hi, ref_b2_hi); + __m512i ref_b_ref_b2_hi1 = _mm512_mul_epi32(_mm512_srli_epi64(ref_b_ref_b2_hi, 32), _mm512_srli_epi64(ref_b2_hi, 32)); + __m512i ref_b_ref_b3_lo0 = _mm512_mul_epi32(ref_b_ref_b3_lo, ref_b3_lo); + __m512i ref_b_ref_b3_lo1 = _mm512_mul_epi32(_mm512_srli_epi64(ref_b_ref_b3_lo, 32), _mm512_srli_epi64(ref_b3_lo, 32)); + __m512i ref_b_ref_b3_hi0 = _mm512_mul_epi32(ref_b_ref_b3_hi, ref_b3_hi); + __m512i ref_b_ref_b3_hi1 = _mm512_mul_epi32(_mm512_srli_epi64(ref_b_ref_b3_hi, 32), _mm512_srli_epi64(ref_b3_hi, 32)); + + __m512i b1_r8_lo = _mm512_add_epi64(ref_b_ref_b1_lo0, ref_b_ref_b1_lo1); + __m512i b1_r8_hi = _mm512_add_epi64(ref_b_ref_b1_hi0, ref_b_ref_b1_hi1); + __m512i b2_r8_lo = _mm512_add_epi64(ref_b_ref_b2_lo0, ref_b_ref_b2_lo1); + __m512i b2_r8_hi = _mm512_add_epi64(ref_b_ref_b2_hi0, ref_b_ref_b2_hi1); + __m512i b3_r8_lo = 
_mm512_add_epi64(ref_b_ref_b3_lo0, ref_b_ref_b3_lo1); + __m512i b3_r8_hi = _mm512_add_epi64(ref_b_ref_b3_hi0, ref_b_ref_b3_hi1); + __m512i b1_r8 = _mm512_add_epi64(b1_r8_lo, b1_r8_hi); + __m512i b2_r8 = _mm512_add_epi64(b2_r8_lo, b2_r8_hi); + __m512i b3_r8 = _mm512_add_epi64(b3_r8_lo, b3_r8_hi); + + __m256i b1_r4 = _mm256_add_epi64(_mm512_castsi512_si256(b1_r8), _mm512_extracti64x4_epi64(b1_r8, 1)); + __m256i b2_r4 = _mm256_add_epi64(_mm512_castsi512_si256(b2_r8), _mm512_extracti64x4_epi64(b2_r8, 1)); + __m256i b3_r4 = _mm256_add_epi64(_mm512_castsi512_si256(b3_r8), _mm512_extracti64x4_epi64(b3_r8, 1)); + + __m128i b1_r2 = _mm_add_epi64(_mm256_castsi256_si128(b1_r4), _mm256_extractf128_si256(b1_r4, 1)); + __m128i b2_r2 = _mm_add_epi64(_mm256_castsi256_si128(b2_r4), _mm256_extractf128_si256(b2_r4, 1)); + __m128i b3_r2 = _mm_add_epi64(_mm256_castsi256_si128(b3_r4), _mm256_extractf128_si256(b3_r4, 1)); + + int64_t r_b1 = _mm_extract_epi64(b1_r2, 0) + _mm_extract_epi64(b1_r2, 1); + int64_t r_b2 = _mm_extract_epi64(b2_r2, 0) + _mm_extract_epi64(b2_r2, 1); + int64_t r_b3 = _mm_extract_epi64(b3_r2, 0) + _mm_extract_epi64(b3_r2, 1); + + den_row_sum[0] += r_b1; + den_row_sum[1] += r_b2; + den_row_sum[2] += r_b3; + } + + for (; j < loop_w_16; j+=16) + { + index = i * width + j; + + //If padding is enabled the computation of i_dlm_add will be from 1,1 & later padded + addIndex = (i + ADM_REFLECT_PAD - border_h) * (dlm_add_w) + j + ADM_REFLECT_PAD - border_w; + restIndex = (i - border_h) * (dlm_width) + j - border_w; + + __m256i ref_b1_256 = _mm256_loadu_si256((__m256i*)(ref.bands[1] + index)); + __m256i dis_b1_256 = _mm256_loadu_si256((__m256i*)(dist.bands[1] + index)); + __m256i ref_b2_256 = _mm256_loadu_si256((__m256i*)(ref.bands[2] + index)); + __m256i dis_b2_256 = _mm256_loadu_si256((__m256i*)(dist.bands[2] + index)); + + __m256i ref_b1b2_lo = _mm256_unpacklo_epi16(ref_b1_256, ref_b2_256); + __m256i ref_b1b2_hi = _mm256_unpackhi_epi16(ref_b1_256, ref_b2_256); + __m256i dis_b1b2_lo = _mm256_unpacklo_epi16(dis_b1_256, dis_b2_256); + __m256i dis_b1b2_hi = _mm256_unpackhi_epi16(dis_b1_256, dis_b2_256); + + __m256i ot_dp_lo = _mm256_madd_epi16(ref_b1b2_lo, dis_b1b2_lo); + __m256i ot_dp_hi = _mm256_madd_epi16(ref_b1b2_hi, dis_b1b2_hi); + + __m256i o_mag_sq_lo = _mm256_madd_epi16(ref_b1b2_lo, ref_b1b2_lo); + __m256i o_mag_sq_hi = _mm256_madd_epi16(ref_b1b2_hi, ref_b1b2_hi); + + __m256i t_mag_sq_lo = _mm256_madd_epi16(dis_b1b2_lo, dis_b1b2_lo); + __m256i t_mag_sq_hi = _mm256_madd_epi16(dis_b1b2_hi, dis_b1b2_hi); + + ot_dp_lo = _mm256_max_epi32(ot_dp_lo, zero_256); + ot_dp_hi = _mm256_max_epi32(ot_dp_hi, zero_256); + + __m256i ot_dp_lo_0 = _mm256_mul_epi32(ot_dp_lo, ot_dp_lo); + __m256i ot_dp_lo_1 = _mm256_mul_epi32(_mm256_srai_epi64(ot_dp_lo, 32), _mm256_srai_epi64(ot_dp_lo, 32)); + __m256i ot_dp_hi_0 = _mm256_mul_epi32(ot_dp_hi, ot_dp_hi); + __m256i ot_dp_hi_1 = _mm256_mul_epi32(_mm256_srai_epi64(ot_dp_hi, 32), _mm256_srai_epi64(ot_dp_hi, 32)); + + __m256i ot_mag_sq_lo_0 = _mm256_mul_epi32(o_mag_sq_lo, t_mag_sq_lo); + __m256i ot_mag_sq_lo_1 = _mm256_mul_epi32(_mm256_srai_epi64(o_mag_sq_lo, 32), _mm256_srai_epi64(t_mag_sq_lo, 32)); + __m256i ot_mag_sq_hi_0 = _mm256_mul_epi32(o_mag_sq_hi, t_mag_sq_hi); + __m256i ot_mag_sq_hi_1 = _mm256_mul_epi32(_mm256_srai_epi64(o_mag_sq_hi, 32), _mm256_srai_epi64(t_mag_sq_hi, 32)); + + __mmask16 angle_mask16 = 0; + for(int a = 0; a < 4; a+=2) + { + int a0 = ((adm_i64_dtype)ot_dp_lo_0[a] >= COS_1DEG_SQ * (adm_i64_dtype)ot_mag_sq_lo_0[a]) << a*4; + int a2 = (ot_dp_lo_0[a 
+ 1] >= COS_1DEG_SQ * ot_mag_sq_lo_0[a + 1]) << (a*4 + 2); + int a1 = (ot_dp_lo_1[a] >= COS_1DEG_SQ * ot_mag_sq_lo_1[a]) << (a*4 + 1); + int a3 = (ot_dp_lo_1[a + 1] >= COS_1DEG_SQ * ot_mag_sq_lo_1[a + 1]) << (a*4 + 3); + int a4 = (ot_dp_hi_0[a] >= COS_1DEG_SQ * ot_mag_sq_hi_0[a]) << (a*4 + 4); + int a6 = (ot_dp_hi_0[a + 1] >= COS_1DEG_SQ * ot_mag_sq_hi_0[a + 1]) << (a*4 + 6); + int a5 = (ot_dp_hi_1[a] >= COS_1DEG_SQ * ot_mag_sq_hi_1[a]) << (a*4 + 5); + int a7 = (ot_dp_hi_1[a + 1] >= COS_1DEG_SQ * ot_mag_sq_hi_1[a + 1]) << (a*4 + 7); + angle_mask16 += a0 + a2 + a1 + a3 + a4 + a6 + a5 + a7; + } + + __m256i dis_b3_256 = _mm256_loadu_si256((__m256i*)(dist.bands[3] + index)); + __m256i ref_b3_256 = _mm256_loadu_si256((__m256i*)(ref.bands[3] + index)); + + __m256i ref_b1_lo, ref_b1_hi, ref_b2_lo, ref_b2_hi, ref_b3_lo, ref_b3_hi; + cvt_1_16x16_to_2_32x8_256(ref_b1_256, ref_b1_lo, ref_b1_hi); + cvt_1_16x16_to_2_32x8_256(ref_b2_256, ref_b2_lo, ref_b2_hi); + cvt_1_16x16_to_2_32x8_256(ref_b3_256, ref_b3_lo, ref_b3_hi); + + __m256i adm_div_b1_lo, adm_div_b1_hi, adm_div_b2_lo, adm_div_b2_hi, adm_div_b3_lo, adm_div_b3_hi; + adm_div_b1_lo = _mm256_mmask_i32gather_epi32(zero_256, 0xFF, _mm256_add_epi32(ref_b1_lo, add_32768_32b_256), adm_div_lookup, 4); + adm_div_b1_hi = _mm256_mmask_i32gather_epi32(zero_256, 0xFF, _mm256_add_epi32(ref_b1_hi, add_32768_32b_256), adm_div_lookup, 4); + adm_div_b2_lo = _mm256_mmask_i32gather_epi32(zero_256, 0xFF, _mm256_add_epi32(ref_b2_lo, add_32768_32b_256), adm_div_lookup, 4); + adm_div_b2_hi = _mm256_mmask_i32gather_epi32(zero_256, 0xFF, _mm256_add_epi32(ref_b2_hi, add_32768_32b_256), adm_div_lookup, 4); + adm_div_b3_lo = _mm256_mmask_i32gather_epi32(zero_256, 0xFF, _mm256_add_epi32(ref_b3_lo, add_32768_32b_256), adm_div_lookup, 4); + adm_div_b3_hi = _mm256_mmask_i32gather_epi32(zero_256, 0xFF, _mm256_add_epi32(ref_b3_hi, add_32768_32b_256), adm_div_lookup, 4); + + __m256i dis_b1_lo, dis_b1_hi, dis_b2_lo, dis_b2_hi, dis_b3_lo, dis_b3_hi; + cvt_1_16x16_to_2_32x8_256(dis_b1_256, dis_b1_lo, dis_b1_hi); + cvt_1_16x16_to_2_32x8_256(dis_b2_256, dis_b2_lo, dis_b2_hi); + cvt_1_16x16_to_2_32x8_256(dis_b3_256, dis_b3_lo, dis_b3_hi); + + __m256i adm_b1_dis_lo0 = _mm256_mul_epi32(adm_div_b1_lo, dis_b1_lo); + __m256i adm_b1_dis_lo1 = _mm256_mul_epi32(_mm256_srli_epi64(adm_div_b1_lo, 32), _mm256_srli_epi64(dis_b1_lo, 32)); + __m256i adm_b1_dis_hi8 = _mm256_mul_epi32(adm_div_b1_hi, dis_b1_hi); + __m256i adm_b1_dis_hi9 = _mm256_mul_epi32(_mm256_srli_epi64(adm_div_b1_hi, 32), _mm256_srli_epi64(dis_b1_hi, 32)); + + __m256i adm_b2_dis_lo0 = _mm256_mul_epi32(adm_div_b2_lo, dis_b2_lo); + __m256i adm_b2_dis_lo1 = _mm256_mul_epi32(_mm256_srli_epi64(adm_div_b2_lo, 32), _mm256_srli_epi64(dis_b2_lo, 32)); + __m256i adm_b2_dis_hi8 = _mm256_mul_epi32(adm_div_b2_hi, dis_b2_hi); + __m256i adm_b2_dis_hi9 = _mm256_mul_epi32(_mm256_srli_epi64(adm_div_b2_hi, 32), _mm256_srli_epi64(dis_b2_hi, 32)); + + __m256i adm_b3_dis_lo0 = _mm256_mul_epi32(adm_div_b3_lo, dis_b3_lo); + __m256i adm_b3_dis_lo1 = _mm256_mul_epi32(_mm256_srli_epi64(adm_div_b3_lo, 32), _mm256_srli_epi64(dis_b3_lo, 32)); + __m256i adm_b3_dis_hi8 = _mm256_mul_epi32(adm_div_b3_hi, dis_b3_hi); + __m256i adm_b3_dis_hi9 = _mm256_mul_epi32(_mm256_srli_epi64(adm_div_b3_hi, 32), _mm256_srli_epi64(dis_b3_hi, 32)); + + adm_b1_dis_lo0 = _mm256_add_epi64(adm_b1_dis_lo0, add_16384_256); + adm_b1_dis_lo1 = _mm256_add_epi64(adm_b1_dis_lo1, add_16384_256); + adm_b1_dis_hi8 = _mm256_add_epi64(adm_b1_dis_hi8, add_16384_256); + adm_b1_dis_hi9 = 
_mm256_add_epi64(adm_b1_dis_hi9, add_16384_256); + adm_b2_dis_lo0 = _mm256_add_epi64(adm_b2_dis_lo0, add_16384_256); + adm_b2_dis_lo1 = _mm256_add_epi64(adm_b2_dis_lo1, add_16384_256); + adm_b2_dis_hi8 = _mm256_add_epi64(adm_b2_dis_hi8, add_16384_256); + adm_b2_dis_hi9 = _mm256_add_epi64(adm_b2_dis_hi9, add_16384_256); + adm_b3_dis_lo0 = _mm256_add_epi64(adm_b3_dis_lo0, add_16384_256); + adm_b3_dis_lo1 = _mm256_add_epi64(adm_b3_dis_lo1, add_16384_256); + adm_b3_dis_hi8 = _mm256_add_epi64(adm_b3_dis_hi8, add_16384_256); + adm_b3_dis_hi9 = _mm256_add_epi64(adm_b3_dis_hi9, add_16384_256); + + shift15_64b_signExt_256(adm_b1_dis_lo0, adm_b1_dis_lo0); + shift15_64b_signExt_256(adm_b1_dis_lo1, adm_b1_dis_lo1); + shift15_64b_signExt_256(adm_b1_dis_hi8, adm_b1_dis_hi8); + shift15_64b_signExt_256(adm_b1_dis_hi9, adm_b1_dis_hi9); + shift15_64b_signExt_256(adm_b2_dis_lo0, adm_b2_dis_lo0); + shift15_64b_signExt_256(adm_b2_dis_lo1, adm_b2_dis_lo1); + shift15_64b_signExt_256(adm_b2_dis_hi8, adm_b2_dis_hi8); + shift15_64b_signExt_256(adm_b2_dis_hi9, adm_b2_dis_hi9); + shift15_64b_signExt_256(adm_b3_dis_lo0, adm_b3_dis_lo0); + shift15_64b_signExt_256(adm_b3_dis_lo1, adm_b3_dis_lo1); + shift15_64b_signExt_256(adm_b3_dis_hi8, adm_b3_dis_hi8); + shift15_64b_signExt_256(adm_b3_dis_hi9, adm_b3_dis_hi9); + + __mmask8 eqz_b1_lo = _mm256_cmpeq_epi32_mask(ref_b1_lo, zero_256); + __mmask8 eqz_b1_hi = _mm256_cmpeq_epi32_mask(ref_b1_hi, zero_256); + __mmask8 eqz_b2_lo = _mm256_cmpeq_epi32_mask(ref_b2_lo, zero_256); + __mmask8 eqz_b2_hi = _mm256_cmpeq_epi32_mask(ref_b2_hi, zero_256); + __mmask8 eqz_b3_lo = _mm256_cmpeq_epi32_mask(ref_b3_lo, zero_256); + __mmask8 eqz_b3_hi = _mm256_cmpeq_epi32_mask(ref_b3_hi, zero_256); + + __m256i adm_b1_dis_lo = _mm256_permutex2var_epi32(adm_b1_dis_lo0, perm_64_to_32_256, adm_b1_dis_lo1); + __m256i adm_b1_dis_hi = _mm256_permutex2var_epi32(adm_b1_dis_hi8, perm_64_to_32_256, adm_b1_dis_hi9); + __m256i adm_b2_dis_lo = _mm256_permutex2var_epi32(adm_b2_dis_lo0, perm_64_to_32_256, adm_b2_dis_lo1); + __m256i adm_b2_dis_hi = _mm256_permutex2var_epi32(adm_b2_dis_hi8, perm_64_to_32_256, adm_b2_dis_hi9); + __m256i adm_b3_dis_lo = _mm256_permutex2var_epi32(adm_b3_dis_lo0, perm_64_to_32_256, adm_b3_dis_lo1); + __m256i adm_b3_dis_hi = _mm256_permutex2var_epi32(adm_b3_dis_hi8, perm_64_to_32_256, adm_b3_dis_hi9); + + __m256i tmp_k_b1_lo = _mm256_mask_blend_epi32(eqz_b1_lo, adm_b1_dis_lo, add_32768_256); + __m256i tmp_k_b1_hi = _mm256_mask_blend_epi32(eqz_b1_hi, adm_b1_dis_hi, add_32768_256); + __m256i tmp_k_b2_lo = _mm256_mask_blend_epi32(eqz_b2_lo, adm_b2_dis_lo, add_32768_256); + __m256i tmp_k_b2_hi = _mm256_mask_blend_epi32(eqz_b2_hi, adm_b2_dis_hi, add_32768_256); + __m256i tmp_k_b3_lo = _mm256_mask_blend_epi32(eqz_b3_lo, adm_b3_dis_lo, add_32768_256); + __m256i tmp_k_b3_hi = _mm256_mask_blend_epi32(eqz_b3_hi, adm_b3_dis_hi, add_32768_256); + + tmp_k_b1_lo = _mm256_max_epi32(tmp_k_b1_lo, zero_256); + tmp_k_b1_hi = _mm256_max_epi32(tmp_k_b1_hi, zero_256); + tmp_k_b2_lo = _mm256_max_epi32(tmp_k_b2_lo, zero_256); + tmp_k_b2_hi = _mm256_max_epi32(tmp_k_b2_hi, zero_256); + tmp_k_b3_lo = _mm256_max_epi32(tmp_k_b3_lo, zero_256); + tmp_k_b3_hi = _mm256_max_epi32(tmp_k_b3_hi, zero_256); + + tmp_k_b1_lo = _mm256_min_epi32(tmp_k_b1_lo, add_32768_32b_256); + tmp_k_b1_hi = _mm256_min_epi32(tmp_k_b1_hi, add_32768_32b_256); + tmp_k_b2_lo = _mm256_min_epi32(tmp_k_b2_lo, add_32768_32b_256); + tmp_k_b2_hi = _mm256_min_epi32(tmp_k_b2_hi, add_32768_32b_256); + tmp_k_b3_lo = _mm256_min_epi32(tmp_k_b3_lo, 
add_32768_32b_256); + tmp_k_b3_hi = _mm256_min_epi32(tmp_k_b3_hi, add_32768_32b_256); + + __m256i tmp_val_b1_lo = _mm256_mullo_epi32(tmp_k_b1_lo, ref_b1_lo); + __m256i tmp_val_b1_hi = _mm256_mullo_epi32(tmp_k_b1_hi, ref_b1_hi); + __m256i tmp_val_b2_lo = _mm256_mullo_epi32(tmp_k_b2_lo, ref_b2_lo); + __m256i tmp_val_b2_hi = _mm256_mullo_epi32(tmp_k_b2_hi, ref_b2_hi); + __m256i tmp_val_b3_lo = _mm256_mullo_epi32(tmp_k_b3_lo, ref_b3_lo); + __m256i tmp_val_b3_hi = _mm256_mullo_epi32(tmp_k_b3_hi, ref_b3_hi); + + tmp_val_b1_lo = _mm256_add_epi32(tmp_val_b1_lo, add_16384_32b_256); + tmp_val_b1_hi = _mm256_add_epi32(tmp_val_b1_hi, add_16384_32b_256); + tmp_val_b2_lo = _mm256_add_epi32(tmp_val_b2_lo, add_16384_32b_256); + tmp_val_b3_lo = _mm256_add_epi32(tmp_val_b3_lo, add_16384_32b_256); + tmp_val_b2_hi = _mm256_add_epi32(tmp_val_b2_hi, add_16384_32b_256); + tmp_val_b3_hi = _mm256_add_epi32(tmp_val_b3_hi, add_16384_32b_256); + + tmp_val_b1_lo = _mm256_srai_epi32(tmp_val_b1_lo, 15); + tmp_val_b1_hi = _mm256_srai_epi32(tmp_val_b1_hi, 15); + tmp_val_b2_lo = _mm256_srai_epi32(tmp_val_b2_lo, 15); + tmp_val_b2_hi = _mm256_srai_epi32(tmp_val_b2_hi, 15); + tmp_val_b3_lo = _mm256_srai_epi32(tmp_val_b3_lo, 15); + tmp_val_b3_hi = _mm256_srai_epi32(tmp_val_b3_hi, 15); + + __m256i tmp_val_b1 = _mm256_packs_epi32(tmp_val_b1_lo, tmp_val_b1_hi); + __m256i tmp_val_b2 = _mm256_packs_epi32(tmp_val_b2_lo, tmp_val_b2_hi); + __m256i tmp_val_b3 = _mm256_packs_epi32(tmp_val_b3_lo, tmp_val_b3_hi); + tmp_val_b1 = _mm256_permute4x64_epi64(tmp_val_b1, 0xD8); + tmp_val_b2 = _mm256_permute4x64_epi64(tmp_val_b2, 0xD8); + tmp_val_b3 = _mm256_permute4x64_epi64(tmp_val_b3, 0xD8); + + __m256i dlm_rest_b1_256 = _mm256_mask_blend_epi16(angle_mask16, tmp_val_b1, dis_b1_256); + __m256i dlm_rest_b2_256 = _mm256_mask_blend_epi16(angle_mask16, tmp_val_b2, dis_b2_256); + __m256i dlm_rest_b3_256 = _mm256_mask_blend_epi16(angle_mask16, tmp_val_b3, dis_b3_256); + + __m256i dist_m_dlm_rest_b1 = _mm256_abs_epi16(_mm256_sub_epi16(dis_b1_256, dlm_rest_b1_256)); + __m256i dist_m_dlm_rest_b2 = _mm256_abs_epi16(_mm256_sub_epi16(dis_b2_256, dlm_rest_b2_256)); + __m256i dlm_add_256 = _mm256_adds_epu16(dist_m_dlm_rest_b1, dist_m_dlm_rest_b2); + __m256i dist_m_dlm_rest_b3 = _mm256_abs_epi16(_mm256_sub_epi16(dis_b3_256, dlm_rest_b3_256)); + dlm_add_256 = _mm256_adds_epu16(dlm_add_256, dist_m_dlm_rest_b3); + + _mm256_storeu_si256((__m256i*)(i_dlm_rest.bands[1] + restIndex), dlm_rest_b1_256); + _mm256_storeu_si256((__m256i*)(i_dlm_rest.bands[2] + restIndex), dlm_rest_b2_256); + _mm256_storeu_si256((__m256i*)(i_dlm_rest.bands[3] + restIndex), dlm_rest_b3_256); + + __m256i dlm_add_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(dlm_add_256)); + __m256i dlm_add_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(dlm_add_256, 1)); + + ref_b1_lo = _mm256_abs_epi32(ref_b1_lo); + ref_b1_hi = _mm256_abs_epi32(ref_b1_hi); + ref_b2_lo = _mm256_abs_epi32(ref_b2_lo); + ref_b2_hi = _mm256_abs_epi32(ref_b2_hi); + ref_b3_lo = _mm256_abs_epi32(ref_b3_lo); + ref_b3_hi = _mm256_abs_epi32(ref_b3_hi); + + _mm256_storeu_si256((__m256i*)(i_dlm_add + addIndex), dlm_add_lo); + _mm256_storeu_si256((__m256i*)(i_dlm_add + addIndex + 8), dlm_add_hi); + + __m256i ref_b_ref_b1_lo = _mm256_mullo_epi32(ref_b1_lo, ref_b1_lo); + __m256i ref_b_ref_b1_hi = _mm256_mullo_epi32(ref_b1_hi, ref_b1_hi); + __m256i ref_b_ref_b2_lo = _mm256_mullo_epi32(ref_b2_lo, ref_b2_lo); + __m256i ref_b_ref_b2_hi = _mm256_mullo_epi32(ref_b2_hi, ref_b2_hi); + __m256i ref_b_ref_b3_lo = 
_mm256_mullo_epi32(ref_b3_lo, ref_b3_lo); + __m256i ref_b_ref_b3_hi = _mm256_mullo_epi32(ref_b3_hi, ref_b3_hi); + + ref_b_ref_b1_lo = _mm256_permutevar8x32_epi32( ref_b_ref_b1_lo, perm_for_64b_mul_256); + ref_b1_lo = _mm256_permutevar8x32_epi32( ref_b1_lo, perm_for_64b_mul_256); + ref_b_ref_b1_hi = _mm256_permutevar8x32_epi32( ref_b_ref_b1_hi, perm_for_64b_mul_256); + ref_b1_hi = _mm256_permutevar8x32_epi32( ref_b1_hi, perm_for_64b_mul_256); + ref_b_ref_b2_lo = _mm256_permutevar8x32_epi32( ref_b_ref_b2_lo, perm_for_64b_mul_256); + ref_b2_lo = _mm256_permutevar8x32_epi32( ref_b2_lo, perm_for_64b_mul_256); + ref_b_ref_b2_hi = _mm256_permutevar8x32_epi32( ref_b_ref_b2_hi, perm_for_64b_mul_256); + ref_b2_hi = _mm256_permutevar8x32_epi32( ref_b2_hi, perm_for_64b_mul_256); + ref_b_ref_b3_lo = _mm256_permutevar8x32_epi32( ref_b_ref_b3_lo, perm_for_64b_mul_256); + ref_b3_lo = _mm256_permutevar8x32_epi32( ref_b3_lo, perm_for_64b_mul_256); + ref_b_ref_b3_hi = _mm256_permutevar8x32_epi32( ref_b_ref_b3_hi, perm_for_64b_mul_256); + ref_b3_hi = _mm256_permutevar8x32_epi32( ref_b3_hi, perm_for_64b_mul_256); + + __m256i ref_b_ref_b1_lo0 = _mm256_mul_epi32(ref_b_ref_b1_lo, ref_b1_lo); + __m256i ref_b_ref_b1_lo1 = _mm256_mul_epi32(_mm256_srli_epi64(ref_b_ref_b1_lo, 32), _mm256_srli_epi64(ref_b1_lo, 32)); + __m256i ref_b_ref_b1_hi0 = _mm256_mul_epi32(ref_b_ref_b1_hi, ref_b1_hi); + __m256i ref_b_ref_b1_hi1 = _mm256_mul_epi32(_mm256_srli_epi64(ref_b_ref_b1_hi, 32), _mm256_srli_epi64(ref_b1_hi, 32)); + __m256i ref_b_ref_b2_lo0 = _mm256_mul_epi32(ref_b_ref_b2_lo, ref_b2_lo); + __m256i ref_b_ref_b2_lo1 = _mm256_mul_epi32(_mm256_srli_epi64(ref_b_ref_b2_lo, 32), _mm256_srli_epi64(ref_b2_lo, 32)); + __m256i ref_b_ref_b2_hi0 = _mm256_mul_epi32(ref_b_ref_b2_hi, ref_b2_hi); + __m256i ref_b_ref_b2_hi1 = _mm256_mul_epi32(_mm256_srli_epi64(ref_b_ref_b2_hi, 32), _mm256_srli_epi64(ref_b2_hi, 32)); + __m256i ref_b_ref_b3_lo0 = _mm256_mul_epi32(ref_b_ref_b3_lo, ref_b3_lo); + __m256i ref_b_ref_b3_lo1 = _mm256_mul_epi32(_mm256_srli_epi64(ref_b_ref_b3_lo, 32), _mm256_srli_epi64(ref_b3_lo, 32)); + __m256i ref_b_ref_b3_hi0 = _mm256_mul_epi32(ref_b_ref_b3_hi, ref_b3_hi); + __m256i ref_b_ref_b3_hi1 = _mm256_mul_epi32(_mm256_srli_epi64(ref_b_ref_b3_hi, 32), _mm256_srli_epi64(ref_b3_hi, 32)); + + __m256i b1_r4_lo = _mm256_add_epi64(ref_b_ref_b1_lo0, ref_b_ref_b1_lo1); + __m256i b1_r4_hi = _mm256_add_epi64(ref_b_ref_b1_hi0, ref_b_ref_b1_hi1); + __m256i b2_r4_lo = _mm256_add_epi64(ref_b_ref_b2_lo0, ref_b_ref_b2_lo1); + __m256i b2_r4_hi = _mm256_add_epi64(ref_b_ref_b2_hi0, ref_b_ref_b2_hi1); + __m256i b3_r4_lo = _mm256_add_epi64(ref_b_ref_b3_lo0, ref_b_ref_b3_lo1); + __m256i b3_r4_hi = _mm256_add_epi64(ref_b_ref_b3_hi0, ref_b_ref_b3_hi1); + __m256i b1_r4 = _mm256_add_epi64(b1_r4_lo, b1_r4_hi); + __m256i b2_r4 = _mm256_add_epi64(b2_r4_lo, b2_r4_hi); + __m256i b3_r4 = _mm256_add_epi64(b3_r4_lo, b3_r4_hi); + __m128i b1_r2 = _mm_add_epi64(_mm256_castsi256_si128(b1_r4), _mm256_extractf128_si256(b1_r4, 1)); + __m128i b2_r2 = _mm_add_epi64(_mm256_castsi256_si128(b2_r4), _mm256_extractf128_si256(b2_r4, 1)); + __m128i b3_r2 = _mm_add_epi64(_mm256_castsi256_si128(b3_r4), _mm256_extractf128_si256(b3_r4, 1)); + int64_t r_b1 = _mm_extract_epi64(b1_r2, 0) + _mm_extract_epi64(b1_r2, 1); + int64_t r_b2 = _mm_extract_epi64(b2_r2, 0) + _mm_extract_epi64(b2_r2, 1); + int64_t r_b3 = _mm_extract_epi64(b3_r2, 0) + _mm_extract_epi64(b3_r2, 1); + + den_row_sum[0] += r_b1; + den_row_sum[1] += r_b2; + den_row_sum[2] += r_b3; + } + + for (; j < loop_w_8; j+=8) + { 
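A note on the tail handling entered here: the column range [border_w, loop_w) is peeled into 32-, 16- and 8-wide SIMD passes using the loop_w_32, loop_w_16 and loop_w_8 bounds computed at the top of the function, and whatever remains after this 8-wide pass falls through to the scalar loop below. A small sketch of how such a bound is derived, using a hypothetical helper name that does not appear in the patch:

    /* Trim the processed width so that a pass of 'step' columns never runs past loop_w;
     * leftover columns fall through to the next narrower pass or to the scalar loop. */
    static inline int simd_loop_bound(int loop_w, int border_w, int step)
    {
        return loop_w - ((loop_w - border_w) % step);
    }

    /* e.g. loop_w_32 = simd_loop_bound(loop_w, border_w, 32);
     *      loop_w_16 = simd_loop_bound(loop_w, border_w, 16);
     *      loop_w_8  = simd_loop_bound(loop_w, border_w, 8);  */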
+ index = i * width + j; + //If padding is enabled the computation of i_dlm_add will be from 1,1 & later padded + addIndex = (i + ADM_REFLECT_PAD - border_h) * (dlm_add_w) + j + ADM_REFLECT_PAD - border_w; + restIndex = (i - border_h) * (dlm_width) + j - border_w; + + __m128i ref_b1_128 = _mm_loadu_si128((__m128i*)(ref.bands[1] + index)); + __m128i dis_b1_128 = _mm_loadu_si128((__m128i*)(dist.bands[1] + index)); + __m128i ref_b2_128 = _mm_loadu_si128((__m128i*)(ref.bands[2] + index)); + __m128i dis_b2_128 = _mm_loadu_si128((__m128i*)(dist.bands[2] + index)); + + __m128i ref_b1b2_lo = _mm_unpacklo_epi16(ref_b1_128, ref_b2_128); + __m128i ref_b1b2_hi = _mm_unpackhi_epi16(ref_b1_128, ref_b2_128); + __m128i dis_b1b2_lo = _mm_unpacklo_epi16(dis_b1_128, dis_b2_128); + __m128i dis_b1b2_hi = _mm_unpackhi_epi16(dis_b1_128, dis_b2_128); + + __m128i ot_dp_lo = _mm_madd_epi16(ref_b1b2_lo, dis_b1b2_lo); + __m128i ot_dp_hi = _mm_madd_epi16(ref_b1b2_hi, dis_b1b2_hi); + + __m128i o_mag_sq_lo = _mm_madd_epi16(ref_b1b2_lo, ref_b1b2_lo); + __m128i o_mag_sq_hi = _mm_madd_epi16(ref_b1b2_hi, ref_b1b2_hi); + + __m128i t_mag_sq_lo = _mm_madd_epi16(dis_b1b2_lo, dis_b1b2_lo); + __m128i t_mag_sq_hi = _mm_madd_epi16(dis_b1b2_hi, dis_b1b2_hi); + + ot_dp_lo = _mm_max_epi32(ot_dp_lo, zero_128); + ot_dp_hi = _mm_max_epi32(ot_dp_hi, zero_128); + + __m128i ot_dp_lo_0 = _mm_mul_epi32(ot_dp_lo, ot_dp_lo); + __m128i ot_dp_lo_1 = _mm_mul_epi32(_mm_srai_epi64(ot_dp_lo, 32), _mm_srai_epi64(ot_dp_lo, 32)); + __m128i ot_dp_hi_0 = _mm_mul_epi32(ot_dp_hi, ot_dp_hi); + __m128i ot_dp_hi_1 = _mm_mul_epi32(_mm_srai_epi64(ot_dp_hi, 32), _mm_srai_epi64(ot_dp_hi, 32)); + + __m128i ot_mag_sq_lo_0 = _mm_mul_epi32(o_mag_sq_lo, t_mag_sq_lo); + __m128i ot_mag_sq_lo_1 = _mm_mul_epi32(_mm_srai_epi64(o_mag_sq_lo, 32), _mm_srai_epi64(t_mag_sq_lo, 32)); + __m128i ot_mag_sq_hi_0 = _mm_mul_epi32(o_mag_sq_hi, t_mag_sq_hi); + __m128i ot_mag_sq_hi_1 = _mm_mul_epi32(_mm_srai_epi64(o_mag_sq_hi, 32), _mm_srai_epi64(t_mag_sq_hi, 32)); + + __mmask8 angle_mask8 = 0; + int a0 = ((adm_i64_dtype)ot_dp_lo_0[0] >= COS_1DEG_SQ * (adm_i64_dtype)ot_mag_sq_lo_0[0]); + int a2 = (ot_dp_lo_0[1] >= COS_1DEG_SQ * ot_mag_sq_lo_0[1]) << 2; + int a1 = (ot_dp_lo_1[0] >= COS_1DEG_SQ * ot_mag_sq_lo_1[0]) << 1; + int a3 = (ot_dp_lo_1[1] >= COS_1DEG_SQ * ot_mag_sq_lo_1[1]) << 3; + int a4 = (ot_dp_hi_0[0] >= COS_1DEG_SQ * ot_mag_sq_hi_0[0]) << 4; + int a6 = (ot_dp_hi_0[1] >= COS_1DEG_SQ * ot_mag_sq_hi_0[1]) << 6; + int a5 = (ot_dp_hi_1[0] >= COS_1DEG_SQ * ot_mag_sq_hi_1[0]) << 5; + int a7 = (ot_dp_hi_1[1] >= COS_1DEG_SQ * ot_mag_sq_hi_1[1]) << 7; + angle_mask8 += a0 + a2 + a1 + a3 + a4 + a6 + a5 + a7; + + __m128i dis_b3_128 = _mm_loadu_si128((__m128i*)(dist.bands[3] + index)); + __m128i ref_b3_128 = _mm_loadu_si128((__m128i*)(ref.bands[3] + index)); + + __m128i ref_b1_lo, ref_b1_hi, ref_b2_lo, ref_b2_hi, ref_b3_lo, ref_b3_hi; + cvt_1_16x8_to_2_32x4_256(ref_b1_128, ref_b1_lo, ref_b1_hi); + cvt_1_16x8_to_2_32x4_256(ref_b2_128, ref_b2_lo, ref_b2_hi); + cvt_1_16x8_to_2_32x4_256(ref_b3_128, ref_b3_lo, ref_b3_hi); + + __m128i adm_div_b1_lo, adm_div_b1_hi, adm_div_b2_lo, adm_div_b2_hi, adm_div_b3_lo, adm_div_b3_hi; + adm_div_b1_lo = _mm_i32gather_epi32(adm_div_lookup, _mm_add_epi32(ref_b1_lo, add_32768_32b_128), 4); + adm_div_b1_hi = _mm_i32gather_epi32(adm_div_lookup, _mm_add_epi32(ref_b1_hi, add_32768_32b_128), 4); + adm_div_b2_lo = _mm_i32gather_epi32(adm_div_lookup, _mm_add_epi32(ref_b2_lo, add_32768_32b_128), 4); + adm_div_b2_hi = _mm_i32gather_epi32(adm_div_lookup, 
_mm_add_epi32(ref_b2_hi, add_32768_32b_128), 4); + adm_div_b3_lo = _mm_i32gather_epi32(adm_div_lookup, _mm_add_epi32(ref_b3_lo, add_32768_32b_128), 4); + adm_div_b3_hi = _mm_i32gather_epi32(adm_div_lookup, _mm_add_epi32(ref_b3_hi, add_32768_32b_128), 4); + + __m128i dis_b1_lo, dis_b1_hi, dis_b2_lo, dis_b2_hi, dis_b3_lo, dis_b3_hi; + cvt_1_16x8_to_2_32x4_256(dis_b1_128, dis_b1_lo, dis_b1_hi); + cvt_1_16x8_to_2_32x4_256(dis_b2_128, dis_b2_lo, dis_b2_hi); + cvt_1_16x8_to_2_32x4_256(dis_b3_128, dis_b3_lo, dis_b3_hi); + + __m128i adm_b1_dis_lo0 = _mm_mul_epi32(adm_div_b1_lo, dis_b1_lo); + __m128i adm_b1_dis_lo1 = _mm_mul_epi32(_mm_srli_epi64(adm_div_b1_lo, 32), _mm_srli_epi64(dis_b1_lo, 32)); + __m128i adm_b1_dis_hi8 = _mm_mul_epi32(adm_div_b1_hi, dis_b1_hi); + __m128i adm_b1_dis_hi9 = _mm_mul_epi32(_mm_srli_epi64(adm_div_b1_hi, 32), _mm_srli_epi64(dis_b1_hi, 32)); + + __m128i adm_b2_dis_lo0 = _mm_mul_epi32(adm_div_b2_lo, dis_b2_lo); + __m128i adm_b2_dis_lo1 = _mm_mul_epi32(_mm_srli_epi64(adm_div_b2_lo, 32), _mm_srli_epi64(dis_b2_lo, 32)); + __m128i adm_b2_dis_hi8 = _mm_mul_epi32(adm_div_b2_hi, dis_b2_hi); + __m128i adm_b2_dis_hi9 = _mm_mul_epi32(_mm_srli_epi64(adm_div_b2_hi, 32), _mm_srli_epi64(dis_b2_hi, 32)); + + __m128i adm_b3_dis_lo0 = _mm_mul_epi32(adm_div_b3_lo, dis_b3_lo); + __m128i adm_b3_dis_lo1 = _mm_mul_epi32(_mm_srli_epi64(adm_div_b3_lo, 32), _mm_srli_epi64(dis_b3_lo, 32)); + __m128i adm_b3_dis_hi8 = _mm_mul_epi32(adm_div_b3_hi, dis_b3_hi); + __m128i adm_b3_dis_hi9 = _mm_mul_epi32(_mm_srli_epi64(adm_div_b3_hi, 32), _mm_srli_epi64(dis_b3_hi, 32)); + + adm_b1_dis_lo0 = _mm_add_epi64(adm_b1_dis_lo0, add_16384_128); + adm_b1_dis_lo1 = _mm_add_epi64(adm_b1_dis_lo1, add_16384_128); + adm_b1_dis_hi8 = _mm_add_epi64(adm_b1_dis_hi8, add_16384_128); + adm_b1_dis_hi9 = _mm_add_epi64(adm_b1_dis_hi9, add_16384_128); + adm_b2_dis_lo0 = _mm_add_epi64(adm_b2_dis_lo0, add_16384_128); + adm_b2_dis_lo1 = _mm_add_epi64(adm_b2_dis_lo1, add_16384_128); + adm_b2_dis_hi8 = _mm_add_epi64(adm_b2_dis_hi8, add_16384_128); + adm_b2_dis_hi9 = _mm_add_epi64(adm_b2_dis_hi9, add_16384_128); + adm_b3_dis_lo0 = _mm_add_epi64(adm_b3_dis_lo0, add_16384_128); + adm_b3_dis_lo1 = _mm_add_epi64(adm_b3_dis_lo1, add_16384_128); + adm_b3_dis_hi8 = _mm_add_epi64(adm_b3_dis_hi8, add_16384_128); + adm_b3_dis_hi9 = _mm_add_epi64(adm_b3_dis_hi9, add_16384_128); + + shift15_64b_signExt_128(adm_b1_dis_lo0, adm_b1_dis_lo0); + shift15_64b_signExt_128(adm_b1_dis_lo1, adm_b1_dis_lo1); + shift15_64b_signExt_128(adm_b1_dis_hi8, adm_b1_dis_hi8); + shift15_64b_signExt_128(adm_b1_dis_hi9, adm_b1_dis_hi9); + shift15_64b_signExt_128(adm_b2_dis_lo0, adm_b2_dis_lo0); + shift15_64b_signExt_128(adm_b2_dis_lo1, adm_b2_dis_lo1); + shift15_64b_signExt_128(adm_b2_dis_hi8, adm_b2_dis_hi8); + shift15_64b_signExt_128(adm_b2_dis_hi9, adm_b2_dis_hi9); + shift15_64b_signExt_128(adm_b3_dis_lo0, adm_b3_dis_lo0); + shift15_64b_signExt_128(adm_b3_dis_lo1, adm_b3_dis_lo1); + shift15_64b_signExt_128(adm_b3_dis_hi8, adm_b3_dis_hi8); + shift15_64b_signExt_128(adm_b3_dis_hi9, adm_b3_dis_hi9); + + __mmask8 eqz_b1_lo = _mm_cmpeq_epi32_mask(ref_b1_lo, _mm_setzero_si128()); + __mmask8 eqz_b1_hi = _mm_cmpeq_epi32_mask(ref_b1_hi, _mm_setzero_si128()); + __mmask8 eqz_b2_lo = _mm_cmpeq_epi32_mask(ref_b2_lo, _mm_setzero_si128()); + __mmask8 eqz_b2_hi = _mm_cmpeq_epi32_mask(ref_b2_hi, _mm_setzero_si128()); + __mmask8 eqz_b3_lo = _mm_cmpeq_epi32_mask(ref_b3_lo, _mm_setzero_si128()); + __mmask8 eqz_b3_hi = _mm_cmpeq_epi32_mask(ref_b3_hi, _mm_setzero_si128()); + + __m128i 
adm_b1_dis_lo = _mm_permutex2var_epi32(adm_b1_dis_lo0, perm_64_to_32_128, adm_b1_dis_lo1); + __m128i adm_b1_dis_hi = _mm_permutex2var_epi32(adm_b1_dis_hi8, perm_64_to_32_128, adm_b1_dis_hi9); + __m128i adm_b2_dis_lo = _mm_permutex2var_epi32(adm_b2_dis_lo0, perm_64_to_32_128, adm_b2_dis_lo1); + __m128i adm_b2_dis_hi = _mm_permutex2var_epi32(adm_b2_dis_hi8, perm_64_to_32_128, adm_b2_dis_hi9); + __m128i adm_b3_dis_lo = _mm_permutex2var_epi32(adm_b3_dis_lo0, perm_64_to_32_128, adm_b3_dis_lo1); + __m128i adm_b3_dis_hi = _mm_permutex2var_epi32(adm_b3_dis_hi8, perm_64_to_32_128, adm_b3_dis_hi9); + + __m128i tmp_k_b1_lo = _mm_mask_blend_epi32(eqz_b1_lo, adm_b1_dis_lo, add_32768_128); + __m128i tmp_k_b1_hi = _mm_mask_blend_epi32(eqz_b1_hi, adm_b1_dis_hi, add_32768_128); + __m128i tmp_k_b2_lo = _mm_mask_blend_epi32(eqz_b2_lo, adm_b2_dis_lo, add_32768_128); + __m128i tmp_k_b2_hi = _mm_mask_blend_epi32(eqz_b2_hi, adm_b2_dis_hi, add_32768_128); + __m128i tmp_k_b3_lo = _mm_mask_blend_epi32(eqz_b3_lo, adm_b3_dis_lo, add_32768_128); + __m128i tmp_k_b3_hi = _mm_mask_blend_epi32(eqz_b3_hi, adm_b3_dis_hi, add_32768_128); + + tmp_k_b1_lo = _mm_max_epi32(tmp_k_b1_lo, zero_128); + tmp_k_b1_hi = _mm_max_epi32(tmp_k_b1_hi, zero_128); + tmp_k_b2_lo = _mm_max_epi32(tmp_k_b2_lo, zero_128); + tmp_k_b2_hi = _mm_max_epi32(tmp_k_b2_hi, zero_128); + tmp_k_b3_lo = _mm_max_epi32(tmp_k_b3_lo, zero_128); + tmp_k_b3_hi = _mm_max_epi32(tmp_k_b3_hi, zero_128); + + tmp_k_b1_lo = _mm_min_epi32(tmp_k_b1_lo, add_32768_32b_128); + tmp_k_b1_hi = _mm_min_epi32(tmp_k_b1_hi, add_32768_32b_128); + tmp_k_b2_lo = _mm_min_epi32(tmp_k_b2_lo, add_32768_32b_128); + tmp_k_b2_hi = _mm_min_epi32(tmp_k_b2_hi, add_32768_32b_128); + tmp_k_b3_lo = _mm_min_epi32(tmp_k_b3_lo, add_32768_32b_128); + tmp_k_b3_hi = _mm_min_epi32(tmp_k_b3_hi, add_32768_32b_128); + + __m128i tmp_val_b1_lo = _mm_mullo_epi32(tmp_k_b1_lo, ref_b1_lo); + __m128i tmp_val_b1_hi = _mm_mullo_epi32(tmp_k_b1_hi, ref_b1_hi); + __m128i tmp_val_b2_lo = _mm_mullo_epi32(tmp_k_b2_lo, ref_b2_lo); + __m128i tmp_val_b2_hi = _mm_mullo_epi32(tmp_k_b2_hi, ref_b2_hi); + __m128i tmp_val_b3_lo = _mm_mullo_epi32(tmp_k_b3_lo, ref_b3_lo); + __m128i tmp_val_b3_hi = _mm_mullo_epi32(tmp_k_b3_hi, ref_b3_hi); + + tmp_val_b1_lo = _mm_add_epi32(tmp_val_b1_lo, add_16384_32b_128); + tmp_val_b1_hi = _mm_add_epi32(tmp_val_b1_hi, add_16384_32b_128); + tmp_val_b2_lo = _mm_add_epi32(tmp_val_b2_lo, add_16384_32b_128); + tmp_val_b3_lo = _mm_add_epi32(tmp_val_b3_lo, add_16384_32b_128); + tmp_val_b2_hi = _mm_add_epi32(tmp_val_b2_hi, add_16384_32b_128); + tmp_val_b3_hi = _mm_add_epi32(tmp_val_b3_hi, add_16384_32b_128); + + tmp_val_b1_lo = _mm_srai_epi32(tmp_val_b1_lo, 15); + tmp_val_b1_hi = _mm_srai_epi32(tmp_val_b1_hi, 15); + tmp_val_b2_lo = _mm_srai_epi32(tmp_val_b2_lo, 15); + tmp_val_b2_hi = _mm_srai_epi32(tmp_val_b2_hi, 15); + tmp_val_b3_lo = _mm_srai_epi32(tmp_val_b3_lo, 15); + tmp_val_b3_hi = _mm_srai_epi32(tmp_val_b3_hi, 15); + + __m128i tmp_val_b1 = _mm_packs_epi32(tmp_val_b1_lo, tmp_val_b1_hi); + __m128i tmp_val_b2 = _mm_packs_epi32(tmp_val_b2_lo, tmp_val_b2_hi); + __m128i tmp_val_b3 = _mm_packs_epi32(tmp_val_b3_lo, tmp_val_b3_hi); + + __m128i dlm_rest_b1_128 = _mm_mask_blend_epi16(angle_mask8, tmp_val_b1, dis_b1_128); + __m128i dlm_rest_b2_128 = _mm_mask_blend_epi16(angle_mask8, tmp_val_b2, dis_b2_128); + __m128i dlm_rest_b3_128 = _mm_mask_blend_epi16(angle_mask8, tmp_val_b3, dis_b3_128); + + __m128i dist_m_dlm_rest_b1 = _mm_abs_epi16(_mm_sub_epi16(dis_b1_128, dlm_rest_b1_128)); + __m128i dist_m_dlm_rest_b2 = 
_mm_abs_epi16(_mm_sub_epi16(dis_b2_128, dlm_rest_b2_128)); + __m128i dlm_add_256 = _mm_adds_epu16(dist_m_dlm_rest_b1, dist_m_dlm_rest_b2); + __m128i dist_m_dlm_rest_b3 = _mm_abs_epi16(_mm_sub_epi16(dis_b3_128, dlm_rest_b3_128)); + dlm_add_256 = _mm_adds_epu16(dlm_add_256, dist_m_dlm_rest_b3); + + _mm_storeu_si128((__m128i*)(i_dlm_rest.bands[1] + restIndex), dlm_rest_b1_128); + _mm_storeu_si128((__m128i*)(i_dlm_rest.bands[2] + restIndex), dlm_rest_b2_128); + _mm_storeu_si128((__m128i*)(i_dlm_rest.bands[3] + restIndex), dlm_rest_b3_128); + + __m128i dlm_add_lo = _mm_cvtepu16_epi32(dlm_add_256); + __m128i dlm_add_hi = _mm_cvtepu16_epi32(_mm_shuffle_epi32(dlm_add_256, 0x0E)); + + ref_b1_lo = _mm_abs_epi32(ref_b1_lo); + ref_b1_hi = _mm_abs_epi32(ref_b1_hi); + ref_b2_lo = _mm_abs_epi32(ref_b2_lo); + ref_b2_hi = _mm_abs_epi32(ref_b2_hi); + ref_b3_lo = _mm_abs_epi32(ref_b3_lo); + ref_b3_hi = _mm_abs_epi32(ref_b3_hi); + + _mm_storeu_si128((__m128i*)(i_dlm_add + addIndex), dlm_add_lo); + _mm_storeu_si128((__m128i*)(i_dlm_add + addIndex + 4), dlm_add_hi); + + __m128i ref_b_ref_b1_lo = _mm_mullo_epi32(ref_b1_lo, ref_b1_lo); + __m128i ref_b_ref_b1_hi = _mm_mullo_epi32(ref_b1_hi, ref_b1_hi); + __m128i ref_b_ref_b2_lo = _mm_mullo_epi32(ref_b2_lo, ref_b2_lo); + __m128i ref_b_ref_b2_hi = _mm_mullo_epi32(ref_b2_hi, ref_b2_hi); + __m128i ref_b_ref_b3_lo = _mm_mullo_epi32(ref_b3_lo, ref_b3_lo); + __m128i ref_b_ref_b3_hi = _mm_mullo_epi32(ref_b3_hi, ref_b3_hi); + + ref_b_ref_b1_lo = _mm_shuffle_epi32( ref_b_ref_b1_lo, 0xD8); + ref_b1_lo = _mm_shuffle_epi32( ref_b1_lo, 0xD8); + ref_b_ref_b1_hi = _mm_shuffle_epi32( ref_b_ref_b1_hi, 0xD8); + ref_b1_hi = _mm_shuffle_epi32( ref_b1_hi, 0xD8); + ref_b_ref_b2_lo = _mm_shuffle_epi32( ref_b_ref_b2_lo, 0xD8); + ref_b2_lo = _mm_shuffle_epi32( ref_b2_lo, 0xD8); + ref_b_ref_b2_hi = _mm_shuffle_epi32( ref_b_ref_b2_hi, 0xD8); + ref_b2_hi = _mm_shuffle_epi32( ref_b2_hi, 0xD8); + ref_b_ref_b3_lo = _mm_shuffle_epi32( ref_b_ref_b3_lo, 0xD8); + ref_b3_lo = _mm_shuffle_epi32( ref_b3_lo, 0xD8); + ref_b_ref_b3_hi = _mm_shuffle_epi32( ref_b_ref_b3_hi, 0xD8); + ref_b3_hi = _mm_shuffle_epi32( ref_b3_hi, 0xD8); + + __m128i ref_b_ref_b1_lo0 = _mm_mul_epi32(ref_b_ref_b1_lo, ref_b1_lo); + __m128i ref_b_ref_b1_lo1 = _mm_mul_epi32(_mm_srli_epi64(ref_b_ref_b1_lo, 32), _mm_srli_epi64(ref_b1_lo, 32)); + __m128i ref_b_ref_b1_hi0 = _mm_mul_epi32(ref_b_ref_b1_hi, ref_b1_hi); + __m128i ref_b_ref_b1_hi1 = _mm_mul_epi32(_mm_srli_epi64(ref_b_ref_b1_hi, 32), _mm_srli_epi64(ref_b1_hi, 32)); + __m128i ref_b_ref_b2_lo0 = _mm_mul_epi32(ref_b_ref_b2_lo, ref_b2_lo); + __m128i ref_b_ref_b2_lo1 = _mm_mul_epi32(_mm_srli_epi64(ref_b_ref_b2_lo, 32), _mm_srli_epi64(ref_b2_lo, 32)); + __m128i ref_b_ref_b2_hi0 = _mm_mul_epi32(ref_b_ref_b2_hi, ref_b2_hi); + __m128i ref_b_ref_b2_hi1 = _mm_mul_epi32(_mm_srli_epi64(ref_b_ref_b2_hi, 32), _mm_srli_epi64(ref_b2_hi, 32)); + __m128i ref_b_ref_b3_lo0 = _mm_mul_epi32(ref_b_ref_b3_lo, ref_b3_lo); + __m128i ref_b_ref_b3_lo1 = _mm_mul_epi32(_mm_srli_epi64(ref_b_ref_b3_lo, 32), _mm_srli_epi64(ref_b3_lo, 32)); + __m128i ref_b_ref_b3_hi0 = _mm_mul_epi32(ref_b_ref_b3_hi, ref_b3_hi); + __m128i ref_b_ref_b3_hi1 = _mm_mul_epi32(_mm_srli_epi64(ref_b_ref_b3_hi, 32), _mm_srli_epi64(ref_b3_hi, 32)); + + __m128i b1_r2_lo = _mm_add_epi64(ref_b_ref_b1_lo0, ref_b_ref_b1_lo1); + __m128i b1_r2_hi = _mm_add_epi64(ref_b_ref_b1_hi0, ref_b_ref_b1_hi1); + __m128i b2_r2_lo = _mm_add_epi64(ref_b_ref_b2_lo0, ref_b_ref_b2_lo1); + __m128i b2_r2_hi = _mm_add_epi64(ref_b_ref_b2_hi0, ref_b_ref_b2_hi1); + 
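For reference, the 64-bit lane sums being reduced here feed den_row_sum with the same quantity the scalar remainder loop accumulates: the cube of the absolute reference coefficient of each band. A scalar sketch of that accumulation, with an illustrative helper name that is not part of the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of |ref|^3 over one row segment of a band, as accumulated into den_row_sum. */
    static int64_t band_abs_cube_sum(const int16_t *ref_band, int n)
    {
        int64_t sum = 0;
        for (int i = 0; i < n; i++) {
            int64_t a = abs(ref_band[i]);
            sum += a * a * a;
        }
        return sum;
    }

At the end of the function each band's den_sum total is divided by ADM_CUBE_DIV and raised to the power 1/3, and the three band values are summed and scaled by 30 to form adm_score_den.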
__m128i b3_r2_lo = _mm_add_epi64(ref_b_ref_b3_lo0, ref_b_ref_b3_lo1); + __m128i b3_r2_hi = _mm_add_epi64(ref_b_ref_b3_hi0, ref_b_ref_b3_hi1); + __m128i b1_r2 = _mm_add_epi64(b1_r2_lo, b1_r2_hi); + __m128i b2_r2 = _mm_add_epi64(b2_r2_lo, b2_r2_hi); + __m128i b3_r2 = _mm_add_epi64(b3_r2_lo, b3_r2_hi); + int64_t r_b1 = _mm_extract_epi64(b1_r2, 0) + _mm_extract_epi64(b1_r2, 1); + int64_t r_b2 = _mm_extract_epi64(b2_r2, 0) + _mm_extract_epi64(b2_r2, 1); + int64_t r_b3 = _mm_extract_epi64(b3_r2, 0) + _mm_extract_epi64(b3_r2, 1); + + den_row_sum[0] += r_b1; + den_row_sum[1] += r_b2; + den_row_sum[2] += r_b3; + } + + for (; j < loop_w; j++) + { + index = i * width + j; + + //If padding is enabled the computation of i_dlm_add will be from 1,1 & later padded + addIndex = (i + ADM_REFLECT_PAD - border_h) * (dlm_add_w) + j + ADM_REFLECT_PAD - border_w; + + restIndex = (i - border_h) * (dlm_width) + j - border_w; + ot_dp = ((adm_i32_dtype)ref.bands[1][index] * dist.bands[1][index]) + ((adm_i32_dtype)ref.bands[2][index] * dist.bands[2][index]); + o_mag_sq = ((adm_i32_dtype)ref.bands[1][index] * ref.bands[1][index]) + ((adm_i32_dtype)ref.bands[2][index] * ref.bands[2][index]); + t_mag_sq = ((adm_i32_dtype)dist.bands[1][index] * dist.bands[1][index]) + ((adm_i32_dtype)dist.bands[2][index] * dist.bands[2][index]); + angle_flag = ((ot_dp >= 0) && (((adm_i64_dtype)ot_dp * ot_dp) >= COS_1DEG_SQ * ((adm_i64_dtype)o_mag_sq * t_mag_sq))); + i_dlm_add[addIndex] = 0; + for (k = 1; k < 4; k++) + { + /** + * Division dist/ref is carried using lookup table adm_div_lookup and converted to multiplication + */ + adm_i32_dtype tmp_k = (ref.bands[k][index] == 0) ? 32768 : (((adm_i64_dtype)adm_div_lookup[ref.bands[k][index] + 32768] * dist.bands[k][index]) + 16384) >> 15; + adm_u16_dtype kh = tmp_k < 0 ? 0 : (tmp_k > 32768 ? 32768 : tmp_k); + /** + * kh is in Q15 type and ref.bands[k][index] is in Q16 type hence shifted by + * 15 to make result Q16 + */ + tmp_val = (((adm_i32_dtype)kh * ref.bands[k][index]) + 16384) >> 15; + + i_dlm_rest.bands[k][restIndex] = angle_flag ? 
dist.bands[k][index] : tmp_val; + /** + * Absolute is taken here for the difference value instead of + * taking absolute of pyr_2 in integer_dlm_contrast_mask_one_way function + */ + i_dlm_add[addIndex] += (int32_t)abs(dist.bands[k][index] - i_dlm_rest.bands[k][restIndex]); + + //Accumulating denominator score to avoid load in next stage + int16_t ref_abs = abs(ref.bands[k][index]); + den_cube[k-1] = (adm_i64_dtype)ref_abs * ref_abs * ref_abs; + + den_row_sum[k-1] += den_cube[k-1]; + } + } + if(extra_sample_w) + { + for(k = 0; k < 3; k++) + { + den_row_sum[k] -= den_cube[k]; + den_row_sum[k] -= col0_ref_cube[k]; + } + } + if((i != border_h && i != (loop_h - 1)) || !extra_sample_h) + { + for(k=0; k<3; k++) + { + den_sum[k] += den_row_sum[k]; + } + } + den_row_sum[0] = 0; + den_row_sum[1] = 0; + den_row_sum[2] = 0; +#if ADM_REFLECT_PAD + if(!extra_sample_w) + { + addIndex = (i + 1 - border_h) * (dlm_add_w); + i_dlm_add[addIndex + 0] = i_dlm_add[addIndex + 2]; + i_dlm_add[addIndex + dlm_width + 1] = i_dlm_add[addIndex + dlm_width - 1]; + } +#endif + } +#if ADM_REFLECT_PAD + if(!extra_sample_h) + { + int row2Idx = 2 * (dlm_add_w); + int rowLast2Idx = (dlm_height - 1) * (dlm_add_w); + int rowLastPadIdx = (dlm_height + 1) * (dlm_add_w); + + memcpy(&i_dlm_add[0], &i_dlm_add[row2Idx], sizeof(int32_t) * (dlm_add_w)); + + memcpy(&i_dlm_add[rowLastPadIdx], &i_dlm_add[rowLast2Idx], sizeof(int32_t) * (dlm_width+2)); + } +#endif + //Calculating denominator score + double den_band = 0; + for(k=0; k<3; k++) + { + double accum_den = (double) den_sum[k] / ADM_CUBE_DIV; + den_band += powf((double)(accum_den), 1.0 / 3.0); + } + // compensation for the division by thirty in the numerator + *adm_score_den = (den_band * 30) + 1e-4; + +} diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.h b/libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.h new file mode 100644 index 000000000..594debd1a --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_adm_avx512.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include + +#include + +#include +#include + +#include "../integer_funque_filters.h" +#include "../integer_funque_adm.h" + +void integer_adm_decouple_avx512(i_dwt2buffers ref, i_dwt2buffers dist, + i_dwt2buffers i_dlm_rest, adm_i32_dtype *i_dlm_add, + int32_t *adm_div_lookup, float border_size, double *adm_score_den); \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx2.c b/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx2.c index e3603356a..dae81b0ad 100644 --- a/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx2.c +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx2.c @@ -30,23 +30,6 @@ #include "../integer_funque_filters.h" #include -#define hor_sum_and_store(addr, r) \ -{ \ - __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(r), _mm256_extracti128_si256(r, 1)); \ - __m128i r2 = _mm_hadd_epi32(r4, r4); \ - __m128i r1 = _mm_hadd_epi32(r2, r2); \ - int r = _mm_cvtsi128_si32(r1); \ - dst[dst_row_idx + j] = (spat_fil_output_dtype) ((r + SPAT_FILTER_OUT_RND) >> SPAT_FILTER_OUT_SHIFT); \ -} - -#define shuffle_and_store(addr, v0, v8) \ -{ \ - __m256i r0 = _mm256_permute2x128_si256(v0, v8, 0x20); \ - __m256i r8 = _mm256_permute2x128_si256(v0, v8, 0x31); \ - _mm256_store_si256((__m256i*)(addr), r0); \ - _mm256_store_si256((__m256i*)(addr + 16), r8); \ -} - void integer_funque_dwt2_avx2(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_dst, ptrdiff_t dst_stride, int width, int height) { int dst_px_stride = dst_stride / sizeof(dwt2_dtype); @@ -60,9 +43,6 @@ void integer_funque_dwt2_avx2(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_ds const int8_t filter_shift = 1 + DWT2_OUT_SHIFT; const int8_t filter_shift_rnd = 1<<(filter_shift - 1); - __m256i filter_shift_256 = _mm256_set1_epi32(filter_shift); - __m128i filter_shift_128 = _mm_set1_epi32(filter_shift); - /** * Last column due to padding the values are left shifted and then right shifted * Hence using updated shifts. Subtracting 1 due to left shift @@ -84,10 +64,16 @@ void integer_funque_dwt2_avx2(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_ds int i, j; - int width_rem_size = width_div_2 - (width_div_2 % 16); + int width_rem_size16 = width_div_2 - (width_div_2 % 16); int width_rem_size8 = width_div_2 - (width_div_2 % 8); + int width_rem_size4 = width_div_2 - (width_div_2 % 4); + __m256i filter_shift_256 = _mm256_set1_epi32(filter_shift); __m256i idx_perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i filter_shift_128 = _mm_set1_epi32(filter_shift); + __m128i zero_128 = _mm_setzero_si128(); for (i=0; i < (height+1)/2; ++i) { @@ -96,8 +82,8 @@ void integer_funque_dwt2_avx2(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_ds row_idx1 = row_idx1 < height ? 
row_idx1 : 2*i; row0_offset = (row_idx0)*width; row1_offset = (row_idx1)*width; - - for(j=0; j< width_rem_size; j+=16) + j = 0; + for(; j< width_rem_size16; j+=16) { int col_idx0 = (j << 1); @@ -177,10 +163,62 @@ void integer_funque_dwt2_avx2(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_ds { int col_idx0 = (j << 1); + __m256i src_a_256 = _mm256_loadu_si256((__m256i*)(src + row0_offset + col_idx0)); + __m256i src_b_256 = _mm256_loadu_si256((__m256i*)(src + row1_offset + col_idx0)); + + // Original + //F* F (a + b + c + d) - band A (F*F is 1/2) + //F* F (a - b + c - d) - band H (F*F is 1/2) + //F* F (a + b - c + d) - band V (F*F is 1/2) + //F* F (a - b - c - d) - band D (F*F is 1/2) + + __m256i a_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_a_256)); + __m256i a_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_a_256, 1)); + __m256i b_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_b_256)); + __m256i b_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_b_256, 1)); + + __m256i a_p_b_c_p_d_lo = _mm256_add_epi32(a_lo, b_lo); + __m256i a_p_b_c_p_d_hi = _mm256_add_epi32(a_hi, b_hi); + __m256i a_m_b_c_m_d_lo = _mm256_sub_epi32(a_lo, b_lo); + __m256i a_m_b_c_m_d_hi = _mm256_sub_epi32(a_hi, b_hi); + + __m256i band_a_256 = _mm256_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + __m256i band_v_256 = _mm256_hsub_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + __m256i band_h_256 = _mm256_hadd_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); + __m256i band_d_256 = _mm256_hsub_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); + + band_a_256 = _mm256_add_epi32(band_a_256, filter_shift_256); + band_v_256 = _mm256_add_epi32(band_v_256, filter_shift_256); + band_h_256 = _mm256_add_epi32(band_h_256, filter_shift_256); + band_d_256 = _mm256_add_epi32(band_d_256, filter_shift_256); + + band_a_256 = _mm256_srai_epi32(band_a_256, filter_shift_rnd); + band_h_256 = _mm256_srai_epi32(band_h_256, filter_shift_rnd); + band_v_256 = _mm256_srai_epi32(band_v_256, filter_shift_rnd); + band_d_256 = _mm256_srai_epi32(band_d_256, filter_shift_rnd); + + band_a_256 = _mm256_packs_epi32(band_a_256, zero_256); + band_h_256 = _mm256_packs_epi32(band_h_256, zero_256); + band_v_256 = _mm256_packs_epi32(band_v_256, zero_256); + band_d_256 = _mm256_packs_epi32(band_d_256, zero_256); + + band_a_256 = _mm256_permutevar8x32_epi32(band_a_256, idx_perm); + band_h_256 = _mm256_permutevar8x32_epi32(band_h_256, idx_perm); + band_v_256 = _mm256_permutevar8x32_epi32(band_v_256, idx_perm); + band_d_256 = _mm256_permutevar8x32_epi32(band_d_256, idx_perm); + + _mm_storeu_si128((__m128i*)(band_a + i * dst_px_stride + j), _mm256_castsi256_si128(band_a_256)); + _mm_storeu_si128((__m128i*)(band_h + i * dst_px_stride + j), _mm256_castsi256_si128(band_h_256)); + _mm_storeu_si128((__m128i*)(band_v + i * dst_px_stride + j), _mm256_castsi256_si128(band_v_256)); + _mm_storeu_si128((__m128i*)(band_d + i * dst_px_stride + j), _mm256_castsi256_si128(band_d_256)); + } + + for(; j< width_rem_size4; j+=4) + { + int col_idx0 = (j << 1); + __m128i src_a_128 = _mm_loadu_si128((__m128i*)(src + row0_offset + col_idx0)); __m128i src_b_128 = _mm_loadu_si128((__m128i*)(src + row1_offset + col_idx0)); - __m128i src2_a_128 = _mm_loadu_si128((__m128i*)(src + row0_offset + col_idx0 + 8)); - __m128i src2_b_128 = _mm_loadu_si128((__m128i*)(src + row1_offset + col_idx0 + 8)); // Original //F* F (a + b + c + d) - band A (F*F is 1/2) @@ -188,60 +226,40 @@ void integer_funque_dwt2_avx2(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_ds //F* F (a + b - c + d) - band V (F*F is 1/2) 
//F* F (a - b - c - d) - band D (F*F is 1/2) - __m128i a_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_a_128, _mm_setzero_si128())); - __m128i a_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_a_128, _mm_setzero_si128())); - __m128i b_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_b_128, _mm_setzero_si128())); - __m128i b_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_b_128, _mm_setzero_si128())); - __m128i a2_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src2_a_128, _mm_setzero_si128())); - __m128i a2_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src2_a_128, _mm_setzero_si128())); - __m128i b2_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src2_b_128, _mm_setzero_si128())); - __m128i b2_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src2_b_128, _mm_setzero_si128())); + __m128i a_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_a_128, zero_128)); + __m128i a_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_a_128, zero_128)); + __m128i b_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_b_128, zero_128)); + __m128i b_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_b_128, zero_128)); __m128i a_p_b_c_p_d_lo = _mm_add_epi32(a_lo, b_lo); __m128i a_p_b_c_p_d_hi = _mm_add_epi32(a_hi, b_hi); __m128i a_m_b_c_m_d_lo = _mm_sub_epi32(a_lo, b_lo); __m128i a_m_b_c_m_d_hi = _mm_sub_epi32(a_hi, b_hi); - __m128i a_p_b_c_p_d_2_lo = _mm_add_epi32(a2_lo, b2_lo); - __m128i a_p_b_c_p_d_2_hi = _mm_add_epi32(a2_hi, b2_hi); - __m128i a_m_b_c_m_d_2_lo = _mm_sub_epi32(a2_lo, b2_lo); - __m128i a_m_b_c_m_d_2_hi = _mm_sub_epi32(a2_hi, b2_hi); __m128i band_a_128 = _mm_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); __m128i band_h_128 = _mm_hadd_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); __m128i band_v_128 = _mm_hsub_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); __m128i band_d_128 = _mm_hsub_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); - __m128i band_a2_128 = _mm_hadd_epi32(a_p_b_c_p_d_2_lo, a_p_b_c_p_d_2_hi); - __m128i band_h2_128 = _mm_hadd_epi32(a_m_b_c_m_d_2_lo, a_m_b_c_m_d_2_hi); - __m128i band_v2_128 = _mm_hsub_epi32(a_p_b_c_p_d_2_lo, a_p_b_c_p_d_2_hi); - __m128i band_d2_128 = _mm_hsub_epi32(a_m_b_c_m_d_2_lo, a_m_b_c_m_d_2_hi); band_a_128 = _mm_add_epi32(band_a_128, filter_shift_128); band_h_128 = _mm_add_epi32(band_h_128, filter_shift_128); band_v_128 = _mm_add_epi32(band_v_128, filter_shift_128); band_d_128 = _mm_add_epi32(band_d_128, filter_shift_128); - band_a2_128 = _mm_add_epi32(band_a2_128, filter_shift_128); - band_h2_128 = _mm_add_epi32(band_h2_128, filter_shift_128); - band_v2_128 = _mm_add_epi32(band_v2_128, filter_shift_128); - band_d2_128 = _mm_add_epi32(band_d2_128, filter_shift_128); band_a_128 = _mm_srai_epi32(band_a_128, filter_shift_rnd); band_h_128 = _mm_srai_epi32(band_h_128, filter_shift_rnd); band_v_128 = _mm_srai_epi32(band_v_128, filter_shift_rnd); band_d_128 = _mm_srai_epi32(band_d_128, filter_shift_rnd); - band_a2_128 = _mm_srai_epi32(band_a2_128, filter_shift_rnd); - band_h2_128 = _mm_srai_epi32(band_h2_128, filter_shift_rnd); - band_v2_128 = _mm_srai_epi32(band_v2_128, filter_shift_rnd); - band_d2_128 = _mm_srai_epi32(band_d2_128, filter_shift_rnd); - - band_a_128 = _mm_packs_epi32(band_a_128, band_a2_128); - band_h_128 = _mm_packs_epi32(band_h_128, band_h2_128); - band_v_128 = _mm_packs_epi32(band_v_128, band_v2_128); - band_d_128 = _mm_packs_epi32(band_d_128, band_d2_128); + + band_a_128 = _mm_packs_epi32(band_a_128, zero_128); + band_h_128 = _mm_packs_epi32(band_h_128, zero_128); + band_v_128 = _mm_packs_epi32(band_v_128, zero_128); + band_d_128 = _mm_packs_epi32(band_d_128, zero_128); - 
_mm_storeu_si128((__m128i*)(band_a + i * dst_px_stride + j), band_a_128); - _mm_storeu_si128((__m128i*)(band_h + i * dst_px_stride + j), band_h_128); - _mm_storeu_si128((__m128i*)(band_v + i * dst_px_stride + j), band_v_128); - _mm_storeu_si128((__m128i*)(band_d + i * dst_px_stride + j), band_d_128); + _mm_storel_epi64((__m128i*)(band_a + i * dst_px_stride + j), band_a_128); + _mm_storel_epi64((__m128i*)(band_h + i * dst_px_stride + j), band_h_128); + _mm_storel_epi64((__m128i*)(band_v + i * dst_px_stride + j), band_v_128); + _mm_storel_epi64((__m128i*)(band_d + i * dst_px_stride + j), band_d_128); } for(; j< width_div_2; ++j) @@ -318,8 +336,6 @@ void integer_funque_vifdwt2_band0_avx2(dwt2_dtype *src, dwt2_dtype *band_a, ptrd */ const int8_t filter_shift = 1 + DWT2_OUT_SHIFT; const int8_t filter_shift_rnd = 1<<(filter_shift - 1); - __m256i filter_shift_256 = _mm256_set1_epi32(filter_shift); - __m128i filter_shift_128 = _mm_set1_epi32(filter_shift); /** * Last column due to padding the values are left shifted and then right shifted @@ -339,7 +355,14 @@ void integer_funque_vifdwt2_band0_avx2(dwt2_dtype *src, dwt2_dtype *band_a, ptrd int width_rem_size16 = width_div_2 - (width_div_2 % 16); int width_rem_size8 = width_div_2 - (width_div_2 % 8); + int width_rem_size4 = width_div_2 - (width_div_2 % 4); + + __m256i filter_shift_256 = _mm256_set1_epi32(filter_shift); __m256i idx_perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i filter_shift_128 = _mm_set1_epi32(filter_shift); + __m128i zero_128 = _mm_setzero_si128(); for (i=0; i < (height+1)/2; ++i) { @@ -348,8 +371,8 @@ void integer_funque_vifdwt2_band0_avx2(dwt2_dtype *src, dwt2_dtype *band_a, ptrd row_idx1 = row_idx1 < height ? row_idx1 : 2*i; row0_offset = (row_idx0)*width; row1_offset = (row_idx1)*width; - - for(j=0; j< width_rem_size16; j+=16) + j = 0; + for(; j< width_rem_size16; j+=16) { int col_idx0 = (j << 1); @@ -390,10 +413,34 @@ void integer_funque_vifdwt2_band0_avx2(dwt2_dtype *src, dwt2_dtype *band_a, ptrd { int col_idx0 = (j << 1); + __m256i src_a_256 = _mm256_loadu_si256((__m256i*)(src + row0_offset + col_idx0)); + __m256i src_b_256 = _mm256_loadu_si256((__m256i*)(src + row1_offset + col_idx0)); + + __m256i a_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_a_256)); + __m256i a_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_a_256, 1)); + __m256i b_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_b_256)); + __m256i b_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_b_256, 1)); + + __m256i a_p_b_c_p_d_lo = _mm256_add_epi32(a_lo, b_lo); + __m256i a_p_b_c_p_d_hi = _mm256_add_epi32(a_hi, b_hi); + + __m256i band_a_256 = _mm256_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + + band_a_256 = _mm256_add_epi32(band_a_256, filter_shift_256); + band_a_256 = _mm256_srai_epi32(band_a_256, filter_shift_rnd); + + band_a_256 = _mm256_packs_epi32(band_a_256, zero_256); + band_a_256 = _mm256_permutevar8x32_epi32(band_a_256, idx_perm); + + _mm256_storeu_si256((__m256i*)(band_a + i * dst_px_stride + j), band_a_256); + } + + for(; j< width_rem_size4; j+=4) + { + int col_idx0 = (j << 1); + __m128i src_a_128 = _mm_loadu_si128((__m128i*)(src + row0_offset + col_idx0)); __m128i src_b_128 = _mm_loadu_si128((__m128i*)(src + row1_offset + col_idx0)); - __m128i src2_a_128 = _mm_loadu_si128((__m128i*)(src + row0_offset + col_idx0 + 8)); - __m128i src2_b_128 = _mm_loadu_si128((__m128i*)(src + row1_offset + col_idx0 + 8)); // Original //F* F (a + b + c + d) - band A (F*F is 1/2) @@ 
-401,32 +448,22 @@ void integer_funque_vifdwt2_band0_avx2(dwt2_dtype *src, dwt2_dtype *band_a, ptrd //F* F (a + b - c + d) - band V (F*F is 1/2) //F* F (a - b - c - d) - band D (F*F is 1/2) - __m128i a_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_a_128, _mm_setzero_si128())); - __m128i a_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_a_128, _mm_setzero_si128())); - __m128i b_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_b_128, _mm_setzero_si128())); - __m128i b_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_b_128, _mm_setzero_si128())); - __m128i a2_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src2_a_128, _mm_setzero_si128())); - __m128i a2_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src2_a_128, _mm_setzero_si128())); - __m128i b2_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src2_b_128, _mm_setzero_si128())); - __m128i b2_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src2_b_128, _mm_setzero_si128())); + __m128i a_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_a_128, zero_128)); + __m128i a_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_a_128, zero_128)); + __m128i b_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_b_128, zero_128)); + __m128i b_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_b_128, zero_128)); __m128i a_p_b_c_p_d_lo = _mm_add_epi32(a_lo, b_lo); __m128i a_p_b_c_p_d_hi = _mm_add_epi32(a_hi, b_hi); - __m128i a_p_b_c_p_d_2_lo = _mm_add_epi32(a2_lo, b2_lo); - __m128i a_p_b_c_p_d_2_hi = _mm_add_epi32(a2_hi, b2_hi); __m128i band_a_128 = _mm_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); - __m128i band_a2_128 = _mm_hadd_epi32(a_p_b_c_p_d_2_lo, a_p_b_c_p_d_2_hi); band_a_128 = _mm_add_epi32(band_a_128, filter_shift_128); - band_a2_128 = _mm_add_epi32(band_a2_128, filter_shift_128); - band_a_128 = _mm_srai_epi32(band_a_128, filter_shift_rnd); - band_a2_128 = _mm_srai_epi32(band_a2_128, filter_shift_rnd); - band_a_128 = _mm_packs_epi32(band_a_128, band_a2_128); + band_a_128 = _mm_packs_epi32(band_a_128, zero_128); - _mm_storeu_si128((__m128i*)(band_a + i * dst_px_stride + j), band_a_128); + _mm_storel_epi64((__m128i*)(band_a + i * dst_px_stride + j), band_a_128); } for(; j< width_div_2; ++j) @@ -479,49 +516,68 @@ void integer_funque_vifdwt2_band0_avx2(dwt2_dtype *src, dwt2_dtype *band_a, ptrd static inline void integer_horizontal_filter_avx2(spat_fil_inter_dtype *tmp, spat_fil_output_dtype *dst, const spat_fil_coeff_dtype *i_filter_coeffs, int width, int fwidth, int dst_row_idx, int half_fw) { int j, fj, jj1, jj2; - __m256i mul0_lo, mul0_hi, mul1_lo, mul1_hi, res0, res4, res8, res12; - __m256i tmp0_lo, tmp0_hi, tmp1_lo, tmp1_hi; + __m256i res0_256, res4_256, res8_256, res12_256; + __m128i res0_128, res4_128, res8_128, res12_128; - __m128i mul0_lo_sse, mul0_hi_sse, mul1_lo_sse, mul1_hi_sse, res0_sse, res4_sse, res8_sse, res12_sse; - __m128i tmp0_lo_sse, tmp0_hi_sse; + int width_rem_size32 = (width - half_fw) - ((width - 2*half_fw) % 32); + int width_rem_size16 = (width - half_fw) - ((width - 2*half_fw) % 16); + int width_rem_size8 = (width - half_fw) - ((width - 2*half_fw) % 8); const spat_fil_coeff_dtype i_filter_coeffs_with_zeros[51] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -900, -1054, -1239, -1452, -1669, -1798, -1547, -66, 4677, 14498, 21495, 14498, 4677, -66, -1547, -1798, -1669, -1452, -1239, -1054, -900, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + const spat_fil_accum_dtype i32_filter_coeffs[11] = { + -900 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), + -1239 + (spat_fil_accum_dtype)(((unsigned int)-1452) << 16) + (1 << 16), + -1669 + 
(spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), + -1547 + (spat_fil_accum_dtype)(((unsigned int)-66) << 16) + (1 << 16), + 4677 + (14498 << 16) /* + (1 << 16) */, + 21495 + (14498 << 16) /* + (1 << 16) */, + 4677 + (spat_fil_accum_dtype)(((unsigned int)-66) << 16) /* + (1 << 16) */, + -1547 + (spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), + -1669 + (spat_fil_accum_dtype)(((unsigned int)-1452) << 16) + (1 << 16), + -1239 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), + -900 + (1 << 16) }; (void)fwidth; - res0 = res4 = res8 = res12 = _mm256_setzero_si256(); + __m256i d0 = _mm256_load_si256((__m256i*)(tmp)); __m256i d1 = _mm256_load_si256((__m256i*)(tmp + 16)); int half_filter_table_w = 25; - for (j = 0; j < 6; j++) + for (j = 0; j < (half_fw / 2) + 1; j++) { int fi0 = half_filter_table_w - j; int fi1 = j + half_filter_table_w + 1; __m256i coef0 = _mm256_loadu_si256((__m256i*)(i_filter_coeffs_with_zeros + fi0)); __m256i coef1 = _mm256_loadu_si256((__m256i*)(i_filter_coeffs_with_zeros + fi1)); - mul0_lo = _mm256_mullo_epi16(d0, coef0); - mul0_hi = _mm256_mulhi_epi16(d0, coef0); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef0); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef0); - mul1_lo = _mm256_mullo_epi16(d0, coef1); - mul1_hi = _mm256_mulhi_epi16(d0, coef1); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0, coef1); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0, coef1); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); tmp0_lo = _mm256_add_epi32(tmp0_lo, tmp0_hi); tmp0_hi = _mm256_add_epi32(tmp1_lo, tmp1_hi); - res0 = _mm256_add_epi32(tmp0_lo, tmp0_hi); - hor_sum_and_store(dst[dst_row_idx + j], res0); + res0_256 = _mm256_add_epi32(tmp0_lo, tmp0_hi); + + __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(res0_256), _mm256_extracti128_si256(res0_256, 1)); + __m128i r2 = _mm_hadd_epi32(r4, r4); + __m128i r1 = _mm_hadd_epi32(r2, r2); + int r = _mm_cvtsi128_si32(r1); + dst[dst_row_idx + j] = (spat_fil_output_dtype) ((r + SPAT_FILTER_OUT_RND) >> SPAT_FILTER_OUT_SHIFT); } - for (; j < 10; j++) + for (; j < half_fw; j++) { int fi0 = half_filter_table_w - j; int fi1 = j + half_filter_table_w + 1; @@ -533,123 +589,176 @@ static inline void integer_horizontal_filter_avx2(spat_fil_inter_dtype *tmp, spa coef0 = _mm256_add_epi16(coef0, coef1); - mul0_lo = _mm256_mullo_epi16(d0, coef0); - mul0_hi = _mm256_mulhi_epi16(d0, coef0); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef0); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef0); - mul1_lo = _mm256_mullo_epi16(d1, coef2); - mul1_hi = _mm256_mulhi_epi16(d1, coef2); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef2); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef2); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = 
_mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); tmp0_lo = _mm256_add_epi32(tmp0_lo, tmp0_hi); tmp0_hi = _mm256_add_epi32(tmp1_lo, tmp1_hi); - res0 = _mm256_add_epi32(tmp0_lo, tmp0_hi); - hor_sum_and_store(dst[dst_row_idx + j], res0); + res0_256 = _mm256_add_epi32(tmp0_lo, tmp0_hi); + __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(res0_256), _mm256_extracti128_si256(res0_256, 1)); + __m128i r2 = _mm_hadd_epi32(r4, r4); + __m128i r1 = _mm_hadd_epi32(r2, r2); + int r = _mm_cvtsi128_si32(r1); + dst[dst_row_idx + j] = (spat_fil_output_dtype) ((r + SPAT_FILTER_OUT_RND) >> SPAT_FILTER_OUT_SHIFT); } - - int width_rem_size16 = (width - half_fw) - ((width - 2*half_fw) % 16); - int width_rem_size8 = (width - half_fw) - ((width - 2*half_fw) % 8); - + //This is the core loop - for (; j < width_rem_size16; j+=16) + __m256i coef0_256 = _mm256_set1_epi16(i_filter_coeffs[0]); + for (; j < width_rem_size32; j+=32) { int f_l_j = j - half_fw; int f_r_j = j + half_fw; - res0 = res4 = res8 = res12 = _mm256_set1_epi32(SPAT_FILTER_OUT_RND); + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(SPAT_FILTER_OUT_RND); for (fj = 0; fj < half_fw; fj+=2){ - jj1 = f_l_j + fj; - jj2 = f_r_j - fj; - __m256i coef0 = _mm256_set1_epi16(i_filter_coeffs[fj]); - __m256i coef1 = _mm256_set1_epi16(i_filter_coeffs[fj+1]); - + jj1 = f_l_j + fj*2; + + __m256i coef0 = _mm256_set1_epi32(i32_filter_coeffs[fj]); + __m256i coef1 = _mm256_set1_epi32(i32_filter_coeffs[fj+1]); + __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + jj1)); - __m256i d20 = _mm256_loadu_si256((__m256i*)(tmp + jj2)); + __m256i d2 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 2)); __m256i d1 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 1)); - __m256i d19 = _mm256_loadu_si256((__m256i*)(tmp + jj2 - 1)); + __m256i d3 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 3)); + + __m256i d0_16 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 16)); + __m256i d2_16 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 18)); + __m256i d1_16 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 17)); + __m256i d3_16 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 19)); + + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(d0, coef0)); + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(d2, coef1)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(d1, coef0)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(d3, coef1)); + + res8_256 = _mm256_add_epi32(res8_256, _mm256_madd_epi16(d0_16, coef0)); + res8_256 = _mm256_add_epi32(res8_256, _mm256_madd_epi16(d2_16, coef1)); + res12_256 = _mm256_add_epi32(res12_256, _mm256_madd_epi16(d1_16, coef0)); + res12_256 = _mm256_add_epi32(res12_256, _mm256_madd_epi16(d3_16, coef1)); + } + __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + f_r_j)); + __m256i d16 = _mm256_loadu_si256((__m256i*)(tmp + f_r_j + 16)); + + __m256i tmp0 = _mm256_unpacklo_epi32(res0_256, res4_256); + __m256i tmp4 = _mm256_unpackhi_epi32(res0_256, res4_256); + __m256i tmp16 = _mm256_unpacklo_epi32(res8_256, res12_256); + __m256i tmp20 = _mm256_unpackhi_epi32(res8_256, res12_256); + + __m256i mul0_lo = _mm256_mullo_epi16(d0, coef0_256); + __m256i mul0_hi = _mm256_mulhi_epi16(d0, coef0_256); + __m256i mul16_lo = _mm256_mullo_epi16(d16, coef0_256); + __m256i mul16_hi = _mm256_mulhi_epi16(d16, coef0_256); + + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + __m256i tmp16_lo = _mm256_unpacklo_epi16(mul16_lo, mul16_hi); + __m256i 
tmp16_hi = _mm256_unpackhi_epi16(mul16_lo, mul16_hi); + + tmp0 = _mm256_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm256_add_epi32(tmp4, tmp0_hi); + tmp16 = _mm256_add_epi32(tmp16, tmp16_lo); + tmp20 = _mm256_add_epi32(tmp20, tmp16_hi); + + tmp0 = _mm256_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm256_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + tmp16 = _mm256_srai_epi32(tmp16, SPAT_FILTER_OUT_SHIFT); + tmp20 = _mm256_srai_epi32(tmp20, SPAT_FILTER_OUT_SHIFT); + + res0_256 = _mm256_packs_epi32(tmp0, tmp4); + res8_256 = _mm256_packs_epi32(tmp16, tmp20); + + _mm256_storeu_si256((__m256i*)(dst + dst_row_idx + j), res0_256); + _mm256_storeu_si256((__m256i*)(dst + dst_row_idx + j + 16), res8_256); + } + + for (; j < width_rem_size16; j+=16) + { + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(SPAT_FILTER_OUT_RND); - __m256i d0_20_lo = _mm256_unpacklo_epi16(d0, d20); - __m256i d0_20_hi = _mm256_unpackhi_epi16(d0, d20); - __m256i d1_19_lo = _mm256_unpacklo_epi16(d1, d19); - __m256i d1_19_hi = _mm256_unpackhi_epi16(d1, d19); + for (fj = 0; fj < half_fw; fj+=2){ + jj1 = f_l_j + fj*2; - mul0_lo = _mm256_madd_epi16(d0_20_lo, coef0); - mul0_hi = _mm256_madd_epi16(d0_20_hi, coef0); - mul1_lo = _mm256_madd_epi16(d1_19_lo, coef1); - mul1_hi = _mm256_madd_epi16(d1_19_hi, coef1); + __m256i coef0 = _mm256_set1_epi32(i32_filter_coeffs[fj]); + __m256i coef1 = _mm256_set1_epi32(i32_filter_coeffs[fj+1]); - __m256i t0_lo = _mm256_add_epi32(mul0_lo, mul1_lo); - __m256i t1_lo = _mm256_add_epi32(mul0_hi, mul1_hi); + __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + jj1)); + __m256i d2 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 2)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 1)); + __m256i d3 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 3)); - res0 = _mm256_add_epi32(res0, t0_lo); - res4 = _mm256_add_epi32(res4, t1_lo); - } - __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + j)); - __m256i coef = _mm256_set1_epi16(i_filter_coeffs[half_fw]); - mul0_lo = _mm256_mullo_epi16(d0, coef); - mul0_hi = _mm256_mulhi_epi16(d0, coef); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - res0 = _mm256_add_epi32(res0, tmp0_lo); - res4 = _mm256_add_epi32(res4, tmp0_hi); - res0 = _mm256_srai_epi32(res0, SPAT_FILTER_OUT_SHIFT); - res4 = _mm256_srai_epi32(res4, SPAT_FILTER_OUT_SHIFT); + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(d0, coef0)); + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(d2, coef1)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(d1, coef0)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(d3, coef1)); + } + __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + f_r_j)); + __m256i tmp0 = _mm256_unpacklo_epi32(res0_256, res4_256); + __m256i tmp4 = _mm256_unpackhi_epi32(res0_256, res4_256); + __m256i mul0_lo = _mm256_mullo_epi16(d0, coef0_256); + __m256i mul0_hi = _mm256_mulhi_epi16(d0, coef0_256); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + + tmp0 = _mm256_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm256_add_epi32(tmp4, tmp0_hi); + + tmp0 = _mm256_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm256_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); - res0 = _mm256_packs_epi32(res0, res4); - _mm256_storeu_si256((__m256i*)(dst + dst_row_idx + j), res0); + res0_256 = _mm256_packs_epi32(tmp0, tmp4); + _mm256_storeu_si256((__m256i*)(dst + dst_row_idx + j), res0_256); } + __m128i 
coef0_128 = _mm_set1_epi16(i_filter_coeffs[0]); for (; j < width_rem_size8; j+=8) { int f_l_j = j - half_fw; int f_r_j = j + half_fw; - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(SPAT_FILTER_OUT_RND); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(SPAT_FILTER_OUT_RND); for (fj = 0; fj < half_fw; fj+=2){ - jj1 = f_l_j + fj; - jj2 = f_r_j - fj; - __m128i coef0 = _mm_set1_epi16(i_filter_coeffs[fj]); - __m128i coef1 = _mm_set1_epi16(i_filter_coeffs[fj+1]); + jj1 = f_l_j + fj*2; + + __m128i coef0 = _mm_set1_epi32(i32_filter_coeffs[fj]); + __m128i coef1 = _mm_set1_epi32(i32_filter_coeffs[fj+1]); __m128i d0 = _mm_loadu_si128((__m128i*)(tmp + jj1)); - __m128i d20 = _mm_loadu_si128((__m128i*)(tmp + jj2)); + __m128i d2 = _mm_loadu_si128((__m128i*)(tmp + jj1 + 2)); __m128i d1 = _mm_loadu_si128((__m128i*)(tmp + jj1 + 1)); - __m128i d19 = _mm_loadu_si128((__m128i*)(tmp + jj2 - 1)); + __m128i d3 = _mm_loadu_si128((__m128i*)(tmp + jj1 + 3)); - __m128i d0_20_lo = _mm_unpacklo_epi16(d0, d20); - __m128i d0_20_hi = _mm_unpackhi_epi16(d0, d20); - __m128i d1_19_lo = _mm_unpacklo_epi16(d1, d19); - __m128i d1_19_hi = _mm_unpackhi_epi16(d1, d19); - - mul0_lo_sse = _mm_madd_epi16(d0_20_lo, coef0); - mul0_hi_sse = _mm_madd_epi16(d0_20_hi, coef0); - mul1_lo_sse = _mm_madd_epi16(d1_19_lo, coef1); - mul1_hi_sse = _mm_madd_epi16(d1_19_hi, coef1); - - __m128i t0_lo = _mm_add_epi32(mul0_lo_sse, mul1_lo_sse); - __m128i t1_lo = _mm_add_epi32(mul0_hi_sse, mul1_hi_sse); - - res0_sse = _mm_add_epi32(res0_sse, t0_lo); - res4_sse = _mm_add_epi32(res4_sse, t1_lo); + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(d0, coef0)); + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(d2, coef1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(d1, coef0)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(d3, coef1)); } - __m128i d0 = _mm_loadu_si128((__m128i*)(tmp + j)); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[half_fw]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - res0_sse = _mm_add_epi32(res0_sse, tmp0_lo_sse); - res4_sse = _mm_add_epi32(res4_sse, tmp0_hi_sse); - res0_sse = _mm_srai_epi32(res0_sse, SPAT_FILTER_OUT_SHIFT); - res4_sse = _mm_srai_epi32(res4_sse, SPAT_FILTER_OUT_SHIFT); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - - _mm_storeu_si128((__m128i*)(dst + dst_row_idx + j), res0_sse); - } + __m128i d0 = _mm_loadu_si128((__m128i*)(tmp + f_r_j)); + __m128i tmp0 = _mm_unpacklo_epi32(res0_128, res4_128); + __m128i tmp4 = _mm_unpackhi_epi32(res0_128, res4_128); + __m128i mul0_lo = _mm_mullo_epi16(d0, coef0_128); + __m128i mul0_hi = _mm_mulhi_epi16(d0, coef0_128); + __m128i tmp0_lo = _mm_unpacklo_epi16(mul0_lo, mul0_hi); + __m128i tmp0_hi = _mm_unpackhi_epi16(mul0_lo, mul0_hi); + + tmp0 = _mm_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm_add_epi32(tmp4, tmp0_hi); + tmp0 = _mm_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + res0_128 = _mm_packs_epi32(tmp0, tmp4); + + _mm_storeu_si128((__m128i*)(dst + dst_row_idx + j), res0_128); + } for (; j < (width - half_fw); j++) { @@ -677,7 +786,7 @@ static inline void integer_horizontal_filter_avx2(spat_fil_inter_dtype *tmp, spa d0 = _mm256_loadu_si256((__m256i*)(tmp + j - 6)); d1 = _mm256_loadu_si256((__m256i*)(tmp + j - 22)); - for (; j < (width - 6); j++) + for (; j < (width - ((half_fw / 2) + 1)); j++) { int fi0 = 
half_filter_table_w + width - half_fw - j - 6; int fi1 = j - width + half_fw; @@ -689,23 +798,23 @@ static inline void integer_horizontal_filter_avx2(spat_fil_inter_dtype *tmp, spa coef0 = _mm256_add_epi16(coef0, coef1); - mul0_lo = _mm256_mullo_epi16(d0, coef0); - mul0_hi = _mm256_mulhi_epi16(d0, coef0); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef0); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef0); - mul1_lo = _mm256_mullo_epi16(d1, coef2); - mul1_hi = _mm256_mulhi_epi16(d1, coef2); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef2); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef2); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); tmp0_lo = _mm256_add_epi32(tmp0_lo, tmp0_hi); tmp1_lo = _mm256_add_epi32(tmp1_lo, tmp1_hi); - res0 = _mm256_add_epi32(tmp0_lo, tmp1_lo); + res0_256 = _mm256_add_epi32(tmp0_lo, tmp1_lo); - __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(res0), _mm256_extracti128_si256(res0, 1)); + __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(res0_256), _mm256_extracti128_si256(res0_256,1)); __m128i r2 = _mm_hadd_epi32(r4, r4); __m128i r1 = _mm_hadd_epi32(r2, r2); @@ -721,24 +830,24 @@ static inline void integer_horizontal_filter_avx2(spat_fil_inter_dtype *tmp, spa __m256i coef0 = _mm256_loadu_si256((__m256i*)(i_filter_coeffs_with_zeros + fi0)); __m256i coef1 = _mm256_loadu_si256((__m256i*)(i_filter_coeffs_with_zeros + fi1)); - mul0_lo = _mm256_mullo_epi16(d0, coef0); - mul0_hi = _mm256_mulhi_epi16(d0, coef0); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef0); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef0); - mul1_lo = _mm256_mullo_epi16(d0, coef1); - mul1_hi = _mm256_mulhi_epi16(d0, coef1); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0, coef1); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0, coef1); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); tmp0_lo = _mm256_add_epi32(tmp0_lo, tmp0_hi); tmp0_hi = _mm256_add_epi32(tmp1_lo, tmp1_hi); - res0 = _mm256_add_epi32(tmp0_lo, tmp0_hi); + res0_256 = _mm256_add_epi32(tmp0_lo, tmp0_hi); - __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(res0), _mm256_extracti128_si256(res0, 1)); + __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(res0_256), _mm256_extracti128_si256(res0_256,1)); __m128i r2 = _mm_hadd_epi32(r4, r4); __m128i r1 = _mm_hadd_epi32(r2, r2); int r = _mm_cvtsi128_si32(r1); @@ -754,7 +863,7 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt }; // For madd version - const spat_fil_accum_dtype i32_filter_coeffs2[11] = { + const spat_fil_accum_dtype i32_filter_coeffs[11] = { -900 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), -1239 + (spat_fil_accum_dtype)(((unsigned int)-1452) 
<< 16) + (1 << 16), -1669 + (spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), @@ -770,7 +879,7 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt int src_px_stride = width; int dst_px_stride = width; - int width_rem_size = width - (width % 32); + int width_rem_size64 = width - (width % 64); int width_rem_size32 = width - (width % 32); int width_rem_size16 = width - (width % 16); int width_rem_size8 = width - (width % 8); @@ -805,11 +914,11 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt interim_rnd = (1 << (interim_shift - 1)); } - __m256i mul0_lo, mul0_hi, mul1_lo, mul1_hi, res0, res4, res8, res12; - __m256i tmp0_lo, tmp0_hi, tmp1_lo, tmp1_hi; - - __m128i mul0_lo_sse, mul0_hi_sse, mul1_lo_sse, mul1_hi_sse, res0_sse, res4_sse, res8_sse, res12_sse; - __m128i tmp0_lo_sse, tmp0_hi_sse, tmp1_lo_sse, tmp1_hi_sse; + __m256i res0_256, res4_256, res8_256, res12_256; + __m256i res0_32_256, res4_32_256, res8_32_256, res12_32_256; + __m256i zero_256 = _mm256_setzero_si256(); + __m128i res0_128, res4_128, res8_128, res12_128; + __m128i zero_128 = _mm_setzero_si128(); /** * The loop i=0 to height is split into 3 parts @@ -819,13 +928,14 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt int diff_i_halffw = i - half_fw; int pro_mir_end = -diff_i_halffw - 1; - + j = 0; /* Vertical pass. */ if(8 == bitdepth) { - for (j = 0; j < width_rem_size32; j+=32){ - res0 = res4 = res8 = res12 = _mm256_set1_epi32(interim_rnd); + for (; j < width_rem_size64; j+=64){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + res0_32_256 = res4_32_256 = res8_32_256 = res12_32_256 = _mm256_set1_epi32(interim_rnd); /** * The full loop is from fi = 0 to fwidth * During the loop when the centre pixel is at i, @@ -835,67 +945,191 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) for (fi = 0; fi <= pro_mir_end; fi++){ ii = pro_mir_end - fi; - __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); - __m256i d0_lo = _mm256_unpacklo_epi8(d0, _mm256_setzero_si256()); - __m256i d0_hi = _mm256_unpackhi_epi8(d0, _mm256_setzero_si256()); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j + 32)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d0_32_lo = _mm256_unpacklo_epi8(d0_32, zero_256); + __m256i d0_32_hi = _mm256_unpackhi_epi8(d0_32, zero_256); - mul0_lo = _mm256_mullo_epi16(d0_lo, coef); - mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); - mul1_lo = _mm256_mullo_epi16(d0_hi, coef); - mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32_lo, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32_lo, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d0_32_hi, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d0_32_hi, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = 
_mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); - - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); } //Here the normal loop is executed where ii = i - fwidth / 2 + fi for ( ; fi < fwidth; fi++) { ii = diff_i_halffw + fi; - __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); - __m256i d0_lo = _mm256_unpacklo_epi8(d0, _mm256_setzero_si256()); - __m256i d0_hi = _mm256_unpackhi_epi8(d0, _mm256_setzero_si256()); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j + 32)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d0_32_lo = _mm256_unpacklo_epi8(d0_32, zero_256); + __m256i d0_32_hi = _mm256_unpackhi_epi8(d0_32, zero_256); - mul0_lo = _mm256_mullo_epi16(d0_lo, coef); - mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); - mul1_lo = _mm256_mullo_epi16(d0_hi, coef); - mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32_lo, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32_lo, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d0_32_hi, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d0_32_hi, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, 
mul1_32_hi_256); - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + } + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_32_256 = _mm256_srai_epi32(res0_32_256, interim_shift); + res4_32_256 = _mm256_srai_epi32(res4_32_256, interim_shift); + res8_32_256 = _mm256_srai_epi32(res8_32_256, interim_shift); + res12_32_256 = _mm256_srai_epi32(res12_32_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); + res0_32_256 = _mm256_packs_epi32(res0_32_256, res4_32_256); + res8_32_256 = _mm256_packs_epi32(res8_32_256, res12_32_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + __m256i r0_32 = _mm256_permute2x128_si256(res0_32_256, res8_32_256, 0x20); + __m256i r8_32 = _mm256_permute2x128_si256(res0_32_256, res8_32_256, 0x31); + + _mm256_store_si256((__m256i*)(tmp + j), r0); + _mm256_store_si256((__m256i*)(tmp + j + 16), r8); + _mm256_store_si256((__m256i*)(tmp + j + 32), r0_32); + _mm256_store_si256((__m256i*)(tmp + j + 48), r8_32); + } - //test = _mm256_add_epi32(tmp0_hi, res8); + for (; j < width_rem_size32; j+=32){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); } - - res0 = _mm256_srai_epi32(res0, interim_shift); - res4 = _mm256_srai_epi32(res4, interim_shift); - res8 = _mm256_srai_epi32(res8, interim_shift); - res12 = 
_mm256_srai_epi32(res12, interim_shift); + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + } + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); + res8_256 = _mm256_srai_epi32(res8_256,interim_shift); + res12_256 = _mm256_srai_epi32(res12_256,interim_shift); - res0 = _mm256_packs_epi32(res0, res4); - res8 = _mm256_packs_epi32(res8, res12); - shuffle_and_store(tmp + j, res0, res8); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + _mm256_store_si256((__m256i*)(tmp + j), r0); + _mm256_store_si256((__m256i*)(tmp + j + 16), r8); } for (; j < width_rem_size16; j+=16){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); /** * The full loop is from fi = 0 to fwidth * During the loop when the centre pixel is at i, @@ -908,24 +1142,25 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = pro_mir_end - fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d0_hi = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); - mul1_lo_sse = _mm_mullo_epi16(d0_hi, coef); - mul1_hi_sse = _mm_mulhi_epi16(d0_hi, coef); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = 
_mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); } //Here the normal loop is executed where ii = i - fwidth / 2 + fi @@ -934,40 +1169,41 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = diff_i_halffw + fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d0_hi = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); - mul1_lo_sse = _mm_mullo_epi16(d0_hi, coef); - mul1_hi_sse = _mm_mulhi_epi16(d0_hi, coef); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); - - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res8_sse = _mm_srai_epi32(res8_sse, interim_shift); - res12_sse = _mm_srai_epi32(res12_sse, interim_shift); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - res8_sse = _mm_packs_epi32(res8_sse, res12_sse); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); - __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_sse), res8_sse, 1); + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); _mm256_store_si256((__m256i*)(tmp + j), res); } for (; j < width_rem_size8; j+=8){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); /** * The full loop is from fi = 0 to fwidth * During the loop when the centre pixel is at i, @@ -976,21 
+1212,20 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt */ //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) for (fi = 0; fi <= pro_mir_end; fi++){ - ii = pro_mir_end - fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } //Here the normal loop is executed where ii = i - fwidth / 2 + fi for ( ; fi < fwidth; fi++) @@ -998,23 +1233,23 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = diff_i_halffw + fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - _mm_store_si128((__m128i*)(tmp + j), res0_sse); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); } for (; j < width; j++){ @@ -1043,9 +1278,124 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt } else { - for (j = 0; j < width_rem_size; j+=32) + for (; j < width_rem_size64; j+=64) + { + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + res0_32_256 = res4_32_256 = res8_32_256 = res12_32_256 = _mm256_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m256i d0 = 
_mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 32)); + __m256i d1_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 48)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d1_32, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d1_32, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 32)); + __m256i d1_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 48)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d1_32, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d1_32, coef); + + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, 
res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + } + + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_32_256 = _mm256_srai_epi32(res0_32_256, interim_shift); + res4_32_256 = _mm256_srai_epi32(res4_32_256, interim_shift); + res8_32_256 = _mm256_srai_epi32(res8_32_256, interim_shift); + res12_32_256 = _mm256_srai_epi32(res12_32_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); + + res0_32_256 = _mm256_packs_epi32(res0_32_256, res4_32_256); + res8_32_256 = _mm256_packs_epi32(res8_32_256, res12_32_256); + + _mm256_store_si256((__m256i*)(tmp + j), res0_256); + _mm256_store_si256((__m256i*)(tmp + j + 16), res8_256); + _mm256_store_si256((__m256i*)(tmp + j + 32), res0_32_256); + _mm256_store_si256((__m256i*)(tmp + j + 48), res8_32_256); + } + + for (; j < width_rem_size32; j+=32) { - res0 = res4 = res8 = res12 = _mm256_set1_epi32(interim_rnd); + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); /** * The full loop is from fi = 0 to fwidth @@ -1060,21 +1410,21 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo = _mm256_mullo_epi16(d0, coef); - mul0_hi = _mm256_mulhi_epi16(d0, coef); - mul1_lo = _mm256_mullo_epi16(d1, coef); - mul1_hi = _mm256_mulhi_epi16(d1, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); - - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); } //Here the normal loop is executed where ii = i - fwidth / 2 + fi @@ -1085,37 +1435,37 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo = _mm256_mullo_epi16(d0, coef); - mul0_hi = _mm256_mulhi_epi16(d0, coef); - mul1_lo = 
_mm256_mullo_epi16(d1, coef); - mul1_hi = _mm256_mulhi_epi16(d1, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); } - res0 = _mm256_srai_epi32(res0, interim_shift); - res4 = _mm256_srai_epi32(res4, interim_shift); - res8 = _mm256_srai_epi32(res8, interim_shift); - res12 = _mm256_srai_epi32(res12, interim_shift); + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); - res0 = _mm256_packs_epi32(res0, res4); - res8 = _mm256_packs_epi32(res8, res12); + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); - _mm256_store_si256((__m256i*)(tmp + j), res0); - _mm256_store_si256((__m256i*)(tmp + j + 16), res8); + _mm256_store_si256((__m256i*)(tmp + j), res0_256); + _mm256_store_si256((__m256i*)(tmp + j + 16), res8_256); } for (; j < width_rem_size16; j+=16) { - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); /** * The full loop is from fi = 0 to fwidth @@ -1125,66 +1475,49 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt */ //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) for (fi = 0; fi <= pro_mir_end; fi++){ - ii = pro_mir_end - fi; - __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); - __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); - mul1_lo_sse = _mm_mullo_epi16(d1, coef); - mul1_hi_sse = _mm_mulhi_epi16(d1, coef); + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); - - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = 
_mm_add_epi32(tmp1_hi_sse, res12_sse); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); } //Here the normal loop is executed where ii = i - fwidth / 2 + fi for ( ; fi < fwidth; fi++) { ii = diff_i_halffw + fi; - __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); - __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); - mul1_lo_sse = _mm_mullo_epi16(d1, coef); - mul1_hi_sse = _mm_mulhi_epi16(d1, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res8_sse = _mm_srai_epi32(res8_sse, interim_shift); - res12_sse = _mm_srai_epi32(res12_sse, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - res8_sse = _mm_packs_epi32(res8_sse, res12_sse); + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); - __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_sse), res8_sse, 1); - _mm256_store_si256((__m256i*)(tmp + j), res); + _mm256_store_si256((__m256i*)(tmp + j), res0_256); } for (; j < width_rem_size8; j+=8) { - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); /** * The full loop is from fi = 0 to fwidth @@ -1197,15 +1530,16 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = pro_mir_end - fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = 
_mm_add_epi32(tmp0_hi_128, res4_128); } //Here the normal loop is executed where ii = i - fwidth / 2 + fi @@ -1215,21 +1549,21 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - _mm_store_si128((__m128i*)(tmp + j), res0_sse); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); } for (; j < width; j++) @@ -1270,11 +1604,149 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt int f_l_i = i - half_fw; int f_r_i = i + half_fw; /* Vertical pass. */ - + j = 0; if(8 == bitdepth) { - for (j = 0; j < width_rem_size32; j+=32){ - res0 = res4 = res8 = res12 = _mm256_set1_epi32(interim_rnd); + for (; j < width_rem_size64; j+=64){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + res0_32_256 = res4_32_256 = res8_32_256 = res12_32_256 = _mm256_set1_epi32(interim_rnd); + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2) + { + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii1 * src_px_stride + j)); + __m256i d20 = _mm256_loadu_si256((__m256i*)(src_8b + ii2 * src_px_stride + j)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_8b + (ii1 + 1) * src_px_stride + j)); + __m256i d19 = _mm256_loadu_si256((__m256i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_8b + ii1 * src_px_stride + j + 32)); + __m256i d20_32 = _mm256_loadu_si256((__m256i*)(src_8b + ii2 * src_px_stride + j + 32)); + __m256i d1_32 = _mm256_loadu_si256((__m256i*)(src_8b + (ii1 + 1) * src_px_stride + j + 32)); + __m256i d19_32 = _mm256_loadu_si256((__m256i*)(src_8b + (ii2 - 1) * src_px_stride + j + 32)); + + __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d20_lo = _mm256_unpacklo_epi8(d20, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d20_hi = _mm256_unpackhi_epi8(d20, zero_256); + __m256i d1_lo = _mm256_unpacklo_epi8(d1, zero_256); + __m256i d19_lo = _mm256_unpacklo_epi8(d19, zero_256); + __m256i d1_hi = _mm256_unpackhi_epi8(d1, zero_256); + __m256i d19_hi = _mm256_unpackhi_epi8(d19, zero_256); + + __m256i d0_32_lo = _mm256_unpacklo_epi8(d0_32, 
zero_256); + __m256i d20_32_lo = _mm256_unpacklo_epi8(d20_32, zero_256); + __m256i d0_32_hi = _mm256_unpackhi_epi8(d0_32, zero_256); + __m256i d20_32_hi = _mm256_unpackhi_epi8(d20_32, zero_256); + __m256i d1_32_lo = _mm256_unpacklo_epi8(d1_32, zero_256); + __m256i d19_32_lo = _mm256_unpacklo_epi8(d19_32, zero_256); + __m256i d1_32_hi = _mm256_unpackhi_epi8(d1_32, zero_256); + __m256i d19_32_hi = _mm256_unpackhi_epi8(d19_32, zero_256); + + d0_lo = _mm256_add_epi16(d0_lo, d20_lo); + d1_lo = _mm256_add_epi16(d1_lo, d19_lo); + d0_hi = _mm256_add_epi16(d0_hi, d20_hi); + d1_hi = _mm256_add_epi16(d1_hi, d19_hi); + + d0_32_lo = _mm256_add_epi16(d0_32_lo, d20_32_lo); + d1_32_lo = _mm256_add_epi16(d1_32_lo, d19_32_lo); + d0_32_hi = _mm256_add_epi16(d0_32_hi, d20_32_hi); + d1_32_hi = _mm256_add_epi16(d1_32_hi, d19_32_hi); + + __m256i l0_20_1_19_0 = _mm256_unpacklo_epi16(d0_lo, d1_lo); + __m256i l0_20_1_19_4 = _mm256_unpackhi_epi16(d0_lo, d1_lo); + __m256i l0_20_1_19_8 = _mm256_unpacklo_epi16(d0_hi, d1_hi); + __m256i l0_20_1_19_12 = _mm256_unpackhi_epi16(d0_hi, d1_hi); + + __m256i l0_20_1_19_0_32 = _mm256_unpacklo_epi16(d0_32_lo, d1_32_lo); + __m256i l0_20_1_19_4_32 = _mm256_unpackhi_epi16(d0_32_lo, d1_32_lo); + __m256i l0_20_1_19_8_32 = _mm256_unpacklo_epi16(d0_32_hi, d1_32_hi); + __m256i l0_20_1_19_12_32 = _mm256_unpackhi_epi16(d0_32_hi, d1_32_hi); + + res0_256 = _mm256_add_epi32(res0_256,_mm256_madd_epi16(l0_20_1_19_0, f0_1)); + res4_256 = _mm256_add_epi32(res4_256,_mm256_madd_epi16(l0_20_1_19_4, f0_1)); + res8_256 = _mm256_add_epi32(res8_256,_mm256_madd_epi16(l0_20_1_19_8, f0_1)); + res12_256 = _mm256_add_epi32(res12_256,_mm256_madd_epi16(l0_20_1_19_12, f0_1)); + + res0_32_256 = _mm256_add_epi32(res0_32_256,_mm256_madd_epi16(l0_20_1_19_0_32, f0_1)); + res4_32_256 = _mm256_add_epi32(res4_32_256,_mm256_madd_epi16(l0_20_1_19_4_32, f0_1)); + res8_32_256 = _mm256_add_epi32(res8_32_256,_mm256_madd_epi16(l0_20_1_19_8_32, f0_1)); + res12_32_256 = _mm256_add_epi32(res12_32_256,_mm256_madd_epi16(l0_20_1_19_12_32, f0_1)); + } + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + i * src_px_stride + j)); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_8b + i * src_px_stride + j + 32)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d0_32_lo = _mm256_unpacklo_epi8(d0_32, zero_256); + __m256i d0_32_hi = _mm256_unpackhi_epi8(d0_32, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32_lo, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32_lo, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d0_32_hi, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d0_32_hi, coef); + + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, 
mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_32_256 = _mm256_srai_epi32(res0_32_256, interim_shift); + res4_32_256 = _mm256_srai_epi32(res4_32_256, interim_shift); + res8_32_256 = _mm256_srai_epi32(res8_32_256, interim_shift); + res12_32_256 = _mm256_srai_epi32(res12_32_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); + + res0_32_256 = _mm256_packs_epi32(res0_32_256, res4_32_256); + res8_32_256 = _mm256_packs_epi32(res8_32_256, res12_32_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + __m256i r0_32 = _mm256_permute2x128_si256(res0_32_256, res8_32_256, 0x20); + __m256i r8_32 = _mm256_permute2x128_si256(res0_32_256, res8_32_256, 0x31); + + _mm256_storeu_si256((__m256i*)(tmp + j), r0); + _mm256_storeu_si256((__m256i*)(tmp + j + 16), r8); + _mm256_storeu_si256((__m256i*)(tmp + j + 32), r0_32); + _mm256_storeu_si256((__m256i*)(tmp + j + 48), r8_32); + } + + for (; j < width_rem_size32; j+=32){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); /** * The filter coefficients are symmetric, * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff @@ -1284,22 +1756,22 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii1 = f_l_i + fi; ii2 = f_r_i - fi; - __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs2[fi / 2]); __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii1 * src_px_stride + j)); __m256i d20 = _mm256_loadu_si256((__m256i*)(src_8b + ii2 * src_px_stride + j)); __m256i d1 = _mm256_loadu_si256((__m256i*)(src_8b + (ii1 + 1) * src_px_stride + j)); __m256i d19 = _mm256_loadu_si256((__m256i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs[fi / 2]); - __m256i d0_lo = _mm256_unpacklo_epi8(d0, _mm256_setzero_si256()); - __m256i d20_lo = _mm256_unpacklo_epi8(d20, _mm256_setzero_si256()); - __m256i d0_hi = _mm256_unpackhi_epi8(d0, _mm256_setzero_si256()); - __m256i d20_hi = _mm256_unpackhi_epi8(d20, _mm256_setzero_si256()); + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d20_lo = _mm256_unpacklo_epi8(d20, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d20_hi = _mm256_unpackhi_epi8(d20, zero_256); - __m256i d1_lo = _mm256_unpacklo_epi8(d1, _mm256_setzero_si256()); - __m256i d19_lo = _mm256_unpacklo_epi8(d19, _mm256_setzero_si256()); - __m256i d1_hi = _mm256_unpackhi_epi8(d1, _mm256_setzero_si256()); - __m256i d19_hi = _mm256_unpackhi_epi8(d19, _mm256_setzero_si256()); + __m256i d1_lo = _mm256_unpacklo_epi8(d1, zero_256); + __m256i d19_lo = _mm256_unpacklo_epi8(d19, zero_256); + __m256i d1_hi = _mm256_unpackhi_epi8(d1, zero_256); + __m256i d19_hi = 
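As the comment above says, the filter coefficients are symmetric, so the two rows equidistant from the centre are added first and multiplied once, with the centre row accumulated outside the loop; the AVX2 code additionally handles two taps per iteration via _mm256_madd_epi16. A scalar model of one output sample, using placeholder names rather than the patch's variables:

#include <stdint.h>

/* Scalar equivalent of the folded vertical tap loop above (taps folded
 * one at a time here; the vector code folds two per madd). */
static int32_t vertical_tap_symmetric(const int16_t *col, int stride, int i,
                                      const int16_t *coeff, int fwidth)
{
    const int half = fwidth / 2;
    int32_t accum = 0;
    for (int fi = 0; fi < half; fi++) {
        /* rows i - half + fi and i + half - fi share coeff[fi] */
        int32_t folded = (int32_t)col[(i - half + fi) * stride] +
                         (int32_t)col[(i + half - fi) * stride];
        accum += (int32_t)coeff[fi] * folded;
    }
    accum += (int32_t)coeff[half] * col[i * stride];  /* centre pixel */
    return accum;
}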
_mm256_unpackhi_epi8(d19, zero_256); d0_lo = _mm256_add_epi16(d0_lo, d20_lo); d1_lo = _mm256_add_epi16(d1_lo, d19_lo); @@ -1311,44 +1783,48 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i l0_20_1_19_8 = _mm256_unpacklo_epi16(d0_hi, d1_hi); __m256i l0_20_1_19_12 = _mm256_unpackhi_epi16(d0_hi, d1_hi); - res0 = _mm256_add_epi32(res0, _mm256_madd_epi16(l0_20_1_19_0, f0_1)); - res4 = _mm256_add_epi32(res4, _mm256_madd_epi16(l0_20_1_19_4, f0_1)); - res8 = _mm256_add_epi32(res8, _mm256_madd_epi16(l0_20_1_19_8, f0_1)); - res12 = _mm256_add_epi32(res12, _mm256_madd_epi16(l0_20_1_19_12, f0_1)); + res0_256 = _mm256_add_epi32(res0_256,_mm256_madd_epi16(l0_20_1_19_0, f0_1)); + res4_256 = _mm256_add_epi32(res4_256,_mm256_madd_epi16(l0_20_1_19_4, f0_1)); + res8_256 = _mm256_add_epi32(res8_256,_mm256_madd_epi16(l0_20_1_19_8, f0_1)); + res12_256 = _mm256_add_epi32(res12_256,_mm256_madd_epi16(l0_20_1_19_12, f0_1)); } __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + i * src_px_stride + j)); __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - __m256i d0_lo = _mm256_unpacklo_epi8(d0, _mm256_setzero_si256()); - __m256i d0_hi = _mm256_unpackhi_epi8(d0, _mm256_setzero_si256()); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); - mul0_lo = _mm256_mullo_epi16(d0_lo, coef); - mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); - mul1_lo = _mm256_mullo_epi16(d0_hi, coef); - mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); - - res0 = _mm256_srai_epi32(res0, interim_shift); - res4 = _mm256_srai_epi32(res4, interim_shift); - res8 = _mm256_srai_epi32(res8, interim_shift); - res12 = _mm256_srai_epi32(res12, interim_shift); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); + res8_256 = _mm256_srai_epi32(res8_256,interim_shift); + res12_256 = _mm256_srai_epi32(res12_256,interim_shift); - res0 = _mm256_packs_epi32(res0, res4); - res8 = _mm256_packs_epi32(res8, res12); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); - shuffle_and_store(tmp + j, res0, res8); + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + _mm256_storeu_si256((__m256i*)(tmp + j), r0); + _mm256_storeu_si256((__m256i*)(tmp + j + 16), r8); } for (; j < width_rem_size16; j+=16){ - res0_sse = 
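Because _mm256_unpacklo/unpackhi_epi8 split the 32 input bytes across 128-bit lanes and _mm256_packs_epi32 also packs per lane, the packed 16-bit results come out lane-interleaved; the permute2x128 pair with selectors 0x20 and 0x31 before the stores puts them back in linear order, apparently the same reordering the removed shuffle_and_store helper performed. A sketch of that fix-up, with an illustrative helper name:

#include <immintrin.h>
#include <stdint.h>

/* pack_a = {y0..y7, y16..y23}, pack_b = {y8..y15, y24..y31} after the
 * per-lane epi8 unpack and epi32 pack; the cross-lane permutes below
 * re-interleave the two halves so the 32 outputs land contiguously.  */
static inline void store_packed_32(int16_t *dst, __m256i pack_a, __m256i pack_b)
{
    __m256i r0 = _mm256_permute2x128_si256(pack_a, pack_b, 0x20);  /* y0..y15  */
    __m256i r1 = _mm256_permute2x128_si256(pack_a, pack_b, 0x31);  /* y16..y31 */
    _mm256_storeu_si256((__m256i *)dst, r0);
    _mm256_storeu_si256((__m256i *)(dst + 16), r1);
}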
res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); /** * The filter coefficients are symmetric, @@ -1362,16 +1838,17 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i d20 = _mm_loadu_si128((__m128i*)(src_8b + ii2 * src_px_stride + j)); __m128i d1 = _mm_loadu_si128((__m128i*)(src_8b + (ii1 + 1) * src_px_stride + j)); __m128i d19 = _mm_loadu_si128((__m128i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + __m128i d20_lo = _mm_unpacklo_epi8(d20, zero_128); + __m128i d20_hi = _mm_unpackhi_epi8(d20, zero_128); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d0_hi = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); - __m128i d20_lo = _mm_unpacklo_epi8(d20, _mm_setzero_si128()); - __m128i d20_hi = _mm_unpackhi_epi8(d20, _mm_setzero_si128()); - - __m128i d1_lo = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); - __m128i d1_hi = _mm_unpackhi_epi8(d1, _mm_setzero_si128()); - __m128i d19_lo = _mm_unpacklo_epi8(d19, _mm_setzero_si128()); - __m128i d19_hi = _mm_unpackhi_epi8(d19, _mm_setzero_si128()); + __m128i d1_lo = _mm_unpacklo_epi8(d1, zero_128); + __m128i d1_hi = _mm_unpackhi_epi8(d1, zero_128); + __m128i d19_lo = _mm_unpacklo_epi8(d19, zero_128); + __m128i d19_hi = _mm_unpackhi_epi8(d19, zero_128); d0_lo = _mm_add_epi16(d0_lo, d20_lo); d0_hi = _mm_add_epi16(d0_hi, d20_hi); @@ -1383,47 +1860,45 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i l0_20_1_19_8 = _mm_unpacklo_epi16(d0_hi, d1_hi); __m128i l0_20_1_19_12 = _mm_unpackhi_epi16(d0_hi, d1_hi); - __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs2[fi / 2]); - - res0_sse = _mm_add_epi32(res0_sse, _mm_madd_epi16(l0_20_1_19_0, f0_1)); - res4_sse = _mm_add_epi32(res4_sse, _mm_madd_epi16(l0_20_1_19_4, f0_1)); - res8_sse = _mm_add_epi32(res8_sse, _mm_madd_epi16(l0_20_1_19_8, f0_1)); - res12_sse = _mm_add_epi32(res12_sse, _mm_madd_epi16(l0_20_1_19_12, f0_1)); + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + res8_128 = _mm_add_epi32(res8_128, _mm_madd_epi16(l0_20_1_19_8, f0_1)); + res12_128 = _mm_add_epi32(res12_128, _mm_madd_epi16(l0_20_1_19_12, f0_1)); } __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + i * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d0_hi = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); - mul1_lo_sse = _mm_mullo_epi16(d0_hi, coef); - mul1_hi_sse = _mm_mulhi_epi16(d0_hi, coef); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i 
tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); - - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res8_sse = _mm_srai_epi32(res8_sse, interim_shift); - res12_sse = _mm_srai_epi32(res12_sse, interim_shift); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - res8_sse = _mm_packs_epi32(res8_sse, res12_sse); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); - __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_sse), res8_sse, 1); + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); _mm256_store_si256((__m256i*)(tmp + j), res); } for (; j < width_rem_size8; j+=8){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); /** * The filter coefficients are symmetric, @@ -1437,11 +1912,12 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i d20 = _mm_loadu_si128((__m128i*)(src_8b + ii2 * src_px_stride + j)); __m128i d1 = _mm_loadu_si128((__m128i*)(src_8b + (ii1 + 1) * src_px_stride + j)); __m128i d19 = _mm_loadu_si128((__m128i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d20_lo = _mm_unpacklo_epi8(d20, _mm_setzero_si128()); - __m128i d1_lo = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); - __m128i d19_lo = _mm_unpacklo_epi8(d19, _mm_setzero_si128()); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d20_lo = _mm_unpacklo_epi8(d20, zero_128); + __m128i d1_lo = _mm_unpacklo_epi8(d1, zero_128); + __m128i d19_lo = _mm_unpacklo_epi8(d19, zero_128); d0_lo = _mm_add_epi16(d0_lo, d20_lo); d1_lo = _mm_add_epi16(d1_lo, d19_lo); @@ -1449,29 +1925,27 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0_lo, d1_lo); __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0_lo, d1_lo); - __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs2[fi / 2]); - - res0_sse = _mm_add_epi32(res0_sse, _mm_madd_epi16(l0_20_1_19_0, f0_1)); - res4_sse = _mm_add_epi32(res4_sse, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); } __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + i * src_px_stride + j)); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); + __m128i d0_lo = _mm_unpacklo_epi8(d0, 
zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res0_128 = _mm_packs_epi32(res0_128, res4_128); - _mm_store_si128((__m128i*)(tmp + j), res0_sse); + _mm_store_si128((__m128i*)(tmp + j), res0_128); } for (; j < width; j++){ @@ -1484,12 +1958,11 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_8b[i * src_px_stride + j]; tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); } - } else { - for (j = 0; j < width_rem_size; j+=32){ - res0 = res4 = res8 = res12 = _mm256_set1_epi32(interim_rnd); + for (; j < width_rem_size32; j+=32){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); /** * The filter coefficients are symmetric, @@ -1504,11 +1977,12 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i d0_16 = _mm256_loadu_si256((__m256i*)(src_hbd + ii1 * src_px_stride + j + 16)); __m256i d20_16 = _mm256_loadu_si256((__m256i*)(src_hbd + ii2 * src_px_stride + j + 16)); - __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii1 * src_px_stride + j)); - __m256i d19 = _mm256_loadu_si256((__m256i*)(src_hbd + ii2 * src_px_stride + j)); - __m256i d1_16 = _mm256_loadu_si256((__m256i*)(src_hbd + ii1 * src_px_stride + j + 16)); - __m256i d19_16 = _mm256_loadu_si256((__m256i*)(src_hbd + ii2 * src_px_stride + j + 16)); - __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs2[fi / 2]); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); + __m256i d19 = _mm256_loadu_si256((__m256i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); + __m256i d1_16 = _mm256_loadu_si256((__m256i*)(src_hbd + (ii1 + 1) * src_px_stride + j + 16)); + __m256i d19_16 = _mm256_loadu_si256((__m256i*)(src_hbd + (ii2 - 1) * src_px_stride + j + 16)); + __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs[fi / 2]); + d0 = _mm256_add_epi16(d0, d20); d0_16 = _mm256_add_epi16(d0_16, d20_16); d1 = _mm256_add_epi16(d1, d19); @@ -1519,43 +1993,44 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i l0_20_1_19_16 = _mm256_unpacklo_epi16(d0_16, d1_16); __m256i l0_20_1_19_20 = _mm256_unpackhi_epi16(d0_16, d1_16); - res0 = _mm256_add_epi32(res0, _mm256_madd_epi16(l0_20_1_19_0, f0_1)); - res4 = _mm256_add_epi32(res4, _mm256_madd_epi16(l0_20_1_19_4, f0_1)); - res8 = _mm256_add_epi32(res8, _mm256_madd_epi16(l0_20_1_19_16, f0_1)); - res12 = _mm256_add_epi32(res12, _mm256_madd_epi16(l0_20_1_19_20, f0_1)); + res0_256 = _mm256_add_epi32(res0_256,_mm256_madd_epi16(l0_20_1_19_0, f0_1)); + res4_256 = _mm256_add_epi32(res4_256,_mm256_madd_epi16(l0_20_1_19_4, f0_1)); + res8_256 = 
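The scalar tails and the vector paths apply the same fixed-point rounding: each vector accumulator is seeded with interim_rnd and arithmetically shifted by interim_shift at the end, before the saturating pack to 16 bits. A scalar model of that step (helper name illustrative):

#include <stdint.h>

/* Matches (accum + interim_rnd) >> interim_shift in the scalar tails;
 * the clamp models the saturation that _mm256_packs_epi32 adds in the
 * vector paths. */
static inline int16_t round_shift_sat(int32_t accum, int32_t interim_rnd,
                                      int interim_shift)
{
    int32_t v = (accum + interim_rnd) >> interim_shift;
    if (v > INT16_MAX) v = INT16_MAX;
    if (v < INT16_MIN) v = INT16_MIN;
    return (int16_t)v;
}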
_mm256_add_epi32(res8_256,_mm256_madd_epi16(l0_20_1_19_16, f0_1)); + res12_256 = _mm256_add_epi32(res12_256,_mm256_madd_epi16(l0_20_1_19_20, f0_1)); } __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + i * src_px_stride + j)); __m256i d0_16 = _mm256_loadu_si256((__m256i*)(src_hbd + i * src_px_stride + j + 16)); __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo = _mm256_mullo_epi16(d0, coef); - mul0_hi = _mm256_mulhi_epi16(d0, coef); - mul1_lo = _mm256_mullo_epi16(d0_16, coef); - mul1_hi = _mm256_mulhi_epi16(d0_16, coef); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_16, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_16, coef); - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); - - res0 = _mm256_srai_epi32(res0, interim_shift); - res4 = _mm256_srai_epi32(res4, interim_shift); - res8 = _mm256_srai_epi32(res8, interim_shift); - res12 = _mm256_srai_epi32(res12, interim_shift); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); + res8_256 = _mm256_srai_epi32(res8_256,interim_shift); + res12_256 = _mm256_srai_epi32(res12_256,interim_shift); - res0 = _mm256_packs_epi32(res0, res4); - res8 = _mm256_packs_epi32(res8, res12); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); - _mm256_store_si256((__m256i*)(tmp + j), res0); - _mm256_store_si256((__m256i*)(tmp + j + 16), res8); + _mm256_store_si256((__m256i*)(tmp + j), res0_256); + _mm256_store_si256((__m256i*)(tmp + j + 16), res8_256); } - for (j = 0; j < width_rem_size16; j+=16){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + for (; j < width_rem_size16; j+=16){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); /** * The filter coefficients are symmetric, @@ -1565,63 +2040,44 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt for (fi = 0; fi < (half_fw); fi+=2){ ii1 = f_l_i + fi; ii2 = f_r_i - fi; - __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii1 * src_px_stride + j)); - __m128i d20 = _mm_loadu_si128((__m128i*)(src_hbd + ii2 * src_px_stride + j)); - __m128i d0_8 = _mm_loadu_si128((__m128i*)(src_hbd + ii1 * src_px_stride + j + 8)); - __m128i d20_8 = _mm_loadu_si128((__m128i*)(src_hbd + ii2 * src_px_stride + j + 8)); - __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); - __m128i d19 = _mm_loadu_si128((__m128i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); - __m128i d1_8 = _mm_loadu_si128((__m128i*)(src_hbd + (ii1 + 1) * src_px_stride + j + 8)); - __m128i d19_8 = 
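In the high-bit-depth branch the folded rows feed _mm256_madd_epi16 directly: each entry of i32_filter_coeffs is assumed here to pack the 16-bit coefficients for taps fi and fi+1 into one 32-bit value, so interleaving the two folded row sums lines each pixel pair up with its coefficient pair. A sketch under that assumption (helper name illustrative):

#include <immintrin.h>
#include <stdint.h>

/* sum_fi / sum_fi1: folded 16-bit row sums for taps fi and fi+1.
 * packed_pair is assumed to hold coeff[fi] in its low 16 bits and
 * coeff[fi+1] in its high 16 bits, as i32_filter_coeffs appears to. */
static inline void fold_two_taps(__m256i sum_fi, __m256i sum_fi1,
                                 int32_t packed_pair,
                                 __m256i *acc_lo, __m256i *acc_hi)
{
    __m256i pair = _mm256_set1_epi32(packed_pair);
    __m256i lo   = _mm256_unpacklo_epi16(sum_fi, sum_fi1);  /* a0,b0,a1,b1,... */
    __m256i hi   = _mm256_unpackhi_epi16(sum_fi, sum_fi1);
    /* madd: a*coeff[fi] + b*coeff[fi+1] per 32-bit element */
    *acc_lo = _mm256_add_epi32(*acc_lo, _mm256_madd_epi16(lo, pair));
    *acc_hi = _mm256_add_epi32(*acc_hi, _mm256_madd_epi16(hi, pair));
}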
_mm_loadu_si128((__m128i*)(src_hbd + (ii2 - 1) * src_px_stride + j + 8)); - __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs2[fi / 2]); + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii1 * src_px_stride + j)); + __m256i d20 = _mm256_loadu_si256((__m256i*)(src_hbd + ii2 * src_px_stride + j)); - d0 = _mm_add_epi16(d0, d20); - d0_8 = _mm_add_epi16(d0_8, d20_8); - d1 = _mm_add_epi16(d1, d19); - d1_8 = _mm_add_epi16(d1_8, d19_8); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); + __m256i d19 = _mm256_loadu_si256((__m256i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); + __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs[fi / 2]); - __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0, d1); - __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0, d1); - __m128i l0_20_1_19_8 = _mm_unpacklo_epi16(d0_8, d1_8); - __m128i l0_20_1_19_16 = _mm_unpackhi_epi16(d0_8, d1_8); + d0 = _mm256_add_epi16(d0, d20); + d1 = _mm256_add_epi16(d1, d19); - res0_sse = _mm_add_epi32(res0_sse, _mm_madd_epi16(l0_20_1_19_0, f0_1)); - res4_sse = _mm_add_epi32(res4_sse, _mm_madd_epi16(l0_20_1_19_4, f0_1)); - res8_sse = _mm_add_epi32(res8_sse, _mm_madd_epi16(l0_20_1_19_8, f0_1)); - res12_sse = _mm_add_epi32(res12_sse, _mm_madd_epi16(l0_20_1_19_16, f0_1)); + __m256i l0_20_1_19_0 = _mm256_unpacklo_epi16(d0, d1); + __m256i l0_20_1_19_4 = _mm256_unpackhi_epi16(d0, d1); + + res0_256 = _mm256_add_epi32(res0_256,_mm256_madd_epi16(l0_20_1_19_0, f0_1)); + res4_256 = _mm256_add_epi32(res4_256,_mm256_madd_epi16(l0_20_1_19_4, f0_1)); } - __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + i * src_px_stride + j)); - __m128i d0_16 = _mm_loadu_si128((__m128i*)(src_hbd + i * src_px_stride + j + 8)); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); - mul1_lo_sse = _mm_mullo_epi16(d0_16, coef); - mul1_hi_sse = _mm_mulhi_epi16(d0_16, coef); + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + i * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); - - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res8_sse = _mm_srai_epi32(res8_sse, interim_shift); - res12_sse = _mm_srai_epi32(res12_sse, interim_shift); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - res8_sse = _mm_packs_epi32(res8_sse, res12_sse); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); - __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_sse), res8_sse, 1); - _mm256_store_si256((__m256i*)(tmp + j), res); + _mm256_store_si256((__m256i*)(tmp + j), res0_256); } for (; 
j < width_rem_size8; j+=8){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); /** * The filter coefficients are symmetric, @@ -1635,32 +2091,34 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i d20 = _mm_loadu_si128((__m128i*)(src_hbd + ii2 * src_px_stride + j)); __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); __m128i d19 = _mm_loadu_si128((__m128i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); - __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs2[fi / 2]); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); + d0 = _mm_add_epi16(d0, d20); d1 = _mm_add_epi16(d1, d19); __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0, d1); __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0, d1); - res0_sse = _mm_add_epi32(res0_sse, _mm_madd_epi16(l0_20_1_19_0, f0_1)); - res4_sse = _mm_add_epi32(res4_sse, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); } __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + i * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res0_128 = _mm_packs_epi32(res0_128, res4_128); - _mm_store_si128((__m128i*)(tmp + j), res0_sse); + _mm_store_si128((__m128i*)(tmp + j), res0_128); } for (; j < width; j++){ @@ -1696,97 +2154,220 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt int epi_last_i = height - diff_i_halffw; /* Vertical pass. 
*/ - + j = 0; if(8 == bitdepth) { - for (j = 0; j < width_rem_size32; j+=32){ - res0 = res4 = res8 = res12 = _mm256_set1_epi32(interim_rnd); + for (; j < width_rem_size64; j+=64){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + res0_32_256 = res4_32_256 = res8_32_256 = res12_32_256 = _mm256_set1_epi32(interim_rnd); //Here the normal loop is executed where ii = i - fwidth/2 + fi for (fi = 0; fi < epi_last_i; fi++){ ii = diff_i_halffw + fi; - __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); - __m256i d0_lo = _mm256_unpacklo_epi8(d0, _mm256_setzero_si256()); - __m256i d0_hi = _mm256_unpackhi_epi8(d0, _mm256_setzero_si256()); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j + 32)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d0_32_lo = _mm256_unpacklo_epi8(d0_32, zero_256); + __m256i d0_32_hi = _mm256_unpackhi_epi8(d0_32, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); - mul0_lo = _mm256_mullo_epi16(d0_lo, coef); - mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); - mul1_lo = _mm256_mullo_epi16(d0_hi, coef); - mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32_lo, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32_lo, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d0_32_hi, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d0_32_hi, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); - - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); } //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) for ( ; fi < fwidth; fi++) { ii = epi_mir_i - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j + 32)); __m256i coef = 
_mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d0_32_lo = _mm256_unpacklo_epi8(d0_32, zero_256); + __m256i d0_32_hi = _mm256_unpackhi_epi8(d0_32, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32_lo, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32_lo, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d0_32_hi, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d0_32_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + } + + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); + res8_256 = _mm256_srai_epi32(res8_256,interim_shift); + res12_256 = _mm256_srai_epi32(res12_256,interim_shift); + + res0_32_256 = _mm256_srai_epi32(res0_32_256, interim_shift); + res4_32_256 = _mm256_srai_epi32(res4_32_256, interim_shift); + res8_32_256 = _mm256_srai_epi32(res8_32_256, interim_shift); + res12_32_256 = _mm256_srai_epi32(res12_32_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); + res0_32_256 = _mm256_packs_epi32(res0_32_256, res4_32_256); + res8_32_256 = _mm256_packs_epi32(res8_32_256, res12_32_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + __m256i r0_32 = _mm256_permute2x128_si256(res0_32_256, res8_32_256, 0x20); + __m256i r8_32 = _mm256_permute2x128_si256(res0_32_256, res8_32_256, 0x31); + + _mm256_store_si256((__m256i*)(tmp + j), r0); + _mm256_store_si256((__m256i*)(tmp + j + 16), r8); + _mm256_store_si256((__m256i*)(tmp + j + 32), r0_32); + _mm256_store_si256((__m256i*)(tmp + j + 48), r8_32); + } + + for (; j < width_rem_size32; j+=32){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); - __m256i d0_lo = _mm256_unpacklo_epi8(d0, _mm256_setzero_si256()); - __m256i d0_hi = _mm256_unpackhi_epi8(d0, _mm256_setzero_si256()); + __m256i coef = 
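The prologue and epilogue loops reflect out-of-frame rows back into the image, per the comments: ii = -(i - fwidth/2 + fi + 1) at the top edge and ii = 2*height - (i - fwidth/2 + fi) - 1 at the bottom (pro_mir_end - fi and epi_mir_i - fi precompute the same indices inside the loops). A scalar sketch of the effective row index, with an illustrative helper name:

/* Reflect an out-of-range source row back into [0, height). */
static inline int mirror_row(int i, int fi, int fwidth, int height)
{
    int ii = i - fwidth / 2 + fi;      /* unclamped source row            */
    if (ii < 0)
        ii = -(ii + 1);                /* top:    -1 -> 0, -2 -> 1, ...   */
    else if (ii >= height)
        ii = 2 * height - ii - 1;      /* bottom: height -> height-1, ... */
    return ii;
}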
_mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); - mul0_lo = _mm256_mullo_epi16(d0_lo, coef); - mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); - mul1_lo = _mm256_mullo_epi16(d0_hi, coef); - mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); } - res0 = _mm256_srai_epi32(res0, interim_shift); - res4 = _mm256_srai_epi32(res4, interim_shift); - res8 = _mm256_srai_epi32(res8, interim_shift); - res12 = _mm256_srai_epi32(res12, interim_shift); + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); + res8_256 = _mm256_srai_epi32(res8_256,interim_shift); + res12_256 = _mm256_srai_epi32(res12_256,interim_shift); - res0 = _mm256_packs_epi32(res0, res4); - res8 = _mm256_packs_epi32(res8, res12); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); - shuffle_and_store(tmp + j, res0, res8); + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + _mm256_store_si256((__m256i*)(tmp + j), r0); + _mm256_store_si256((__m256i*)(tmp + j + 16), r8); } for (; j < width_rem_size16; j+=16){ - res0_sse = res4_sse = res8_sse 
= res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); //Here the normal loop is executed where ii = i - fwidth/2 + fi for (fi = 0; fi < epi_last_i; fi++){ ii = diff_i_halffw + fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d0_hi = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); - mul1_lo_sse = _mm_mullo_epi16(d0_hi, coef); - mul1_hi_sse = _mm_mulhi_epi16(d0_hi, coef); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); - - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); } //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) for ( ; fi < fwidth; fi++) @@ -1794,55 +2375,57 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = epi_mir_i - fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - __m128i d0_hi = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); - mul1_lo_sse = _mm_mullo_epi16(d0_hi, coef); - mul1_hi_sse = _mm_mulhi_epi16(d0_hi, coef); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); - res0_sse = 
_mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res8_sse = _mm_srai_epi32(res8_sse, interim_shift); - res12_sse = _mm_srai_epi32(res12_sse, interim_shift); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - res8_sse = _mm_packs_epi32(res8_sse, res12_sse); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); - __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_sse), res8_sse, 1); + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); _mm256_store_si256((__m256i*)(tmp + j), res); } for (; j < width_rem_size8; j+=8){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); //Here the normal loop is executed where ii = i - fwidth/2 + fi for (fi = 0; fi < epi_last_i; fi++){ ii = diff_i_halffw + fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) for ( ; fi < fwidth; fi++) @@ -1850,24 +2433,25 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = epi_mir_i - fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - __m128i d0_lo = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); - mul0_lo_sse = _mm_mullo_epi16(d0_lo, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0_lo, coef); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - 
res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - _mm_store_si128((__m128i*)(tmp + j), res0_sse); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); } for (; j < width; j++) @@ -1890,8 +2474,116 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt } else { - for (j = 0; j < width_rem_size; j+=32){ - res0 = res4 = res8 = res12 = _mm256_set1_epi32(interim_rnd); + for (; j < width_rem_size64; j+=64){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + res0_32_256 = res4_32_256 = res8_32_256 = res12_32_256 = _mm256_set1_epi32(interim_rnd); + + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 32)); + __m256i d1_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 48)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d1_32, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d1_32, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + } + + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); + __m256i d0_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 32)); + __m256i d1_32 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * 
src_px_stride + j + 48)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d1, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d1, coef); + + __m256i mul0_32_lo_256 = _mm256_mullo_epi16(d0_32, coef); + __m256i mul0_32_hi_256 = _mm256_mulhi_epi16(d0_32, coef); + __m256i mul1_32_lo_256 = _mm256_mullo_epi16(d1_32, coef); + __m256i mul1_32_hi_256 = _mm256_mulhi_epi16(d1_32, coef); + + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + __m256i tmp0_32_lo = _mm256_unpacklo_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp0_32_hi = _mm256_unpackhi_epi16(mul0_32_lo_256, mul0_32_hi_256); + __m256i tmp1_32_lo = _mm256_unpacklo_epi16(mul1_32_lo_256, mul1_32_hi_256); + __m256i tmp1_32_hi = _mm256_unpackhi_epi16(mul1_32_lo_256, mul1_32_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_32_256 = _mm256_add_epi32(tmp0_32_lo, res0_32_256); + res4_32_256 = _mm256_add_epi32(tmp0_32_hi, res4_32_256); + res8_32_256 = _mm256_add_epi32(tmp1_32_lo, res8_32_256); + res12_32_256 = _mm256_add_epi32(tmp1_32_hi, res12_32_256); + } + + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_32_256 = _mm256_srai_epi32(res0_32_256, interim_shift); + res4_32_256 = _mm256_srai_epi32(res4_32_256, interim_shift); + res8_32_256 = _mm256_srai_epi32(res8_32_256, interim_shift); + res12_32_256 = _mm256_srai_epi32(res12_32_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); + + res0_32_256 = _mm256_packs_epi32(res0_32_256, res4_32_256); + res8_32_256 = _mm256_packs_epi32(res8_32_256, res12_32_256); + + _mm256_store_si256((__m256i*)(tmp + j), res0_256); + _mm256_store_si256((__m256i*)(tmp + j + 16), res8_256); + _mm256_store_si256((__m256i*)(tmp + j + 32), res0_32_256); + _mm256_store_si256((__m256i*)(tmp + j + 48), res8_32_256); + } + + for (; j < width_rem_size32; j+=32){ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); //Here the normal loop is executed where ii = i - fwidth/2 + fi for (fi = 0; fi < epi_last_i; fi++){ @@ -1900,21 +2592,21 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i d16 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo = _mm256_mullo_epi16(d0, coef); - mul0_hi = _mm256_mulhi_epi16(d0, coef); - mul1_lo = _mm256_mullo_epi16(d16, coef); - mul1_hi = _mm256_mulhi_epi16(d16, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d16, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d16, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = 
_mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); - - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); } //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) @@ -1925,100 +2617,83 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m256i d16 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j + 16)); __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo = _mm256_mullo_epi16(d0, coef); - mul0_hi = _mm256_mulhi_epi16(d0, coef); - mul1_lo = _mm256_mullo_epi16(d16, coef); - mul1_hi = _mm256_mulhi_epi16(d16, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d16, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d16, coef); // regroup the 2 parts of the result - tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); - tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); - tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); - tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); - res0 = _mm256_add_epi32(tmp0_lo, res0); - res4 = _mm256_add_epi32(tmp0_hi, res4); - res8 = _mm256_add_epi32(tmp1_lo, res8); - res12 = _mm256_add_epi32(tmp1_hi, res12); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); } - res0 = _mm256_srai_epi32(res0, interim_shift); - res4 = _mm256_srai_epi32(res4, interim_shift); - res8 = _mm256_srai_epi32(res8, interim_shift); - res12 = _mm256_srai_epi32(res12, interim_shift); + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); + res8_256 = _mm256_srai_epi32(res8_256,interim_shift); + res12_256 = _mm256_srai_epi32(res12_256,interim_shift); - res0 = _mm256_packs_epi32(res0, res4); - res8 = _mm256_packs_epi32(res8, res12); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); + res8_256 = _mm256_packs_epi32(res8_256,res12_256); - _mm256_store_si256((__m256i*)(tmp + j), res0); - _mm256_store_si256((__m256i*)(tmp + j + 16), res8); + _mm256_store_si256((__m256i*)(tmp + j), res0_256); + _mm256_store_si256((__m256i*)(tmp + j + 16), res8_256); } for (; j < width_rem_size16; j+=16){ - res0_sse = res4_sse = res8_sse = res12_sse = _mm_set1_epi32(interim_rnd); + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); //Here the normal loop is executed where ii = i - fwidth/2 + fi for (fi = 0; fi < epi_last_i; fi++){ - ii = diff_i_halffw + fi; - __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd 
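Like the 8-bit path, the high-bit-depth path steps down through progressively narrower widths (64, 32, 16, then 8 samples per iteration) before the scalar tail, so most of a row is handled by the widest loop and only a short remainder pays the narrow-vector cost. A skeleton of that dispatch shape (the real bodies are the loops above):

/* Width-tiered loop structure used by the filters in these files. */
static void filter_row_tiered(int width)
{
    int j = 0;
    int w64 = width - (width % 64);
    int w32 = width - (width % 32);
    int w16 = width - (width % 16);
    int w8  = width - (width % 8);

    for (; j < w64; j += 64) { /* widest vector body */ }
    for (; j < w32; j += 32) { /* 32-sample body     */ }
    for (; j < w16; j += 16) { /* 16-sample body     */ }
    for (; j < w8;  j += 8)  { /* 8-sample body      */ }
    for (; j < width; j++)   { /* scalar tail        */ }
}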
+ ii * src_px_stride + j)); - __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); - mul1_lo_sse = _mm_mullo_epi16(d1, coef); - mul1_hi_sse = _mm_mulhi_epi16(d1, coef); + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); - - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); } //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) for ( ; fi < fwidth; fi++) { ii = epi_mir_i - fi; - __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); - __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); - __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_hbd + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); - mul1_lo_sse = _mm_mullo_epi16(d1, coef); - mul1_hi_sse = _mm_mulhi_epi16(d1, coef); + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); - tmp1_lo_sse = _mm_unpacklo_epi16(mul1_lo_sse, mul1_hi_sse); - tmp1_hi_sse = _mm_unpackhi_epi16(mul1_lo_sse, mul1_hi_sse); + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); - res8_sse = _mm_add_epi32(tmp1_lo_sse, res8_sse); - res12_sse = _mm_add_epi32(tmp1_hi_sse, res12_sse); + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); - res8_sse = _mm_srai_epi32(res8_sse, interim_shift); - res12_sse = _mm_srai_epi32(res12_sse, interim_shift); + res0_256 = _mm256_srai_epi32(res0_256,interim_shift); + res4_256 = _mm256_srai_epi32(res4_256,interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - res8_sse = _mm_packs_epi32(res8_sse, res12_sse); + res0_256 = _mm256_packs_epi32(res0_256,res4_256); - __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_sse), res8_sse, 1); - _mm256_store_si256((__m256i*)(tmp + j), res); + _mm256_store_si256((__m256i*)(tmp + j), res0_256); } for (; j < width_rem_size8; j+=8){ - res0_sse = res4_sse = res8_sse = 
res12_sse = _mm_set1_epi32(interim_rnd); + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); //Here the normal loop is executed where ii = i - fwidth/2 + fi for (fi = 0; fi < epi_last_i; fi++){ @@ -2026,15 +2701,16 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt ii = diff_i_halffw + fi; __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); // regroup the 2 parts of the result - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) @@ -2044,21 +2720,21 @@ void integer_spatial_filter_avx2(void *src, spat_fil_output_dtype *dst, int widt __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); - mul0_lo_sse = _mm_mullo_epi16(d0, coef); - mul0_hi_sse = _mm_mulhi_epi16(d0, coef); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); - tmp0_lo_sse = _mm_unpacklo_epi16(mul0_lo_sse, mul0_hi_sse); - tmp0_hi_sse = _mm_unpackhi_epi16(mul0_lo_sse, mul0_hi_sse); + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); - res0_sse = _mm_add_epi32(tmp0_lo_sse, res0_sse); - res4_sse = _mm_add_epi32(tmp0_hi_sse, res4_sse); + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); } - res0_sse = _mm_srai_epi32(res0_sse, interim_shift); - res4_sse = _mm_srai_epi32(res4_sse, interim_shift); + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); - res0_sse = _mm_packs_epi32(res0_sse, res4_sse); - _mm_store_si128((__m128i*)(tmp + j), res0_sse); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); } for (; j < width; j++){ diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.c b/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.c new file mode 100644 index 000000000..48b014db2 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.c @@ -0,0 +1,3389 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include +#include +#include +#include "mem.h" +#include "../offset.h" +#include "../integer_funque_filters.h" +#include + +void integer_funque_dwt2_avx512(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_dst, ptrdiff_t dst_stride, int width, int height) +{ + int dst_px_stride = dst_stride / sizeof(dwt2_dtype); + + /** + * Absolute value of filter coefficients are 1/sqrt(2) + * The filter is handled by multiplying square of coefficients in final stage + * Hence the value becomes 1/2, and this is handled using shifts + * Also extra required out shift is done along with filter shift itself + */ + const int8_t filter_shift = 1 + DWT2_OUT_SHIFT; + const int8_t filter_shift_rnd = 1<<(filter_shift - 1); + + /** + * Last column due to padding the values are left shifted and then right shifted + * Hence using updated shifts. Subtracting 1 due to left shift + */ + const int8_t filter_shift_lcpad = 1 + DWT2_OUT_SHIFT - 1; + const int8_t filter_shift_lcpad_rnd = 1<<(filter_shift_lcpad - 1); + + dwt2_dtype *band_a = dwt2_dst->bands[0]; + dwt2_dtype *band_h = dwt2_dst->bands[1]; + dwt2_dtype *band_v = dwt2_dst->bands[2]; + dwt2_dtype *band_d = dwt2_dst->bands[3]; + + int16_t row_idx0, row_idx1, col_idx0; + + int row0_offset, row1_offset; + + int width_div_2 = width >> 1; // without rounding (last value is handle outside) + int last_col = width & 1; + + int i, j; + + int width_rem_size32 = width_div_2 - (width_div_2 % 32); + int width_rem_size16 = width_div_2 - (width_div_2 % 16); + int width_rem_size8 = width_div_2 - (width_div_2 % 8); + int width_rem_size4 = width_div_2 - (width_div_2 % 4); + + __m512i filter_shift_512 = _mm512_set1_epi32(filter_shift); + __m512i idx_perm_512 = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + __m512i idx_extract_ab_512 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_extract_cd_512 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i idx_perm_256 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + __m256i filter_shift_256 = _mm256_set1_epi32(filter_shift); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i filter_shift_128 = _mm_set1_epi32(filter_shift); + __m128i zero_128 = _mm_setzero_si128(); + + for (i=0; i < (height+1)/2; ++i) + { + row_idx0 = 2*i; + row_idx1 = 2*i+1; + row_idx1 = row_idx1 < height ? 
row_idx1 : 2*i; + row0_offset = (row_idx0)*width; + row1_offset = (row_idx1)*width; + j = 0; + for(; j< width_rem_size32; j+=32) + { + int col_idx0 = (j << 1); + + __m512i src_a_512 = _mm512_loadu_si512((__m512i*)(src + row0_offset + col_idx0)); + __m512i src_b_512 = _mm512_loadu_si512((__m512i*)(src + row1_offset + col_idx0)); + __m512i src2_a_512 = _mm512_loadu_si512((__m512i*)(src + row0_offset + col_idx0 + 32)); + __m512i src2_b_512 = _mm512_loadu_si512((__m512i*)(src + row1_offset + col_idx0 + 32)); + + // Original + //F* F (a + b + c + d) - band A (F*F is 1/2) + //F* F (a - b + c - d) - band H (F*F is 1/2) + //F* F (a + b - c + d) - band V (F*F is 1/2) + //F* F (a - b - c - d) - band D (F*F is 1/2) + + __m512i a_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_a_512)); + __m512i a_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_a_512, 1)); + __m512i b_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_b_512)); + __m512i b_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_b_512, 1)); + __m512i a2_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src2_a_512)); + __m512i a2_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src2_a_512, 1)); + __m512i b2_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src2_b_512)); + __m512i b2_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src2_b_512, 1)); + + __m512i a_p_b_c_p_d_lo = _mm512_add_epi32(a_lo, b_lo); + __m512i a_p_b_c_p_d_hi = _mm512_add_epi32(a_hi, b_hi); + __m512i a_m_b_c_m_d_lo = _mm512_sub_epi32(a_lo, b_lo); + __m512i a_m_b_c_m_d_hi = _mm512_sub_epi32(a_hi, b_hi); + __m512i a_p_b_c_p_d_2_lo = _mm512_add_epi32(a2_lo, b2_lo); + __m512i a_p_b_c_p_d_2_hi = _mm512_add_epi32(a2_hi, b2_hi); + __m512i a_m_b_c_m_d_2_lo = _mm512_sub_epi32(a2_lo, b2_lo); + __m512i a_m_b_c_m_d_2_hi = _mm512_sub_epi32(a2_hi, b2_hi);; + + __m512i a_p_b_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_ab_512, a_p_b_c_p_d_hi); + __m512i c_p_d_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_cd_512, a_p_b_c_p_d_hi); + __m512i a_m_b_512 = _mm512_permutex2var_epi32(a_m_b_c_m_d_lo, idx_extract_ab_512, a_m_b_c_m_d_hi); + __m512i c_m_d_512 = _mm512_permutex2var_epi32(a_m_b_c_m_d_lo, idx_extract_cd_512, a_m_b_c_m_d_hi); + __m512i a_p_b_2_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_2_lo, idx_extract_ab_512, a_p_b_c_p_d_2_hi); + __m512i c_p_d_2_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_2_lo, idx_extract_cd_512, a_p_b_c_p_d_2_hi); + __m512i a_m_b_2_512 = _mm512_permutex2var_epi32(a_m_b_c_m_d_2_lo, idx_extract_ab_512, a_m_b_c_m_d_2_hi); + __m512i c_m_d_2_512 = _mm512_permutex2var_epi32(a_m_b_c_m_d_2_lo, idx_extract_cd_512, a_m_b_c_m_d_2_hi); + + __m512i band_a_512 = _mm512_add_epi32(a_p_b_512, c_p_d_512); + __m512i band_v_512 = _mm512_sub_epi32(a_p_b_512, c_p_d_512); + __m512i band_h_512 = _mm512_add_epi32(a_m_b_512, c_m_d_512); + __m512i band_d_512 = _mm512_sub_epi32(a_m_b_512, c_m_d_512); + __m512i band_a2_512 = _mm512_add_epi32(a_p_b_2_512, c_p_d_2_512); + __m512i band_v2_512 = _mm512_sub_epi32(a_p_b_2_512, c_p_d_2_512); + __m512i band_h2_512 = _mm512_add_epi32(a_m_b_2_512, c_m_d_2_512); + __m512i band_d2_512 = _mm512_sub_epi32(a_m_b_2_512, c_m_d_2_512); + + band_a_512 = _mm512_add_epi32(band_a_512, filter_shift_512); + band_v_512 = _mm512_add_epi32(band_v_512, filter_shift_512); + band_h_512 = _mm512_add_epi32(band_h_512, filter_shift_512); + band_d_512 = _mm512_add_epi32(band_d_512, filter_shift_512); + band_a2_512 = _mm512_add_epi32(band_a2_512, filter_shift_512); + band_h2_512 = _mm512_add_epi32(band_h2_512, 
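Per output column, this 512-bit path computes the same 2x2 butterfly as the scalar tail later in the function: the loads interleave even/odd columns (a, c from the even row; b, d from the odd row), the add/sub produce (a+b, c+d) and (a-b, c-d) in alternating 32-bit lanes, and permutex2var with idx_extract_ab_512/idx_extract_cd_512 de-interleaves them so the four bands become plain lane-wise adds and subtracts. Note that this path adds filter_shift as the rounding term and shifts by filter_shift_rnd; for the case DWT2_OUT_SHIFT == 1 both constants equal 2, so it matches the scalar ((x + filter_shift_rnd) >> filter_shift) form. A one-column scalar sketch (illustrative names):

#include <stdint.h>

/* One 2x2 DWT2 butterfly: a, c are adjacent columns of the even row,
 * b, d the same columns of the odd row; rnd/shift as defined in this file. */
static void dwt2_butterfly(int16_t a, int16_t b, int16_t c, int16_t d,
                           int32_t rnd, int shift, int16_t out[4])
{
    int32_t a_p_b = a + b, a_m_b = a - b;
    int32_t c_p_d = c + d, c_m_d = c - d;
    out[0] = (int16_t)((a_p_b + c_p_d + rnd) >> shift); /* band A */
    out[1] = (int16_t)((a_m_b + c_m_d + rnd) >> shift); /* band H */
    out[2] = (int16_t)((a_p_b - c_p_d + rnd) >> shift); /* band V */
    out[3] = (int16_t)((a_m_b - c_m_d + rnd) >> shift); /* band D */
}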
filter_shift_512); + band_v2_512 = _mm512_add_epi32(band_v2_512, filter_shift_512); + band_d2_512 = _mm512_add_epi32(band_d2_512, filter_shift_512); + + band_a_512 = _mm512_srai_epi32(band_a_512, filter_shift_rnd); + band_a2_512 = _mm512_srai_epi32(band_a2_512, filter_shift_rnd); + band_h_512 = _mm512_srai_epi32(band_h_512, filter_shift_rnd); + band_h2_512 = _mm512_srai_epi32(band_h2_512, filter_shift_rnd); + band_v_512 = _mm512_srai_epi32(band_v_512, filter_shift_rnd); + band_v2_512 = _mm512_srai_epi32(band_v2_512, filter_shift_rnd); + band_d_512 = _mm512_srai_epi32(band_d_512, filter_shift_rnd); + band_d2_512 = _mm512_srai_epi32(band_d2_512, filter_shift_rnd); + + band_a_512 = _mm512_packs_epi32(band_a_512, band_a2_512); + band_h_512 = _mm512_packs_epi32(band_h_512, band_h2_512); + band_v_512 = _mm512_packs_epi32(band_v_512, band_v2_512); + band_d_512 = _mm512_packs_epi32(band_d_512, band_d2_512); + + band_a_512 = _mm512_permutexvar_epi64(idx_perm_512, band_a_512); + band_h_512 = _mm512_permutexvar_epi64(idx_perm_512, band_h_512); + band_v_512 = _mm512_permutexvar_epi64(idx_perm_512, band_v_512); + band_d_512 = _mm512_permutexvar_epi64(idx_perm_512, band_d_512); + + _mm512_storeu_si512((__m512i*)(band_a + i * dst_px_stride + j), band_a_512); + _mm512_storeu_si512((__m512i*)(band_h + i * dst_px_stride + j), band_h_512); + _mm512_storeu_si512((__m512i*)(band_v + i * dst_px_stride + j), band_v_512); + _mm512_storeu_si512((__m512i*)(band_d + i * dst_px_stride + j), band_d_512); + } + + for(; j< width_rem_size16; j+=16) + { + int col_idx0 = (j << 1); + + __m512i src_a_512 = _mm512_loadu_si512((__m512i*)(src + row0_offset + col_idx0)); + __m512i src_b_512 = _mm512_loadu_si512((__m512i*)(src + row1_offset + col_idx0)); + + // Original + //F* F (a + b + c + d) - band A (F*F is 1/2) + //F* F (a - b + c - d) - band H (F*F is 1/2) + //F* F (a + b - c + d) - band V (F*F is 1/2) + //F* F (a - b - c - d) - band D (F*F is 1/2) + + __m512i a_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_a_512)); + __m512i a_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_a_512, 1)); + __m512i b_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_b_512)); + __m512i b_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_b_512, 1)); + + __m512i a_p_b_c_p_d_lo = _mm512_add_epi32(a_lo, b_lo); + __m512i a_p_b_c_p_d_hi = _mm512_add_epi32(a_hi, b_hi); + __m512i a_m_b_c_m_d_lo = _mm512_sub_epi32(a_lo, b_lo); + __m512i a_m_b_c_m_d_hi = _mm512_sub_epi32(a_hi, b_hi); + + __m512i a_p_b_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_ab_512, a_p_b_c_p_d_hi); + __m512i c_p_d_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_cd_512, a_p_b_c_p_d_hi); + __m512i a_m_b_512 = _mm512_permutex2var_epi32(a_m_b_c_m_d_lo, idx_extract_ab_512, a_m_b_c_m_d_hi); + __m512i c_m_d_512 = _mm512_permutex2var_epi32(a_m_b_c_m_d_lo, idx_extract_cd_512, a_m_b_c_m_d_hi); + + __m512i band_a_512 = _mm512_add_epi32(a_p_b_512, c_p_d_512); + __m512i band_v_512 = _mm512_sub_epi32(a_p_b_512, c_p_d_512); + __m512i band_h_512 = _mm512_add_epi32(a_m_b_512, c_m_d_512); + __m512i band_d_512 = _mm512_sub_epi32(a_m_b_512, c_m_d_512); + + band_a_512 = _mm512_add_epi32(band_a_512, filter_shift_512); + band_v_512 = _mm512_add_epi32(band_v_512, filter_shift_512); + band_h_512 = _mm512_add_epi32(band_h_512, filter_shift_512); + band_d_512 = _mm512_add_epi32(band_d_512, filter_shift_512); + + band_a_512 = _mm512_srai_epi32(band_a_512, filter_shift_rnd); + band_h_512 = _mm512_srai_epi32(band_h_512, filter_shift_rnd); + band_v_512 = 
_mm512_srai_epi32(band_v_512, filter_shift_rnd); + band_d_512 = _mm512_srai_epi32(band_d_512, filter_shift_rnd); + + band_a_512 = _mm512_packs_epi32(band_a_512, zero_512); + band_h_512 = _mm512_packs_epi32(band_h_512, zero_512); + band_v_512 = _mm512_packs_epi32(band_v_512, zero_512); + band_d_512 = _mm512_packs_epi32(band_d_512, zero_512); + + band_a_512 = _mm512_permutexvar_epi64(idx_perm_512, band_a_512); + band_h_512 = _mm512_permutexvar_epi64(idx_perm_512, band_h_512); + band_v_512 = _mm512_permutexvar_epi64(idx_perm_512, band_v_512); + band_d_512 = _mm512_permutexvar_epi64(idx_perm_512, band_d_512); + + _mm256_storeu_si256((__m256i*)(band_a + i * dst_px_stride + j), _mm512_castsi512_si256(band_a_512)); + _mm256_storeu_si256((__m256i*)(band_h + i * dst_px_stride + j), _mm512_castsi512_si256(band_h_512)); + _mm256_storeu_si256((__m256i*)(band_v + i * dst_px_stride + j), _mm512_castsi512_si256(band_v_512)); + _mm256_storeu_si256((__m256i*)(band_d + i * dst_px_stride + j), _mm512_castsi512_si256(band_d_512)); + } + + for(; j< width_rem_size8; j+=8) + { + int col_idx0 = (j << 1); + + __m256i src_a_256 = _mm256_loadu_si256((__m256i*)(src + row0_offset + col_idx0)); + __m256i src_b_256 = _mm256_loadu_si256((__m256i*)(src + row1_offset + col_idx0)); + + // Original + //F* F (a + b + c + d) - band A (F*F is 1/2) + //F* F (a - b + c - d) - band H (F*F is 1/2) + //F* F (a + b - c + d) - band V (F*F is 1/2) + //F* F (a - b - c - d) - band D (F*F is 1/2) + + __m256i a_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_a_256)); + __m256i a_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_a_256, 1)); + __m256i b_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_b_256)); + __m256i b_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_b_256, 1)); + + __m256i a_p_b_c_p_d_lo = _mm256_add_epi32(a_lo, b_lo); + __m256i a_p_b_c_p_d_hi = _mm256_add_epi32(a_hi, b_hi); + __m256i a_m_b_c_m_d_lo = _mm256_sub_epi32(a_lo, b_lo); + __m256i a_m_b_c_m_d_hi = _mm256_sub_epi32(a_hi, b_hi); + + __m256i band_a_256 = _mm256_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + __m256i band_v_256 = _mm256_hsub_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + __m256i band_h_256 = _mm256_hadd_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); + __m256i band_d_256 = _mm256_hsub_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); + + band_a_256 = _mm256_add_epi32(band_a_256, filter_shift_256); + band_v_256 = _mm256_add_epi32(band_v_256, filter_shift_256); + band_h_256 = _mm256_add_epi32(band_h_256, filter_shift_256); + band_d_256 = _mm256_add_epi32(band_d_256, filter_shift_256); + + band_a_256 = _mm256_srai_epi32(band_a_256, filter_shift_rnd); + band_h_256 = _mm256_srai_epi32(band_h_256, filter_shift_rnd); + band_v_256 = _mm256_srai_epi32(band_v_256, filter_shift_rnd); + band_d_256 = _mm256_srai_epi32(band_d_256, filter_shift_rnd); + + band_a_256 = _mm256_packs_epi32(band_a_256, zero_256); + band_h_256 = _mm256_packs_epi32(band_h_256, zero_256); + band_v_256 = _mm256_packs_epi32(band_v_256, zero_256); + band_d_256 = _mm256_packs_epi32(band_d_256, zero_256); + + band_a_256 = _mm256_permutevar8x32_epi32(band_a_256, idx_perm_256); + band_h_256 = _mm256_permutevar8x32_epi32(band_h_256, idx_perm_256); + band_v_256 = _mm256_permutevar8x32_epi32(band_v_256, idx_perm_256); + band_d_256 = _mm256_permutevar8x32_epi32(band_d_256, idx_perm_256); + + _mm_storeu_si128((__m128i*)(band_a + i * dst_px_stride + j), _mm256_castsi256_si128(band_a_256)); + _mm_storeu_si128((__m128i*)(band_h + i * dst_px_stride + j), _mm256_castsi256_si128(band_h_256)); + 
_mm_storeu_si128((__m128i*)(band_v + i * dst_px_stride + j), _mm256_castsi256_si128(band_v_256)); + _mm_storeu_si128((__m128i*)(band_d + i * dst_px_stride + j), _mm256_castsi256_si128(band_d_256)); + } + + for(; j< width_rem_size4; j+=4) + { + int col_idx0 = (j << 1); + + __m128i src_a_128 = _mm_loadu_si128((__m128i*)(src + row0_offset + col_idx0)); + __m128i src_b_128 = _mm_loadu_si128((__m128i*)(src + row1_offset + col_idx0)); + + // Original + //F* F (a + b + c + d) - band A (F*F is 1/2) + //F* F (a - b + c - d) - band H (F*F is 1/2) + //F* F (a + b - c + d) - band V (F*F is 1/2) + //F* F (a - b - c - d) - band D (F*F is 1/2) + + __m128i a_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_a_128, zero_128)); + __m128i a_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_a_128, zero_128)); + __m128i b_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_b_128, zero_128)); + __m128i b_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_b_128, zero_128)); + + __m128i a_p_b_c_p_d_lo = _mm_add_epi32(a_lo, b_lo); + __m128i a_p_b_c_p_d_hi = _mm_add_epi32(a_hi, b_hi); + __m128i a_m_b_c_m_d_lo = _mm_sub_epi32(a_lo, b_lo); + __m128i a_m_b_c_m_d_hi = _mm_sub_epi32(a_hi, b_hi); + + __m128i band_a_128 = _mm_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + __m128i band_h_128 = _mm_hadd_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); + __m128i band_v_128 = _mm_hsub_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + __m128i band_d_128 = _mm_hsub_epi32(a_m_b_c_m_d_lo, a_m_b_c_m_d_hi); + + band_a_128 = _mm_add_epi32(band_a_128, filter_shift_128); + band_h_128 = _mm_add_epi32(band_h_128, filter_shift_128); + band_v_128 = _mm_add_epi32(band_v_128, filter_shift_128); + band_d_128 = _mm_add_epi32(band_d_128, filter_shift_128); + + band_a_128 = _mm_srai_epi32(band_a_128, filter_shift_rnd); + band_h_128 = _mm_srai_epi32(band_h_128, filter_shift_rnd); + band_v_128 = _mm_srai_epi32(band_v_128, filter_shift_rnd); + band_d_128 = _mm_srai_epi32(band_d_128, filter_shift_rnd); + + band_a_128 = _mm_packs_epi32(band_a_128, zero_128); + band_h_128 = _mm_packs_epi32(band_h_128, zero_128); + band_v_128 = _mm_packs_epi32(band_v_128, zero_128); + band_d_128 = _mm_packs_epi32(band_d_128, zero_128); + + _mm_storel_epi64((__m128i*)(band_a + i * dst_px_stride + j), band_a_128); + _mm_storel_epi64((__m128i*)(band_h + i * dst_px_stride + j), band_h_128); + _mm_storel_epi64((__m128i*)(band_v + i * dst_px_stride + j), band_v_128); + _mm_storel_epi64((__m128i*)(band_d + i * dst_px_stride + j), band_d_128); + } + + for(; j< width_div_2; ++j) + { + int col_idx0 = (j << 1); + int col_idx1 = (j << 1) + 1; + + // a & b 2 values in adjacent rows at the same coloumn + spat_fil_output_dtype src_a = src[row0_offset+ col_idx0]; + spat_fil_output_dtype src_b = src[row1_offset+ col_idx0]; + + // c & d are adjacent values to a & b in teh same row + spat_fil_output_dtype src_c = src[row0_offset + col_idx1]; + spat_fil_output_dtype src_d = src[row1_offset + col_idx1]; + + //a + b & a - b + int32_t src_a_p_b = src_a + src_b; + int32_t src_a_m_b = src_a - src_b; + + //c + d & c - d + int32_t src_c_p_d = src_c + src_d; + int32_t src_c_m_d = src_c - src_d; + + //F* F (a + b + c + d) - band A (F*F is 1/2) + band_a[i*dst_px_stride+j] = (dwt2_dtype) (((src_a_p_b + src_c_p_d) + filter_shift_rnd) >> filter_shift); + + //F* F (a - b + c - d) - band H (F*F is 1/2) + band_h[i*dst_px_stride+j] = (dwt2_dtype) (((src_a_m_b + src_c_m_d) + filter_shift_rnd) >> filter_shift); + + //F* F (a + b - c + d) - band V (F*F is 1/2) + band_v[i*dst_px_stride+j] = (dwt2_dtype) (((src_a_p_b - src_c_p_d) + 
filter_shift_rnd) >> filter_shift); + + //F* F (a - b - c - d) - band D (F*F is 1/2) + band_d[i*dst_px_stride+j] = (dwt2_dtype) (((src_a_m_b - src_c_m_d) + filter_shift_rnd) >> filter_shift); + } + + if(last_col) + { + col_idx0 = width_div_2 << 1; + j = width_div_2; + + // a & b 2 values in adjacent rows at the last coloumn + spat_fil_output_dtype src_a = src[row0_offset+ col_idx0]; + spat_fil_output_dtype src_b = src[row1_offset+ col_idx0]; + + //a + b & a - b + int src_a_p_b = src_a + src_b; + int src_a_m_b = src_a - src_b; + + //F* F (a + b + a + b) - band A (F*F is 1/2) + band_a[i*dst_px_stride+j] = (dwt2_dtype) ((src_a_p_b + filter_shift_lcpad_rnd) >> filter_shift_lcpad); + + //F* F (a - b + a - b) - band H (F*F is 1/2) + band_h[i*dst_px_stride+j] = (dwt2_dtype) ((src_a_m_b + filter_shift_lcpad_rnd) >> filter_shift_lcpad); + + //F* F (a + b - (a + b)) - band V, Last column V will always be 0 + band_v[i*dst_px_stride+j] = 0; + + //F* F (a - b - (a -b)) - band D, Last column D will always be 0 + band_d[i*dst_px_stride+j] = 0; + } + } +} + +void integer_funque_vifdwt2_band0_avx512(dwt2_dtype *src, dwt2_dtype *band_a, ptrdiff_t dst_stride, int width, int height) +{ + int dst_px_stride = dst_stride / sizeof(dwt2_dtype); + + /** + * Absolute value of filter coefficients are 1/sqrt(2) + * The filter is handled by multiplying square of coefficients in final stage + * Hence the value becomes 1/2, and this is handled using shifts + * Also extra required out shift is done along with filter shift itself + */ + const int8_t filter_shift = 1 + DWT2_OUT_SHIFT; + const int8_t filter_shift_rnd = 1<<(filter_shift - 1); + + /** + * Last column due to padding the values are left shifted and then right shifted + * Hence using updated shifts. Subtracting 1 due to left shift + */ + const int8_t filter_shift_lcpad = 1 + DWT2_OUT_SHIFT - 1; + const int8_t filter_shift_lcpad_rnd = 1<<(filter_shift_lcpad - 1); + + int16_t row_idx0, row_idx1, col_idx0; + // int16_t col_idx1; + int row0_offset, row1_offset; + // int64_t accum; + int width_div_2 = width >> 1; // without rounding (last value is handle outside) + int last_col = width & 1; + + int i, j; + + int width_rem_size32 = width_div_2 - (width_div_2 % 32); + int width_rem_size16 = width_div_2 - (width_div_2 % 16); + int width_rem_size8 = width_div_2 - (width_div_2 % 8); + int width_rem_size4 = width_div_2 - (width_div_2 % 4); + + __m512i idx_extract_ab_512 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_extract_cd_512 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + __m512i filter_shift_512 = _mm512_set1_epi32(filter_shift); + __m512i idx_perm_512 = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i filter_shift_256 = _mm256_set1_epi32(filter_shift); + __m256i idx_perm_256 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i filter_shift_128 = _mm_set1_epi32(filter_shift); + __m128i zero_128 = _mm_setzero_si128(); + + for (i=0; i < (height+1)/2; ++i) + { + row_idx0 = 2*i; + row_idx1 = 2*i+1; + row_idx1 = row_idx1 < height ? 
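For an odd width, the padded last column duplicates the (a, b) pair, so band A there is 2*(a+b) before scaling; folding that factor of 2 into the shift is why filter_shift_lcpad is one less than filter_shift, and why bands V and D collapse to zero. The same last-column handling and loop tiering are reused by integer_funque_vifdwt2_band0_avx512, which computes only the approximation band. A minimal sketch of the last-column case (illustrative name):

#include <stdint.h>

/* Right-padded column: c == a and d == b, so A = 2*(a+b) before the 1/2 scaling.
 * The factor 2 is folded into the shift (filter_shift_lcpad = filter_shift - 1). */
static int16_t dwt2_band_a_last_col(int16_t a, int16_t b, int filter_shift)
{
    int32_t a_p_b = a + b;
    int32_t rnd_lcpad = 1 << (filter_shift - 2);   /* 1 << (filter_shift_lcpad - 1) */
    return (int16_t)((a_p_b + rnd_lcpad) >> (filter_shift - 1));
}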
row_idx1 : 2*i; + row0_offset = (row_idx0)*width; + row1_offset = (row_idx1)*width; + j=0; + + for(; j< width_rem_size32; j+=32) + { + int col_idx0 = (j << 1); + __m512i src_a_512 = _mm512_loadu_si512((__m512i*)(src + row0_offset + col_idx0)); + __m512i src_b_512 = _mm512_loadu_si512((__m512i*)(src + row1_offset + col_idx0)); + __m512i src2_a_512 = _mm512_loadu_si512((__m512i*)(src + row0_offset + col_idx0 + 32)); + __m512i src2_b_512 = _mm512_loadu_si512((__m512i*)(src + row1_offset + col_idx0 + 32)); + + __m512i a_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_a_512)); + __m512i a_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_a_512, 1)); + __m512i b_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_b_512)); + __m512i b_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_b_512, 1)); + __m512i a2_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src2_a_512)); + __m512i a2_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src2_a_512, 1)); + __m512i b2_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src2_b_512)); + __m512i b2_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src2_b_512, 1)); + + __m512i a_p_b_c_p_d_lo = _mm512_add_epi32(a_lo, b_lo); + __m512i a_p_b_c_p_d_hi = _mm512_add_epi32(a_hi, b_hi); + __m512i a_p_b_c_p_d_2_lo = _mm512_add_epi32(a2_lo, b2_lo); + __m512i a_p_b_c_p_d_2_hi = _mm512_add_epi32(a2_hi, b2_hi); + + __m512i band_a_ab_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_ab_512, a_p_b_c_p_d_hi); + __m512i band_a_cd_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_cd_512, a_p_b_c_p_d_hi); + __m512i band_a_ab_2_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_2_lo, idx_extract_ab_512, a_p_b_c_p_d_2_hi); + __m512i band_a_cd_2_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_2_lo, idx_extract_cd_512, a_p_b_c_p_d_2_hi); + + __m512i band_a_512 = _mm512_add_epi32(band_a_ab_512, band_a_cd_512); + __m512i band_a_2_512 = _mm512_add_epi32(band_a_ab_2_512, band_a_cd_2_512); + + band_a_512 = _mm512_add_epi32(band_a_512, filter_shift_512); + band_a_2_512 = _mm512_add_epi32(band_a_2_512, filter_shift_512); + + band_a_512 = _mm512_srai_epi32(band_a_512, filter_shift_rnd); + band_a_2_512 = _mm512_srai_epi32(band_a_2_512, filter_shift_rnd); + + band_a_512 = _mm512_packs_epi32(band_a_512, band_a_2_512); + band_a_512 = _mm512_permutexvar_epi64(idx_perm_512, band_a_512); + + _mm512_storeu_si512((__m512i*)(band_a + i * dst_px_stride + j), band_a_512); + } + + for(; j< width_rem_size16; j+=16) + { + int col_idx0 = (j << 1); + __m512i src_a_512 = _mm512_loadu_si512((__m512i*)(src + row0_offset + col_idx0)); + __m512i src_b_512 = _mm512_loadu_si512((__m512i*)(src + row1_offset + col_idx0)); + + __m512i a_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_a_512)); + __m512i a_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_a_512, 1)); + __m512i b_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(src_b_512)); + __m512i b_hi = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(src_b_512, 1)); + + __m512i a_p_b_c_p_d_lo = _mm512_add_epi32(a_lo, b_lo); + __m512i a_p_b_c_p_d_hi = _mm512_add_epi32(a_hi, b_hi); + + __m512i band_a_ab_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_ab_512, a_p_b_c_p_d_hi); + __m512i band_a_cd_512 = _mm512_permutex2var_epi32(a_p_b_c_p_d_lo, idx_extract_cd_512, a_p_b_c_p_d_hi); + __m512i band_a_512 = _mm512_add_epi32(band_a_ab_512, band_a_cd_512); + + band_a_512 = _mm512_add_epi32(band_a_512, filter_shift_512); + band_a_512 = _mm512_srai_epi32(band_a_512, filter_shift_rnd); + band_a_512 = 
_mm512_packs_epi32(band_a_512, zero_512); + band_a_512 = _mm512_permutexvar_epi64(idx_perm_512, band_a_512); + + _mm256_storeu_si256((__m256i*)(band_a + i * dst_px_stride + j), _mm512_castsi512_si256(band_a_512)); + } + + for(; j< width_rem_size8; j+=8) + { + int col_idx0 = (j << 1); + + __m256i src_a_256 = _mm256_loadu_si256((__m256i*)(src + row0_offset + col_idx0)); + __m256i src_b_256 = _mm256_loadu_si256((__m256i*)(src + row1_offset + col_idx0)); + + __m256i a_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_a_256)); + __m256i a_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_a_256, 1)); + __m256i b_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(src_b_256)); + __m256i b_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(src_b_256, 1)); + + __m256i a_p_b_c_p_d_lo = _mm256_add_epi32(a_lo, b_lo); + __m256i a_p_b_c_p_d_hi = _mm256_add_epi32(a_hi, b_hi); + + __m256i band_a_256 = _mm256_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + band_a_256 = _mm256_add_epi32(band_a_256, filter_shift_256); + band_a_256 = _mm256_srai_epi32(band_a_256, filter_shift_rnd); + band_a_256 = _mm256_packs_epi32(band_a_256, zero_256); + + band_a_256 = _mm256_permutevar8x32_epi32(band_a_256, idx_perm_256); + _mm_storeu_si128((__m128i*)(band_a + i * dst_px_stride + j), _mm256_castsi256_si128(band_a_256)); + } + + for(; j< width_rem_size4; j+=4) + { + int col_idx0 = (j << 1); + + __m128i src_a_128 = _mm_loadu_si128((__m128i*)(src + row0_offset + col_idx0)); + __m128i src_b_128 = _mm_loadu_si128((__m128i*)(src + row1_offset + col_idx0)); + + __m128i a_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_a_128, zero_128)); + __m128i a_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_a_128, zero_128)); + __m128i b_lo = _mm_cvtepi16_epi32( _mm_unpacklo_epi64(src_b_128, zero_128)); + __m128i b_hi = _mm_cvtepi16_epi32( _mm_unpackhi_epi64(src_b_128, zero_128)); + + __m128i a_p_b_c_p_d_lo = _mm_add_epi32(a_lo, b_lo); + __m128i a_p_b_c_p_d_hi = _mm_add_epi32(a_hi, b_hi); + + __m128i band_a_128 = _mm_hadd_epi32(a_p_b_c_p_d_lo, a_p_b_c_p_d_hi); + band_a_128 = _mm_add_epi32(band_a_128, filter_shift_128); + band_a_128 = _mm_srai_epi32(band_a_128, filter_shift_rnd); + band_a_128 = _mm_packs_epi32(band_a_128, zero_128); + + _mm_storel_epi64((__m128i*)(band_a + i * dst_px_stride + j), band_a_128); + } + + for(; j< width_div_2; ++j) + { + int col_idx0 = (j << 1); + int col_idx1 = (j << 1) + 1; + + // a & b 2 values in adjacent rows at the same coloumn + spat_fil_output_dtype src_a = src[row0_offset+ col_idx0]; + spat_fil_output_dtype src_b = src[row1_offset+ col_idx0]; + + // c & d are adjacent values to a & b in teh same row + spat_fil_output_dtype src_c = src[row0_offset + col_idx1]; + spat_fil_output_dtype src_d = src[row1_offset + col_idx1]; + + //a + b & a - b + int32_t src_a_p_b = src_a + src_b; + // int32_t src_a_m_b = src_a - src_b; + + //c + d & c - d + int32_t src_c_p_d = src_c + src_d; + // int32_t src_c_m_d = src_c - src_d; + + //F* F (a + b + c + d) - band A (F*F is 1/2) + band_a[i*dst_px_stride+j] = (dwt2_dtype) (((src_a_p_b + src_c_p_d) + filter_shift_rnd) >> filter_shift); + } + + if(last_col) + { + col_idx0 = width_div_2 << 1; + j = width_div_2; + + // a & b 2 values in adjacent rows at the last coloumn + spat_fil_output_dtype src_a = src[row0_offset+ col_idx0]; + spat_fil_output_dtype src_b = src[row1_offset+ col_idx0]; + + //a + b & a - b + int src_a_p_b = src_a + src_b; + + //F* F (a + b + a + b) - band A (F*F is 1/2) + band_a[i*dst_px_stride+j] = (dwt2_dtype) ((src_a_p_b + filter_shift_lcpad_rnd) >> 
filter_shift_lcpad); + } + } +} + +/** + * This function applies intermediate horizontal pass filter inside spatial filter + */ + +static void integer_horizontal_filter_avx512(spat_fil_inter_dtype *tmp, spat_fil_output_dtype *dst, const spat_fil_coeff_dtype *i_filter_coeffs, int width, int fwidth, int dst_row_idx, int half_fw) +{ + int j, fj, jj1, jj2; + __m512i res0_512, res4_512, res8_512, res12_512; + __m256i res0_256, res4_256, res8_256, res12_256; + __m128i res0_128, res4_128, res8_128, res12_128; + + int width_rem_size128 = (width - half_fw) - ((width - 2*half_fw) % 128); + int width_rem_size64 = (width - half_fw) - ((width - 2*half_fw) % 64); + int width_rem_size32 = (width - half_fw) - ((width - 2*half_fw) % 32); + int width_rem_size16 = (width - half_fw) - ((width - 2*half_fw) % 16); + int width_rem_size8 = (width - half_fw) - ((width - 2*half_fw) % 8); + + const spat_fil_coeff_dtype i_filter_coeffs_with_zeros[83] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -900, -1054, -1239, -1452, -1669, -1798, -1547, -66, 4677, 14498, 21495, + 14498, 4677, -66, -1547, -1798, -1669, -1452, -1239, -1054, -900, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + const spat_fil_accum_dtype i32_filter_coeffs[11] = { + -900 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), + -1239 + (spat_fil_accum_dtype)(((unsigned int)-1452) << 16) + (1 << 16), + -1669 + (spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), + -1547 + (spat_fil_accum_dtype)(((unsigned int)-66) << 16) + (1 << 16), + 4677 + (14498 << 16) /* + (1 << 16) */, + 21495 + (14498 << 16) /* + (1 << 16) */, + 4677 + (spat_fil_accum_dtype)(((unsigned int)-66) << 16) /* + (1 << 16) */, + -1547 + (spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), + -1669 + (spat_fil_accum_dtype)(((unsigned int)-1452) << 16) + (1 << 16), + -1239 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), + -900 + (1 << 16) + }; + (void)fwidth; + + __m512i d0_512 = _mm512_loadu_si512((__m512i*)(tmp)); + int half_filter_table_w2 = 41; + + for(j = 0; j < half_fw; j++) + { + int fi0 = half_filter_table_w2 - j; + int fi1 = j + half_filter_table_w2 + 1; + __m512i coef1 = _mm512_loadu_si512((__m512i*)(i_filter_coeffs_with_zeros + fi1)); + __m512i coef0 = _mm512_loadu_si512((__m512i*)(i_filter_coeffs_with_zeros + fi0)); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_512, coef0); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_512, coef0); + + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_512, coef1); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_512, coef1); + + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + __m512i tmp1_lo = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + tmp0_lo = _mm512_add_epi32(tmp0_lo, tmp0_hi); + tmp0_hi = _mm512_add_epi32(tmp1_lo, tmp1_hi); + + __m512i res0 = _mm512_add_epi32(tmp0_lo, tmp0_hi); + __m256i r8 = _mm256_add_epi32(_mm512_castsi512_si256(res0), _mm512_extracti32x8_epi32(res0, 1)); + __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(r8), _mm256_extracti128_si256(r8, 1)); + __m128i r2 = _mm_hadd_epi32(r4, r4); + __m128i r1 = _mm_hadd_epi32(r2, r2); + dst[dst_row_idx + j] = ((_mm_cvtsi128_si32(r1) + SPAT_FILTER_OUT_RND) >> SPAT_FILTER_OUT_SHIFT); + } + + __m512i coef0_512 = _mm512_set1_epi16(i_filter_coeffs[0]); + //This is the 
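Two coefficient tables drive the horizontal pass. i_filter_coeffs_with_zeros embeds the 21 symmetric taps between runs of zeros so that a single unaligned 32-element load yields a coefficient window in which out-of-range taps contribute nothing; the border loops dot the first (or last) 32 interim samples against two such windows, one for the taps that fall inside the row and one for the mirrored taps, and then reduce horizontally. i32_filter_coeffs packs consecutive tap pairs into 32-bit lanes for madd_epi16; when the low tap is negative, its two's-complement low half effectively borrows 1 from the high half, which is why (1 << 16) is added back on those entries. A small sketch of that packing rule (hypothetical helper, not in the patch):

#include <stdint.h>

/* Pack two signed 16-bit taps (c0 in the low half, c1 in the high half) for
 * madd_epi16. A negative c0 borrows 1 from the high half, so 1 << 16 is added
 * back -- the same correction applied in the i32_filter_coeffs tables. */
static int32_t pack_taps(int16_t c0, int16_t c1)
{
    /* 64-bit intermediate keeps the sketch free of shift/overflow pitfalls. */
    int64_t packed = (int64_t)c0 + ((int64_t)c1 << 16);
    if (c0 < 0)
        packed += 1 << 16;
    /* Now (int16_t)(packed & 0xFFFF) == c0 and (int16_t)((packed >> 16) & 0xFFFF) == c1. */
    return (int32_t)packed;
}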
core loop + for (; j < width_rem_size128; j+=128) + { + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(SPAT_FILTER_OUT_RND); + __m512i res16_512, res20_512, res24_512, res28_512; + res16_512 = res20_512 = res24_512 = res28_512 = _mm512_set1_epi32(SPAT_FILTER_OUT_RND); + + for (fj = 0; fj < half_fw; fj+=2){ + jj1 = f_l_j + fj*2; + + __m512i coef0 = _mm512_set1_epi32(i32_filter_coeffs[fj]); + __m512i coef1 = _mm512_set1_epi32(i32_filter_coeffs[fj+1]); + + __m512i d0 = _mm512_loadu_si512((__m512i*)(tmp + jj1)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 2)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 1)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 3)); + __m512i d0_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 32)); + __m512i d2_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 34)); + __m512i d1_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 33)); + __m512i d3_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 35)); + + __m512i d0_64 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 64)); + __m512i d2_64 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 2 + 64)); + __m512i d1_64 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 1 + 64)); + __m512i d3_64 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 3 + 64)); + __m512i d0_96 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 32 + 64)); + __m512i d2_96 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 34 + 64)); + __m512i d1_96 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 33 + 64)); + __m512i d3_96 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 35 + 64)); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(d0, coef0)); + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(d2, coef1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(d1, coef0)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(d3, coef1)); + + res8_512 = _mm512_add_epi32(res8_512, _mm512_madd_epi16(d0_32, coef0)); + res8_512 = _mm512_add_epi32(res8_512, _mm512_madd_epi16(d2_32, coef1)); + res12_512 = _mm512_add_epi32(res12_512, _mm512_madd_epi16(d1_32, coef0)); + res12_512 = _mm512_add_epi32(res12_512, _mm512_madd_epi16(d3_32, coef1)); + + res16_512 = _mm512_add_epi32(res16_512, _mm512_madd_epi16(d0_64, coef0)); + res16_512 = _mm512_add_epi32(res16_512, _mm512_madd_epi16(d2_64, coef1)); + res20_512 = _mm512_add_epi32(res20_512, _mm512_madd_epi16(d1_64, coef0)); + res20_512 = _mm512_add_epi32(res20_512, _mm512_madd_epi16(d3_64, coef1)); + + res24_512 = _mm512_add_epi32(res24_512, _mm512_madd_epi16(d0_96, coef0)); + res24_512 = _mm512_add_epi32(res24_512, _mm512_madd_epi16(d2_96, coef1)); + res28_512 = _mm512_add_epi32(res28_512, _mm512_madd_epi16(d1_96, coef0)); + res28_512 = _mm512_add_epi32(res28_512, _mm512_madd_epi16(d3_96, coef1)); + + } + __m512i d0 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j)); + __m512i d0_32 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j + 32)); + __m512i d0_64 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j + 64)); + __m512i d0_96 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j + 96)); + + __m512i tmp0 = _mm512_unpacklo_epi32(res0_512, res4_512); + __m512i tmp4 = _mm512_unpackhi_epi32(res0_512, res4_512); + __m512i tmp8 = _mm512_unpacklo_epi32(res8_512, res12_512); + __m512i tmp12 = _mm512_unpackhi_epi32(res8_512, res12_512); + __m512i tmp16 = _mm512_unpacklo_epi32(res16_512, res20_512); + __m512i tmp20 = _mm512_unpackhi_epi32(res16_512, res20_512); + __m512i tmp24 = _mm512_unpacklo_epi32(res24_512, res28_512); + __m512i tmp28 = _mm512_unpackhi_epi32(res24_512, 
res28_512); + + __m512i mul0_lo = _mm512_mullo_epi16(d0, coef0_512); + __m512i mul0_hi = _mm512_mulhi_epi16(d0, coef0_512); + __m512i mul0_32_lo = _mm512_mullo_epi16(d0_32, coef0_512); + __m512i mul0_32_hi = _mm512_mulhi_epi16(d0_32, coef0_512); + __m512i mul0_64_lo = _mm512_mullo_epi16(d0_64, coef0_512); + __m512i mul0_64_hi = _mm512_mulhi_epi16(d0_64, coef0_512); + __m512i mul0_96_lo = _mm512_mullo_epi16(d0_96, coef0_512); + __m512i mul0_96_hi = _mm512_mulhi_epi16(d0_96, coef0_512); + + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo, mul0_hi); + __m512i tmp0_hi = _mm512_unpackhi_epi16(mul0_lo, mul0_hi); + __m512i tmp0_32_lo = _mm512_unpacklo_epi16(mul0_32_lo, mul0_32_hi); + __m512i tmp0_32_hi = _mm512_unpackhi_epi16(mul0_32_lo, mul0_32_hi); + + __m512i tmp0_64_lo = _mm512_unpacklo_epi16(mul0_64_lo, mul0_64_hi); + __m512i tmp0_64_hi = _mm512_unpackhi_epi16(mul0_64_lo, mul0_64_hi); + __m512i tmp0_96_lo = _mm512_unpacklo_epi16(mul0_96_lo, mul0_96_hi); + __m512i tmp0_96_hi = _mm512_unpackhi_epi16(mul0_96_lo, mul0_96_hi); + + tmp0 = _mm512_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm512_add_epi32(tmp4, tmp0_hi); + tmp8 = _mm512_add_epi32(tmp8, tmp0_32_lo); + tmp12 = _mm512_add_epi32(tmp12, tmp0_32_hi); + tmp16 = _mm512_add_epi32(tmp16, tmp0_64_lo); + tmp20 = _mm512_add_epi32(tmp20, tmp0_64_hi); + tmp24 = _mm512_add_epi32(tmp24, tmp0_96_lo); + tmp28 = _mm512_add_epi32(tmp28, tmp0_96_hi); + + tmp0 = _mm512_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm512_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + tmp8 = _mm512_srai_epi32(tmp8, SPAT_FILTER_OUT_SHIFT); + tmp12 = _mm512_srai_epi32(tmp12, SPAT_FILTER_OUT_SHIFT); + tmp16 = _mm512_srai_epi32(tmp16, SPAT_FILTER_OUT_SHIFT); + tmp20 = _mm512_srai_epi32(tmp20, SPAT_FILTER_OUT_SHIFT); + tmp24 = _mm512_srai_epi32(tmp24, SPAT_FILTER_OUT_SHIFT); + tmp28 = _mm512_srai_epi32(tmp28, SPAT_FILTER_OUT_SHIFT); + + res0_512 = _mm512_packs_epi32(tmp0, tmp4); + res8_512 = _mm512_packs_epi32(tmp8, tmp12); + res16_512 = _mm512_packs_epi32(tmp16, tmp20); + res24_512 = _mm512_packs_epi32(tmp24, tmp28); + + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j), res0_512); + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j + 32), res8_512); + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j + 64), res16_512); + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j + 96), res24_512); + } + + for (; j < width_rem_size64; j+=64) + { + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(SPAT_FILTER_OUT_RND); + + for (fj = 0; fj < half_fw; fj+=2){ + jj1 = f_l_j + fj*2; + + __m512i coef0 = _mm512_set1_epi32(i32_filter_coeffs[fj]); + __m512i coef1 = _mm512_set1_epi32(i32_filter_coeffs[fj+1]); + + __m512i d0 = _mm512_loadu_si512((__m512i*)(tmp + jj1)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 2)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 1)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 3)); + + __m512i d0_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 32)); + __m512i d2_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 34)); + __m512i d1_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 33)); + __m512i d3_32 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 35)); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(d0, coef0)); + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(d2, coef1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(d1, coef0)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(d3, coef1)); + + res8_512 = 
_mm512_add_epi32(res8_512, _mm512_madd_epi16(d0_32, coef0)); + res8_512 = _mm512_add_epi32(res8_512, _mm512_madd_epi16(d2_32, coef1)); + res12_512 = _mm512_add_epi32(res12_512, _mm512_madd_epi16(d1_32, coef0)); + res12_512 = _mm512_add_epi32(res12_512, _mm512_madd_epi16(d3_32, coef1)); + } + __m512i d0 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j)); + __m512i d0_32 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j + 32)); + + __m512i tmp0 = _mm512_unpacklo_epi32(res0_512, res4_512); + __m512i tmp4 = _mm512_unpackhi_epi32(res0_512, res4_512); + __m512i tmp8 = _mm512_unpacklo_epi32(res8_512, res12_512); + __m512i tmp12 = _mm512_unpackhi_epi32(res8_512, res12_512); + + __m512i mul0_lo = _mm512_mullo_epi16(d0, coef0_512); + __m512i mul0_hi = _mm512_mulhi_epi16(d0, coef0_512); + __m512i mul0_32_lo = _mm512_mullo_epi16(d0_32, coef0_512); + __m512i mul0_32_hi = _mm512_mulhi_epi16(d0_32, coef0_512); + + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo, mul0_hi); + __m512i tmp0_hi = _mm512_unpackhi_epi16(mul0_lo, mul0_hi); + __m512i tmp0_32_lo = _mm512_unpacklo_epi16(mul0_32_lo, mul0_32_hi); + __m512i tmp0_32_hi = _mm512_unpackhi_epi16(mul0_32_lo, mul0_32_hi); + + tmp0 = _mm512_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm512_add_epi32(tmp4, tmp0_hi); + tmp8 = _mm512_add_epi32(tmp8, tmp0_32_lo); + tmp12 = _mm512_add_epi32(tmp12, tmp0_32_hi); + + tmp0 = _mm512_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm512_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + tmp8 = _mm512_srai_epi32(tmp8, SPAT_FILTER_OUT_SHIFT); + tmp12 = _mm512_srai_epi32(tmp12, SPAT_FILTER_OUT_SHIFT); + + res0_512 = _mm512_packs_epi32(tmp0, tmp4); + res8_512 = _mm512_packs_epi32(tmp8, tmp12); + + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j), res0_512); + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j + 32), res8_512); + } + + for (; j < width_rem_size32; j+=32) + { + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(SPAT_FILTER_OUT_RND); + for (fj = 0; fj < half_fw; fj+=2){ + jj1 = f_l_j + fj*2; + + __m512i coef0 = _mm512_set1_epi32(i32_filter_coeffs[fj]); + __m512i coef1 = _mm512_set1_epi32(i32_filter_coeffs[fj+1]); + + __m512i d0 = _mm512_loadu_si512((__m512i*)(tmp + jj1)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 2)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 1)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(tmp + jj1 + 3)); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(d0, coef0)); + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(d2, coef1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(d1, coef0)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(d3, coef1)); + } + __m512i d0 = _mm512_loadu_si512((__m512i*)(tmp + f_r_j)); + __m512i tmp0 = _mm512_unpacklo_epi32(res0_512, res4_512); + __m512i tmp4 = _mm512_unpackhi_epi32(res0_512, res4_512); + __m512i mul0_lo = _mm512_mullo_epi16(d0, coef0_512); + __m512i mul0_hi = _mm512_mulhi_epi16(d0, coef0_512); + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo, mul0_hi); + __m512i tmp0_hi = _mm512_unpackhi_epi16(mul0_lo, mul0_hi); + + tmp0 = _mm512_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm512_add_epi32(tmp4, tmp0_hi); + + tmp0 = _mm512_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm512_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + + res0_512 = _mm512_packs_epi32(tmp0, tmp4); + _mm512_storeu_si512((__m512i*)(dst + dst_row_idx + j), res0_512); + } + + __m256i coef0_256 = _mm256_set1_epi16(i_filter_coeffs[0]); + for (; j < width_rem_size16; j+=16) + 
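In these madd-based core loops, each iteration consumes two taps: the 32-bit lanes of the loads at offsets +0/+2 pair adjacent interim samples for the even output columns (accumulated in res0_*), the loads at +1/+3 do the same for the odd columns (res4_*), and the unpacklo/unpackhi_epi32 at the end re-interleave the two accumulators back into column order before the final tap (coefficient i_filter_coeffs[0], equal to the last tap by symmetry) is added via mullo/mulhi. What one output column accumulates, as a scalar sketch (illustrative name; the vertical pass's interim shift is what keeps this sum within 32 bits):

#include <stdint.h>

/* Horizontal accumulation for one output column j: taps 0..19 in pairs, then the
 * 21st tap applied to tmp[j + half_fw]; add SPAT_FILTER_OUT_RND and shift afterwards. */
static int32_t horz_tap_sum(const int16_t *tmp, const int16_t *coeff /* 21 taps */,
                            int j, int half_fw /* 10 */)
{
    int32_t accum = 0;
    int f_l_j = j - half_fw;
    for (int t = 0; t < 2 * half_fw; t += 2)
        accum += coeff[t] * tmp[f_l_j + t] + coeff[t + 1] * tmp[f_l_j + t + 1];
    accum += coeff[2 * half_fw] * tmp[j + half_fw];   /* == coeff[0] by symmetry */
    return accum;
}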
{ + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(SPAT_FILTER_OUT_RND); + + for (fj = 0; fj < half_fw; fj+=2){ + jj1 = f_l_j + fj*2; + + __m256i coef0 = _mm256_set1_epi32(i32_filter_coeffs[fj]); + __m256i coef1 = _mm256_set1_epi32(i32_filter_coeffs[fj+1]); + + __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + jj1)); + __m256i d2 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 2)); + __m256i d1 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 1)); + __m256i d3 = _mm256_loadu_si256((__m256i*)(tmp + jj1 + 3)); + + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(d0, coef0)); + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(d2, coef1)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(d1, coef0)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(d3, coef1)); + } + __m256i d0 = _mm256_loadu_si256((__m256i*)(tmp + f_r_j)); + __m256i tmp0 = _mm256_unpacklo_epi32(res0_256, res4_256); + __m256i tmp4 = _mm256_unpackhi_epi32(res0_256, res4_256); + __m256i mul0_lo = _mm256_mullo_epi16(d0, coef0_256); + __m256i mul0_hi = _mm256_mulhi_epi16(d0, coef0_256); + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + + tmp0 = _mm256_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm256_add_epi32(tmp4, tmp0_hi); + + tmp0 = _mm256_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm256_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + + res0_256 = _mm256_packs_epi32(tmp0, tmp4); + _mm256_storeu_si256((__m256i*)(dst + dst_row_idx + j), res0_256); + } + + __m128i coef0_128 = _mm_set1_epi16(i_filter_coeffs[0]); + for (; j < width_rem_size8; j+=8) + { + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(SPAT_FILTER_OUT_RND); + + for (fj = 0; fj < half_fw; fj+=2){ + jj1 = f_l_j + fj*2; + + __m128i coef0 = _mm_set1_epi32(i32_filter_coeffs[fj]); + __m128i coef1 = _mm_set1_epi32(i32_filter_coeffs[fj+1]); + + __m128i d0 = _mm_loadu_si128((__m128i*)(tmp + jj1)); + __m128i d2 = _mm_loadu_si128((__m128i*)(tmp + jj1 + 2)); + __m128i d1 = _mm_loadu_si128((__m128i*)(tmp + jj1 + 1)); + __m128i d3 = _mm_loadu_si128((__m128i*)(tmp + jj1 + 3)); + + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(d0, coef0)); + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(d2, coef1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(d1, coef0)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(d3, coef1)); + } + __m128i d0 = _mm_loadu_si128((__m128i*)(tmp + f_r_j)); + __m128i tmp0 = _mm_unpacklo_epi32(res0_128, res4_128); + __m128i tmp4 = _mm_unpackhi_epi32(res0_128, res4_128); + __m128i mul0_lo = _mm_mullo_epi16(d0, coef0_128); + __m128i mul0_hi = _mm_mulhi_epi16(d0, coef0_128); + __m128i tmp0_lo = _mm_unpacklo_epi16(mul0_lo, mul0_hi); + __m128i tmp0_hi = _mm_unpackhi_epi16(mul0_lo, mul0_hi); + + tmp0 = _mm_add_epi32(tmp0, tmp0_lo); + tmp4 = _mm_add_epi32(tmp4, tmp0_hi); + tmp0 = _mm_srai_epi32(tmp0, SPAT_FILTER_OUT_SHIFT); + tmp4 = _mm_srai_epi32(tmp4, SPAT_FILTER_OUT_SHIFT); + res0_128 = _mm_packs_epi32(tmp0, tmp4); + + _mm_storeu_si128((__m128i*)(dst + dst_row_idx + j), res0_128); + } + + for (; j < (width - half_fw); j++) + { + int f_l_j = j - half_fw; + int f_r_j = j + half_fw; + spat_fil_accum_dtype accum = 0; + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and 
accumulated outside the loop + */ + for (fj = 0; fj < half_fw; fj++){ + + jj1 = f_l_j + fj; + jj2 = f_r_j - fj; + accum += i_filter_coeffs[fj] * ((spat_fil_accum_dtype)tmp[jj1] + tmp[jj2]); //Since filter coefficients are symmetric + } + accum += (spat_fil_inter_dtype) i_filter_coeffs[half_fw] * tmp[j]; + dst[dst_row_idx + j] = (spat_fil_output_dtype) ((accum + SPAT_FILTER_OUT_RND) >> SPAT_FILTER_OUT_SHIFT); + } + + d0_512 = _mm512_loadu_si512((__m512i*)(tmp + j - 22)); + /** + * This loop is to handle virtual padding of the right border pixels + */ + for(; j < width; j++) + { + int fi0 = half_filter_table_w2 + width - half_fw*3 - j - 2; + int fi1 = j - width + half_fw; + __m512i coef1 = _mm512_loadu_si512((__m512i*)(i_filter_coeffs_with_zeros + fi1)); + __m512i coef0 = _mm512_loadu_si512((__m512i*)(i_filter_coeffs_with_zeros + fi0)); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_512, coef0); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_512, coef0); + + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_512, coef1); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_512, coef1); + + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + __m512i tmp1_lo = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + tmp0_lo = _mm512_add_epi32(tmp0_lo, tmp0_hi); + tmp0_hi = _mm512_add_epi32(tmp1_lo, tmp1_hi); + + __m512i res0 = _mm512_add_epi32(tmp0_lo, tmp0_hi); + __m256i r8 = _mm256_add_epi32(_mm512_castsi512_si256(res0), _mm512_extracti32x8_epi32(res0, 1)); + __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(r8), _mm256_extracti128_si256(r8, 1)); + __m128i r2 = _mm_hadd_epi32(r4, r4); + __m128i r1 = _mm_hadd_epi32(r2, r2); + dst[dst_row_idx + j] = ((_mm_cvtsi128_si32(r1) + SPAT_FILTER_OUT_RND) >> SPAT_FILTER_OUT_SHIFT); + } +} + +void integer_spatial_filter_avx512(void *src, spat_fil_output_dtype *dst, int width, int height, int bitdepth) +{ + const spat_fil_coeff_dtype i_filter_coeffs[21] = { + -900, -1054, -1239, -1452, -1669, -1798, -1547, -66, 4677, 14498, 21495, + 14498, 4677, -66, -1547, -1798, -1669, -1452, -1239, -1054, -900 + }; + + // For madd version + const spat_fil_accum_dtype i32_filter_coeffs[11] = { + -900 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), + -1239 + (spat_fil_accum_dtype)(((unsigned int)-1452) << 16) + (1 << 16), + -1669 + (spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), + -1547 + (spat_fil_accum_dtype)(((unsigned int)-66) << 16) + (1 << 16), + 4677 + (14498 << 16) /* + (1 << 16) */, + 21495 + (14498 << 16) /* + (1 << 16) */, + 4677 + (spat_fil_accum_dtype)(((unsigned int)-66) << 16) /* + (1 << 16) */, + -1547 + (spat_fil_accum_dtype)(((unsigned int)-1798) << 16) + (1 << 16), + -1669 + (spat_fil_accum_dtype)(((unsigned int)-1452) << 16) + (1 << 16), + -1239 + (spat_fil_accum_dtype)(((unsigned int)-1054) << 16) + (1 << 16), + -900 + (1 << 16) + }; + + int src_px_stride = width; + int dst_px_stride = width; + int width_rem_size128 = width - (width % 128); + int width_rem_size64 = width - (width % 64); + int width_rem_size32 = width - (width % 32); + int width_rem_size16 = width - (width % 16); + int width_rem_size8 = width - (width % 8); + + spat_fil_inter_dtype *tmp = aligned_malloc(ALIGN_CEIL(src_px_stride * sizeof(spat_fil_inter_dtype)), MAX_ALIGN); + + // spat_fil_inter_dtype imgcoeff; + uint8_t *src_8b = NULL; + uint16_t *src_hbd = NULL; + + int interim_rnd = 0, interim_shift = 0; + + int 
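+    /* Editor's note (illustrative sketch, not part of the original patch):
+     * each i32_filter_coeffs[k] entry above packs the two adjacent 16-bit taps
+     * c[2k] and c[2k+1] (the final entry pairs c[20] with 0) into one 32-bit
+     * word so that _mm512_madd_epi16 can apply both taps per 32-bit lane,
+     * computing p0*c[2k] + p1*c[2k+1]. The intended bit pattern is
+     *
+     *     (uint32_t)(uint16_t)c[2k] | ((uint32_t)(uint16_t)c[2k+1] << 16)
+     *
+     * but the table builds it with ordinary addition, and when the low tap is
+     * negative its sign extension borrows 1 from the upper halfword; the
+     * "+ (1 << 16)" terms compensate for that borrow, while the entries whose
+     * low tap is positive (4677, 21495) leave the correction commented out.
+     */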
i, j, fi, ii, ii1, ii2; + //unsigned int i, j, fi, ii, ii1, ii2; + // int fj, jj, jj1, jj; + // spat_fil_coeff_dtype *coeff_ptr; + int fwidth = 21; + int half_fw = fwidth / 2; + + if(8 == bitdepth) + { + src_8b = (uint8_t*)src; + src_hbd = NULL; + interim_rnd = SPAT_FILTER_INTER_RND; + interim_shift = SPAT_FILTER_INTER_SHIFT; + } + else // HBD case + { + src_8b = NULL; + src_hbd = (uint16_t*)src; + interim_shift = SPAT_FILTER_INTER_SHIFT + (bitdepth - 8); + interim_rnd = (1 << (interim_shift - 1)); + } + + __m512i res0_512, res4_512, res8_512, res12_512; + __m512i res0_64_512, res4_64_512, res8_64_512, res12_64_512; + __m512i zero_512 = _mm512_setzero_si512(); + __m256i res0_256, res4_256, res8_256, res12_256; + __m256i zero_256 = _mm256_setzero_si256(); + __m128i res0_128, res4_128, res8_128, res12_128; + __m128i zero_128 = _mm_setzero_si128(); + + __m512i perm0 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); + __m512i perm32 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + + /** + * The loop i=0 to height is split into 3 parts + * This is to avoid the if conditions used for virtual padding + */ + for (i = 0; i < half_fw; i++){ + + int diff_i_halffw = i - half_fw; + int pro_mir_end = -diff_i_halffw - 1; + + /* Vertical pass. */ + j = 0; + if(8 == bitdepth) + { + for (; j < width_rem_size128; j+= 128) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + res0_64_512 = res4_64_512 = res8_64_512 = res12_64_512 = _mm512_set1_epi32(interim_rnd); + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i d64 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j + 64)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d64_lo = _mm512_unpacklo_epi8(d64, zero_512); + __m512i d64_hi = _mm512_unpackhi_epi8(d64, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + __m512i mul2_lo_512 = _mm512_mullo_epi16(d64_lo, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d64_lo, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d64_hi, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d64_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, 
res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i d64 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j + 64)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d64_lo = _mm512_unpacklo_epi8(d64, zero_512); + __m512i d64_hi = _mm512_unpackhi_epi8(d64, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + __m512i mul2_lo_512 = _mm512_mullo_epi16(d64_lo, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d64_lo, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d64_hi, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d64_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_64_512 = _mm512_srai_epi32(res0_64_512, interim_shift); + res4_64_512 = _mm512_srai_epi32(res4_64_512, interim_shift); + res8_64_512 = _mm512_srai_epi32(res8_64_512, interim_shift); + res12_64_512 = _mm512_srai_epi32(res12_64_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + res0_64_512 = _mm512_packs_epi32(res0_64_512, res4_64_512); + res8_64_512 = _mm512_packs_epi32(res8_64_512, res12_64_512); + + __m512i r0 = _mm512_permutex2var_epi64(res0_512, perm0, res8_512); + __m512i r16 = _mm512_permutex2var_epi64(res0_512, perm32, res8_512); + __m512i r32 = _mm512_permutex2var_epi64(res0_64_512, perm0, res8_64_512); + __m512i r48 = _mm512_permutex2var_epi64(res0_64_512, perm32, res8_64_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), r0); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), r16); + _mm512_storeu_si512((__m512i*)(tmp + j + 64), r32); + _mm512_storeu_si512((__m512i*)(tmp + j + 96), r48); + } + + for (; j < width_rem_size64; j+=64) + { + res0_512 = res4_512 = res8_512 = res12_512 = 
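+            /* Editor's note (illustrative sketch, not part of the original patch):
+             * the mullo/mulhi + unpack pattern used throughout these loops is the
+             * standard 16x16 -> 32 bit widening multiply: for each 16-bit element,
+             *
+             *     (int32_t)px * coef == ((int32_t)mulhi_epi16(px, coef) << 16)
+             *                           | (uint16_t)mullo_epi16(px, coef)
+             *
+             * and the unpacklo_epi16 / unpackhi_epi16 steps interleave the low and
+             * high halves back into full 32-bit products (within each 128-bit
+             * lane) so they can be accumulated with the epi32 adds.
+             */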
_mm512_set1_epi32(interim_rnd); + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + __m512i r0 = _mm512_permutex2var_epi64(res0_512, perm0, res8_512); + __m512i r16 = _mm512_permutex2var_epi64(res0_512, perm32, res8_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), r0); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), r16); + } + + for (; j < width_rem_size32; j+=32) + { + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = 
_mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo_256 = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi_256 = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo_256 = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi_256 = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo_256, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi_256, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo_256, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi_256, res12_256); + } + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo_256 = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi_256 = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo_256 = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi_256 = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo_256, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi_256, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo_256, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi_256, res12_256); + } + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + + _mm256_store_si256((__m256i*)(tmp + j), r0); + _mm256_store_si256((__m256i*)(tmp + j + 16), r8); + } + + for (; j < width_rem_size16; j+=16) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + + ii = pro_mir_end - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = 
_mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + + } + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); + + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); + _mm256_store_si256((__m256i*)(tmp + j), res); + } + + for (; j < width_rem_size8; j+=8) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + + ii = pro_mir_end - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m128i d0_lo = 
_mm_unpacklo_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); + } + + for (; j < width; j++) + { + spat_fil_accum_dtype accum = 0; + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++) + { + ii = pro_mir_end - fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_8b[ii * src_px_stride + j]; + } + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_8b[ii * src_px_stride + j]; + } + tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); + } + } + else + { + for (; j < width_rem_size128; j+=128) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + res0_64_512 = res4_64_512 = res8_64_512 = res12_64_512 = _mm512_set1_epi32(interim_rnd); + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 64)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 96)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + __m512i mul2_lo_512 = _mm512_mullo_epi16(d2, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d2, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d3, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d3, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = 
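+            /* Editor's note (illustrative sketch, not part of the original patch):
+             * in this top-border section the taps that would read rows above the
+             * frame are mirrored instead of being tested per pixel. With
+             * pro_mir_end = half_fw - i - 1, the mirrored row index is
+             *
+             *     ii = pro_mir_end - fi;    // == -(i - half_fw + fi) - 1
+             *
+             * i.e. virtual rows -1, -2, ... are read from rows 0, 1, ... (the top
+             * row is repeated). Splitting the i loop into a top-border section, a
+             * core section and a matching bottom-border section is what lets the
+             * core loop run without any padding branches.
+             */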
_mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 64)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 96)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + __m512i mul2_lo_512 = _mm512_mullo_epi16(d2, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d2, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d3, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d3, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + res0_64_512 = _mm512_srai_epi32(res0_64_512, interim_shift); + res4_64_512 = _mm512_srai_epi32(res4_64_512, interim_shift); + res8_64_512 = _mm512_srai_epi32(res8_64_512, interim_shift); + res12_64_512 = _mm512_srai_epi32(res12_64_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + res0_64_512 = _mm512_packs_epi32(res0_64_512, res4_64_512); + res8_64_512 = _mm512_packs_epi32(res8_64_512, res12_64_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), res8_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 64), res0_64_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 96), res8_64_512); + } + + for (; j < width_rem_size64; j+=64) + { + res0_512 = res4_512 = res8_512 = res12_512 
= _mm512_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), res8_512); + } + + for (; j < width_rem_size32; j+=32) + { + res0_512 = res4_512 = _mm512_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i 
coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + } + + for (; j < width_rem_size16; j+=16) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + + ii = pro_mir_end - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d1, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d1, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d1, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d1, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = 
_mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); + + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); + _mm256_store_si256((__m256i*)(tmp + j), res); + } + + for (; j < width_rem_size8; j+=8) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + ii = pro_mir_end - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); + } + + for (; j < width; j++) + { + + spat_fil_accum_dtype accum = 0; + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi <= pro_mir_end; fi++){ + + ii = pro_mir_end - fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_hbd[ii * src_px_stride + j]; + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = diff_i_halffw + fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_hbd[ii * src_px_stride + j]; + } + tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); + } + } + + /* Horizontal pass. 
common for 8bit and hbd cases */ + integer_horizontal_filter_avx512(tmp, dst, i_filter_coeffs, width, fwidth, i*dst_px_stride, half_fw); + } + //This is the core loop + for ( ; i < (height - half_fw); i++){ + + int f_l_i = i - half_fw; + int f_r_i = i + half_fw; + /* Vertical pass. */ + j = 0; + if(8 == bitdepth) + { + for (; j < width_rem_size128; j+=128) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + res0_64_512 = res4_64_512 = res8_64_512 = res12_64_512 = _mm512_set1_epi32(interim_rnd); + + for (fi = 0; fi < (half_fw); fi+=2) + { + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii1 * src_px_stride + j)); + __m512i d20 = _mm512_loadu_si512((__m512i*)(src_8b + ii2 * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_8b + (ii1 + 1) * src_px_stride + j)); + __m512i d19 = _mm512_loadu_si512((__m512i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + + __m512i d0_64 = _mm512_loadu_si512((__m512i*)(src_8b + ii1 * src_px_stride + j + 64)); + __m512i d20_64 = _mm512_loadu_si512((__m512i*)(src_8b + ii2 * src_px_stride + j + 64)); + __m512i d1_64 = _mm512_loadu_si512((__m512i*)(src_8b + (ii1 + 1) * src_px_stride + j + 64)); + __m512i d19_64 = _mm512_loadu_si512((__m512i*)(src_8b + (ii2 - 1) * src_px_stride + j + 64)); + + __m512i f0_1 = _mm512_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d20_lo = _mm512_unpacklo_epi8(d20, zero_512); + __m512i d20_hi = _mm512_unpackhi_epi8(d20, zero_512); + __m512i d1_lo = _mm512_unpacklo_epi8(d1, zero_512); + __m512i d1_hi = _mm512_unpackhi_epi8(d1, zero_512); + __m512i d19_lo = _mm512_unpacklo_epi8(d19, zero_512); + __m512i d19_hi = _mm512_unpackhi_epi8(d19, zero_512); + + __m512i d0_64_lo = _mm512_unpacklo_epi8(d0_64, zero_512); + __m512i d0_64_hi = _mm512_unpackhi_epi8(d0_64, zero_512); + __m512i d20_64_lo = _mm512_unpacklo_epi8(d20_64, zero_512); + __m512i d20_64_hi = _mm512_unpackhi_epi8(d20_64, zero_512); + __m512i d1_64_lo = _mm512_unpacklo_epi8(d1_64, zero_512); + __m512i d1_64_hi = _mm512_unpackhi_epi8(d1_64, zero_512); + __m512i d19_64_lo = _mm512_unpacklo_epi8(d19_64, zero_512); + __m512i d19_64_hi = _mm512_unpackhi_epi8(d19_64, zero_512); + + d0_lo = _mm512_add_epi16(d0_lo, d20_lo); + d0_hi = _mm512_add_epi16(d0_hi, d20_hi); + d1_lo = _mm512_add_epi16(d1_lo, d19_lo); + d1_hi = _mm512_add_epi16(d1_hi, d19_hi); + + d0_64_lo = _mm512_add_epi16(d0_64_lo, d20_64_lo); + d0_64_hi = _mm512_add_epi16(d0_64_hi, d20_64_hi); + d1_64_lo = _mm512_add_epi16(d1_64_lo, d19_64_lo); + d1_64_hi = _mm512_add_epi16(d1_64_hi, d19_64_hi); + + __m512i l0_20_1_19_0 = _mm512_unpacklo_epi16(d0_lo, d1_lo); + __m512i l0_20_1_19_4 = _mm512_unpackhi_epi16(d0_lo, d1_lo); + __m512i l0_20_1_19_8 = _mm512_unpacklo_epi16(d0_hi, d1_hi); + __m512i l0_20_1_19_12 = _mm512_unpackhi_epi16(d0_hi, d1_hi); + + __m512i l0_20_1_19_0_64 = _mm512_unpacklo_epi16(d0_64_lo, d1_64_lo); + __m512i l0_20_1_19_4_64 = _mm512_unpackhi_epi16(d0_64_lo, d1_64_lo); + __m512i l0_20_1_19_8_64 = _mm512_unpacklo_epi16(d0_64_hi, d1_64_hi); + __m512i l0_20_1_19_12_64 = _mm512_unpackhi_epi16(d0_64_hi, d1_64_hi); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(l0_20_1_19_0, f0_1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(l0_20_1_19_4, f0_1)); + res8_512 = _mm512_add_epi32(res8_512, _mm512_madd_epi16(l0_20_1_19_8, f0_1)); + res12_512 = _mm512_add_epi32(res12_512, 
_mm512_madd_epi16(l0_20_1_19_12, f0_1)); + + res0_64_512 = _mm512_add_epi32(res0_64_512, _mm512_madd_epi16(l0_20_1_19_0_64, f0_1)); + res4_64_512 = _mm512_add_epi32(res4_64_512, _mm512_madd_epi16(l0_20_1_19_4_64, f0_1)); + res8_64_512 = _mm512_add_epi32(res8_64_512, _mm512_madd_epi16(l0_20_1_19_8_64, f0_1)); + res12_64_512 = _mm512_add_epi32(res12_64_512, _mm512_madd_epi16(l0_20_1_19_12_64, f0_1)); + } + + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + i * src_px_stride + j)); + __m512i d0_64 = _mm512_loadu_si512((__m512i*)(src_8b + i * src_px_stride + j + 64)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d0_64_lo = _mm512_unpacklo_epi8(d0_64, zero_512); + __m512i d0_64_hi = _mm512_unpackhi_epi8(d0_64, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + __m512i mul0_64_lo_512 = _mm512_mullo_epi16(d0_64_lo, coef); + __m512i mul0_64_hi_512 = _mm512_mulhi_epi16(d0_64_lo, coef); + __m512i mul1_64_lo_512 = _mm512_mullo_epi16(d0_64_hi, coef); + __m512i mul1_64_hi_512 = _mm512_mulhi_epi16(d0_64_hi, coef); + + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + __m512i tmp0_64_lo_512 = _mm512_unpacklo_epi16(mul0_64_lo_512, mul0_64_hi_512); + __m512i tmp0_64_hi_512 = _mm512_unpackhi_epi16(mul0_64_lo_512, mul0_64_hi_512); + __m512i tmp1_64_lo_512 = _mm512_unpacklo_epi16(mul1_64_lo_512, mul1_64_hi_512); + __m512i tmp1_64_hi_512 = _mm512_unpackhi_epi16(mul1_64_lo_512, mul1_64_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + + res0_64_512 = _mm512_add_epi32(tmp0_64_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp0_64_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp1_64_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp1_64_hi_512, res12_64_512); + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_64_512 = _mm512_srai_epi32(res0_64_512, interim_shift); + res4_64_512 = _mm512_srai_epi32(res4_64_512, interim_shift); + res8_64_512 = _mm512_srai_epi32(res8_64_512, interim_shift); + res12_64_512 = _mm512_srai_epi32(res12_64_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + res0_64_512 = _mm512_packs_epi32(res0_64_512, res4_64_512); + res8_64_512 = _mm512_packs_epi32(res8_64_512, res12_64_512); + + __m512i r0 = _mm512_permutex2var_epi64(res0_512, perm0, res8_512); + __m512i r16 = _mm512_permutex2var_epi64(res0_512, perm32, res8_512); + __m512i r32 = _mm512_permutex2var_epi64(res0_64_512, perm0, res8_64_512); + __m512i r48 = _mm512_permutex2var_epi64(res0_64_512, perm32, res8_64_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), r0); + _mm512_storeu_si512((__m512i*)(tmp + j 
+ 32), r16); + _mm512_storeu_si512((__m512i*)(tmp + j + 64), r32); + _mm512_storeu_si512((__m512i*)(tmp + j + 96), r48); + } + + for (; j < width_rem_size64; j+=64) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + + for (fi = 0; fi < (half_fw); fi+=2){ + + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii1 * src_px_stride + j)); + __m512i d20 = _mm512_loadu_si512((__m512i*)(src_8b + ii2 * src_px_stride + j)); + + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_8b + (ii1 + 1) * src_px_stride + j)); + __m512i d19 = _mm512_loadu_si512((__m512i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m512i f0_1 = _mm512_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d20_lo = _mm512_unpacklo_epi8(d20, zero_512); + __m512i d20_hi = _mm512_unpackhi_epi8(d20, zero_512); + + __m512i d1_lo = _mm512_unpacklo_epi8(d1, zero_512); + __m512i d1_hi = _mm512_unpackhi_epi8(d1, zero_512); + __m512i d19_lo = _mm512_unpacklo_epi8(d19, zero_512); + __m512i d19_hi = _mm512_unpackhi_epi8(d19, zero_512); + + d0_lo = _mm512_add_epi16(d0_lo, d20_lo); + d0_hi = _mm512_add_epi16(d0_hi, d20_hi); + d1_lo = _mm512_add_epi16(d1_lo, d19_lo); + d1_hi = _mm512_add_epi16(d1_hi, d19_hi); + + __m512i l0_20_1_19_0 = _mm512_unpacklo_epi16(d0_lo, d1_lo); + __m512i l0_20_1_19_4 = _mm512_unpackhi_epi16(d0_lo, d1_lo); + __m512i l0_20_1_19_8 = _mm512_unpacklo_epi16(d0_hi, d1_hi); + __m512i l0_20_1_19_12 = _mm512_unpackhi_epi16(d0_hi, d1_hi); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(l0_20_1_19_0, f0_1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(l0_20_1_19_4, f0_1)); + res8_512 = _mm512_add_epi32(res8_512, _mm512_madd_epi16(l0_20_1_19_8, f0_1)); + res12_512 = _mm512_add_epi32(res12_512, _mm512_madd_epi16(l0_20_1_19_12, f0_1)); + } + + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + i * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + __m512i r0 = _mm512_permutex2var_epi64(res0_512, perm0, res8_512); + __m512i r16 = _mm512_permutex2var_epi64(res0_512, perm32, res8_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), r0); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), r16); + } + + for (; j < width_rem_size32; j+=32) + 
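+            /* Editor's note (illustrative sketch, not part of the original patch):
+             * unpacklo/unpackhi_epi8, the epi16 widening unpacks and
+             * _mm512_packs_epi32 all work within 128-bit lanes, so in the 8-bit
+             * paths the packed 16-bit results come out lane-interleaved. The two
+             * _mm512_permutex2var_epi64 shuffles with
+             *
+             *     perm0  = {0, 1, 8, 9, 2, 3, 10, 11}    (low -> high qwords)
+             *     perm32 = {4, 5, 12, 13, 6, 7, 14, 15}
+             *
+             * gather the 64-bit quarters of the two packed vectors back into
+             * linear pixel order before the 32-sample stores; the
+             * _mm256_permute2x128_si256 pair plays the same role in the 256-bit
+             * loops. The high-bit-depth paths skip this step because, without the
+             * epi8 unpack, the epi16 unpack and the epi32 pack cancel out and the
+             * results are already in order.
+             */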
{ + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2){ + + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii1 * src_px_stride + j)); + __m256i d20 = _mm256_loadu_si256((__m256i*)(src_8b + ii2 * src_px_stride + j)); + + __m256i d1 = _mm256_loadu_si256((__m256i*)(src_8b + (ii1 + 1) * src_px_stride + j)); + __m256i d19 = _mm256_loadu_si256((__m256i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m256i f0_1 = _mm256_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d20_lo = _mm256_unpacklo_epi8(d20, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + __m256i d20_hi = _mm256_unpackhi_epi8(d20, zero_256); + + __m256i d1_lo = _mm256_unpacklo_epi8(d1, zero_256); + __m256i d19_lo = _mm256_unpacklo_epi8(d19, zero_256); + __m256i d1_hi = _mm256_unpackhi_epi8(d1, zero_256); + __m256i d19_hi = _mm256_unpackhi_epi8(d19, zero_256); + + d0_lo = _mm256_add_epi16(d0_lo, d20_lo); + d1_lo = _mm256_add_epi16(d1_lo, d19_lo); + d0_hi = _mm256_add_epi16(d0_hi, d20_hi); + d1_hi = _mm256_add_epi16(d1_hi, d19_hi); + + __m256i l0_20_1_19_0 = _mm256_unpacklo_epi16(d0_lo, d1_lo); + __m256i l0_20_1_19_4 = _mm256_unpackhi_epi16(d0_lo, d1_lo); + __m256i l0_20_1_19_8 = _mm256_unpacklo_epi16(d0_hi, d1_hi); + __m256i l0_20_1_19_12 = _mm256_unpackhi_epi16(d0_hi, d1_hi); + + res0_256 = _mm256_add_epi32(res0_256, _mm256_madd_epi16(l0_20_1_19_0, f0_1)); + res4_256 = _mm256_add_epi32(res4_256, _mm256_madd_epi16(l0_20_1_19_4, f0_1)); + res8_256 = _mm256_add_epi32(res8_256, _mm256_madd_epi16(l0_20_1_19_8, f0_1)); + res12_256 = _mm256_add_epi32(res12_256, _mm256_madd_epi16(l0_20_1_19_12, f0_1)); + } + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + i * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo, mul0_hi); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + _mm256_storeu_si256((__m256i*)(tmp + j), r0); + _mm256_storeu_si256((__m256i*)(tmp + j + 16), r8); + } + + for (; j < width_rem_size16; j+=16) + { + 
res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2){ + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii1 * src_px_stride + j)); + __m128i d20 = _mm_loadu_si128((__m128i*)(src_8b + ii2 * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_8b + (ii1 + 1) * src_px_stride + j)); + __m128i d19 = _mm_loadu_si128((__m128i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + __m128i d20_lo = _mm_unpacklo_epi8(d20, zero_128); + __m128i d20_hi = _mm_unpackhi_epi8(d20, zero_128); + + __m128i d1_lo = _mm_unpacklo_epi8(d1, zero_128); + __m128i d1_hi = _mm_unpackhi_epi8(d1, zero_128); + __m128i d19_lo = _mm_unpacklo_epi8(d19, zero_128); + __m128i d19_hi = _mm_unpackhi_epi8(d19, zero_128); + + d0_lo = _mm_add_epi16(d0_lo, d20_lo); + d0_hi = _mm_add_epi16(d0_hi, d20_hi); + d1_lo = _mm_add_epi16(d1_lo, d19_lo); + d1_hi = _mm_add_epi16(d1_hi, d19_hi); + + __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0_lo, d1_lo); + __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0_lo, d1_lo); + __m128i l0_20_1_19_8 = _mm_unpacklo_epi16(d0_hi, d1_hi); + __m128i l0_20_1_19_12 = _mm_unpackhi_epi16(d0_hi, d1_hi); + + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + res8_128 = _mm_add_epi32(res8_128, _mm_madd_epi16(l0_20_1_19_8, f0_1)); + res12_128 = _mm_add_epi32(res12_128, _mm_madd_epi16(l0_20_1_19_12, f0_1)); + } + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + i * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); + + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); + _mm256_store_si256((__m256i*)(tmp + j), res); + } + + for (; j < width_rem_size8; j+=8) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be 
same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2){ + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii1 * src_px_stride + j)); + __m128i d20 = _mm_loadu_si128((__m128i*)(src_8b + ii2 * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_8b + (ii1 + 1) * src_px_stride + j)); + __m128i d19 = _mm_loadu_si128((__m128i*)(src_8b + (ii2 - 1) * src_px_stride + j)); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d20_lo = _mm_unpacklo_epi8(d20, zero_128); + __m128i d1_lo = _mm_unpacklo_epi8(d1, zero_128); + __m128i d19_lo = _mm_unpacklo_epi8(d19, zero_128); + + d0_lo = _mm_add_epi16(d0_lo, d20_lo); + d1_lo = _mm_add_epi16(d1_lo, d19_lo); + + __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0_lo, d1_lo); + __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0_lo, d1_lo); + + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + } + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + i * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + + _mm_store_si128((__m128i*)(tmp + j), res0_128); + } + + for (; j < width; j++) + { + spat_fil_accum_dtype accum = 0; + for (fi = 0; fi < (half_fw); fi++){ + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + accum += i_filter_coeffs[fi] * ((spat_fil_inter_dtype)src_8b[ii1 * src_px_stride + j] + src_8b[ii2 * src_px_stride + j]); + } + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_8b[i * src_px_stride + j]; + tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); + } + } + else + { + for (; j < width_rem_size64; j+=64) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2) + { + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii1 * src_px_stride + j)); + __m512i d20 = _mm512_loadu_si512((__m512i*)(src_hbd + ii2 * src_px_stride + j)); + __m512i d0_32 = _mm512_loadu_si512((__m512i*)(src_hbd + ii1 * src_px_stride + j + 32)); + __m512i d20_32 = _mm512_loadu_si512((__m512i*)(src_hbd + ii2 * src_px_stride + j + 32)); + + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); + __m512i d19 = _mm512_loadu_si512((__m512i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); + __m512i d1_32 = _mm512_loadu_si512((__m512i*)(src_hbd + (ii1 + 1) * src_px_stride + j + 32)); + __m512i d19_32 = _mm512_loadu_si512((__m512i*)(src_hbd + (ii2 - 1) * src_px_stride + j + 32)); + + __m512i f0_1 = 
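+                /* Editor's note (illustrative sketch, not part of the original patch):
+                 * because the kernel is symmetric (c[k] == c[fwidth - 1 - k]), each
+                 * output of this vertical pass can be written as
+                 *
+                 *     out = sum_{k = 0 .. half_fw-1} c[k] * (row[i - half_fw + k]
+                 *                                            + row[i + half_fw - k])
+                 *           + c[half_fw] * row[i]
+                 *
+                 * exactly as in the scalar fallback above: the two mirrored rows are
+                 * added first, _mm512_madd_epi16 then consumes the (k, k+1)
+                 * coefficient pairs packed in i32_filter_coeffs[k / 2], and the
+                 * centre tap c[half_fw] is applied separately after the loop.
+                 */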
_mm512_set1_epi32(i32_filter_coeffs[fi / 2]); + + d0 = _mm512_add_epi16(d0, d20); + d0_32 = _mm512_add_epi16(d0_32, d20_32); + d1 = _mm512_add_epi16(d1, d19); + d1_32 = _mm512_add_epi16(d1_32, d19_32); + + __m512i l0_20_1_19_0 = _mm512_unpacklo_epi16(d0, d1); + __m512i l0_20_1_19_4 = _mm512_unpackhi_epi16(d0, d1); + __m512i l0_20_1_19_16 = _mm512_unpacklo_epi16(d0_32, d1_32); + __m512i l0_20_1_19_20 = _mm512_unpackhi_epi16(d0_32, d1_32); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(l0_20_1_19_0, f0_1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(l0_20_1_19_4, f0_1)); + res8_512 = _mm512_add_epi32(res8_512, _mm512_madd_epi16(l0_20_1_19_16, f0_1)); + res12_512 = _mm512_add_epi32(res12_512, _mm512_madd_epi16(l0_20_1_19_20, f0_1)); + } + + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + i * src_px_stride + j)); + __m512i d0_32 = _mm512_loadu_si512((__m512i*)(src_hbd + i * src_px_stride + j + 32)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_32, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_32, coef); + + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), res8_512); + } + + for (; j < width_rem_size32; j+=32) + { + res0_512 = res4_512 = _mm512_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2) + { + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii1 * src_px_stride + j)); + __m512i d20 = _mm512_loadu_si512((__m512i*)(src_hbd + ii2 * src_px_stride + j)); + + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); + __m512i d19 = _mm512_loadu_si512((__m512i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); + __m512i f0_1 = _mm512_set1_epi32(i32_filter_coeffs[fi / 2]); + + d0 = _mm512_add_epi16(d0, d20); + d1 = _mm512_add_epi16(d1, d19); + + __m512i l0_20_1_19_0 = _mm512_unpacklo_epi16(d0, d1); + __m512i l0_20_1_19_4 = _mm512_unpackhi_epi16(d0, d1); + + res0_512 = _mm512_add_epi32(res0_512, _mm512_madd_epi16(l0_20_1_19_0, f0_1)); + res4_512 = _mm512_add_epi32(res4_512, _mm512_madd_epi16(l0_20_1_19_4, f0_1)); + } + + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + i * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + __m512i mul0_lo_512 = 
_mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + } + + for (; j < width_rem_size16; j+=16){ + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2){ + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii1 * src_px_stride + j)); + __m128i d20 = _mm_loadu_si128((__m128i*)(src_hbd + ii2 * src_px_stride + j)); + __m128i d0_8 = _mm_loadu_si128((__m128i*)(src_hbd + ii1 * src_px_stride + j + 8)); + __m128i d20_8 = _mm_loadu_si128((__m128i*)(src_hbd + ii2 * src_px_stride + j + 8)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); + __m128i d19 = _mm_loadu_si128((__m128i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); + __m128i d1_8 = _mm_loadu_si128((__m128i*)(src_hbd + (ii1 + 1) * src_px_stride + j + 8)); + __m128i d19_8 = _mm_loadu_si128((__m128i*)(src_hbd + (ii2 - 1) * src_px_stride + j + 8)); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); + + d0 = _mm_add_epi16(d0, d20); + d0_8 = _mm_add_epi16(d0_8, d20_8); + d1 = _mm_add_epi16(d1, d19); + d1_8 = _mm_add_epi16(d1_8, d19_8); + + __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0, d1); + __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0, d1); + __m128i l0_20_1_19_8 = _mm_unpacklo_epi16(d0_8, d1_8); + __m128i l0_20_1_19_16 = _mm_unpackhi_epi16(d0_8, d1_8); + + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + res8_128 = _mm_add_epi32(res8_128, _mm_madd_epi16(l0_20_1_19_8, f0_1)); + res12_128 = _mm_add_epi32(res12_128, _mm_madd_epi16(l0_20_1_19_16, f0_1)); + } + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + i * src_px_stride + j)); + __m128i d0_16 = _mm_loadu_si128((__m128i*)(src_hbd + i * src_px_stride + j + 8)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_16, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_16, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, 
interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); + + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); + _mm256_store_si256((__m256i*)(tmp + j), res); + } + + for (; j < width_rem_size8; j+=8){ + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi+=2){ + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii1 * src_px_stride + j)); + __m128i d20 = _mm_loadu_si128((__m128i*)(src_hbd + ii2 * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + (ii1 + 1) * src_px_stride + j)); + __m128i d19 = _mm_loadu_si128((__m128i*)(src_hbd + (ii2 - 1) * src_px_stride + j)); + __m128i f0_1 = _mm_set1_epi32(i32_filter_coeffs[fi / 2]); + + d0 = _mm_add_epi16(d0, d20); + d1 = _mm_add_epi16(d1, d19); + + __m128i l0_20_1_19_0 = _mm_unpacklo_epi16(d0, d1); + __m128i l0_20_1_19_4 = _mm_unpackhi_epi16(d0, d1); + + res0_128 = _mm_add_epi32(res0_128, _mm_madd_epi16(l0_20_1_19_0, f0_1)); + res4_128 = _mm_add_epi32(res4_128, _mm_madd_epi16(l0_20_1_19_4, f0_1)); + } + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + i * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res0_128 = _mm_packs_epi32(res0_128, res4_128); + + _mm_store_si128((__m128i*)(tmp + j), res0_128); + } + + for (; j < width; j++){ + + spat_fil_accum_dtype accum = 0; + + /** + * The filter coefficients are symmetric, + * hence the corresponding pixels for whom coefficient values would be same are added first & then multiplied by coeff + * The centre pixel is multiplied and accumulated outside the loop + */ + for (fi = 0; fi < (half_fw); fi++){ + ii1 = f_l_i + fi; + ii2 = f_r_i - fi; + accum += i_filter_coeffs[fi] * ((spat_fil_inter_dtype)src_hbd[ii1 * src_px_stride + j] + src_hbd[ii2 * src_px_stride + j]); + } + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_hbd[i * src_px_stride + j]; + tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); + } + } + + /* Horizontal pass. common for 8bit and hbd cases */ + integer_horizontal_filter_avx512(tmp, dst, i_filter_coeffs, width, fwidth, i*dst_px_stride, half_fw); + } + /** + * This loop is to handle virtual padding of the bottom border pixels + */ + for (; i < height; i++){ + + int diff_i_halffw = i - half_fw; + int epi_mir_i = 2 * height - diff_i_halffw - 1; + int epi_last_i = height - diff_i_halffw; + j = 0; + /* Vertical pass. 
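Editorial note, not part of the patch: the SIMD blocks above and below all vectorize the same scalar recurrence, so a compact reference helps when checking the index math. The sketch below is hypothetical (the function name and the simplified types are mine); coef, fwidth, interim_rnd and interim_shift stand for i_filter_coeffs, the filter width and the rounding constants used in this file, and it shows the bottom-border case handled in this loop, where tap rows past the last image line are mirrored back into the image.

// Hypothetical scalar reference for one bottom-border output sample of the
// vertical pass (types simplified; src stands for src_8b or src_hbd).
static int vertical_tap_ref(const unsigned char *src, int stride,
                            const short *coef, int fwidth, int height,
                            int i, int j, int interim_rnd, int interim_shift)
{
    int half_fw = fwidth / 2;
    int accum = 0, fi;
    int last = height - (i - half_fw);   // epi_last_i: taps still inside the image
    for (fi = 0; fi < last; fi++)        // normal taps: ii = i - half_fw + fi
        accum += coef[fi] * src[(i - half_fw + fi) * stride + j];
    for (; fi < fwidth; fi++)            // mirrored taps: ii = 2*height - (i - half_fw + fi) - 1
        accum += coef[fi] * src[(2 * height - (i - half_fw + fi) - 1) * stride + j];
    return (accum + interim_rnd) >> interim_shift;
}

For interior rows the code above instead folds the two taps that share a coefficient, accum += coef[fi] * (top + bottom), and the SIMD paths pack two adjacent folded taps into one 32-bit entry of i32_filter_coeffs so that a single madd_epi16 accumulates two source rows per iteration.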
*/ + + if(8 == bitdepth) + { + for (; j < width_rem_size128; j+=128) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + res0_64_512 = res4_64_512 = res8_64_512 = res12_64_512 = _mm512_set1_epi32(interim_rnd); + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i d64 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j + 64)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d64_lo = _mm512_unpacklo_epi8(d64, zero_512); + __m512i d64_hi = _mm512_unpackhi_epi8(d64, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + __m512i mul2_lo_512 = _mm512_mullo_epi16(d64_lo, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d64_lo, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d64_hi, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d64_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i d64 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j + 64)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + __m512i d64_lo = _mm512_unpacklo_epi8(d64, zero_512); + __m512i d64_hi = _mm512_unpackhi_epi8(d64, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + __m512i mul2_lo_512 = _mm512_mullo_epi16(d64_lo, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d64_lo, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d64_hi, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d64_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi = 
_mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi, res12_512); + + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_64_512 = _mm512_srai_epi32(res0_64_512, interim_shift); + res4_64_512 = _mm512_srai_epi32(res4_64_512, interim_shift); + res8_64_512 = _mm512_srai_epi32(res8_64_512, interim_shift); + res12_64_512 = _mm512_srai_epi32(res12_64_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + res0_64_512 = _mm512_packs_epi32(res0_64_512, res4_64_512); + res8_64_512 = _mm512_packs_epi32(res8_64_512, res12_64_512); + + __m512i r0 = _mm512_permutex2var_epi64(res0_512, perm0, res8_512); + __m512i r16 = _mm512_permutex2var_epi64(res0_512, perm32, res8_512); + __m512i r32 = _mm512_permutex2var_epi64(res0_64_512, perm0, res8_64_512); + __m512i r48 = _mm512_permutex2var_epi64(res0_64_512, perm32, res8_64_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), r0); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), r16); + _mm512_storeu_si512((__m512i*)(tmp + j + 64), r32); + _mm512_storeu_si512((__m512i*)(tmp + j + 96), r48); + } + + for (; j < width_rem_size64; j+=64) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < 
fwidth; fi++) + { + ii = epi_mir_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_8b + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i d0_lo = _mm512_unpacklo_epi8(d0, zero_512); + __m512i d0_hi = _mm512_unpackhi_epi8(d0, zero_512); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0_lo, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0_lo, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d0_hi, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi, res12_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + __m512i r0 = _mm512_permutex2var_epi64(res0_512, perm0, res8_512); + __m512i r16 = _mm512_permutex2var_epi64(res0_512, perm32, res8_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), r0); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), r16); + + } + + for (; j < width_rem_size32; j+=32) + { + res0_256 = res4_256 = res8_256 = res12_256 = _mm256_set1_epi32(interim_rnd); + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo_256 = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi_256 = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo_256 = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi_256 = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = _mm256_unpacklo_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo_256, mul0_hi_256); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo_256, mul1_hi_256); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo_256, mul1_hi_256); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m256i d0 = _mm256_loadu_si256((__m256i*)(src_8b + ii * src_px_stride + j)); + __m256i coef = _mm256_set1_epi16(i_filter_coeffs[fi]); + + __m256i d0_lo = _mm256_unpacklo_epi8(d0, zero_256); + __m256i d0_hi = _mm256_unpackhi_epi8(d0, zero_256); + + __m256i mul0_lo = _mm256_mullo_epi16(d0_lo, coef); + __m256i mul0_hi = _mm256_mulhi_epi16(d0_lo, coef); + __m256i mul1_lo = _mm256_mullo_epi16(d0_hi, coef); + __m256i mul1_hi = _mm256_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m256i tmp0_lo = 
_mm256_unpacklo_epi16(mul0_lo, mul0_hi); + __m256i tmp0_hi = _mm256_unpackhi_epi16(mul0_lo, mul0_hi); + __m256i tmp1_lo = _mm256_unpacklo_epi16(mul1_lo, mul1_hi); + __m256i tmp1_hi = _mm256_unpackhi_epi16(mul1_lo, mul1_hi); + + res0_256 = _mm256_add_epi32(tmp0_lo, res0_256); + res4_256 = _mm256_add_epi32(tmp0_hi, res4_256); + res8_256 = _mm256_add_epi32(tmp1_lo, res8_256); + res12_256 = _mm256_add_epi32(tmp1_hi, res12_256); + } + + res0_256 = _mm256_srai_epi32(res0_256, interim_shift); + res4_256 = _mm256_srai_epi32(res4_256, interim_shift); + res8_256 = _mm256_srai_epi32(res8_256, interim_shift); + res12_256 = _mm256_srai_epi32(res12_256, interim_shift); + + res0_256 = _mm256_packs_epi32(res0_256, res4_256); + res8_256 = _mm256_packs_epi32(res8_256, res12_256); + + __m256i r0 = _mm256_permute2x128_si256(res0_256, res8_256, 0x20); + __m256i r8 = _mm256_permute2x128_si256(res0_256, res8_256, 0x31); + _mm256_store_si256((__m256i*)(tmp + j), r0); + _mm256_store_si256((__m256i*)(tmp + j + 16), r8); + } + + for (; j < width_rem_size16; j+=16) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + __m128i d0_hi = _mm_unpackhi_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d0_hi, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d0_hi, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = 
_mm_srai_epi32(res12_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); + + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); + _mm256_store_si256((__m256i*)(tmp + j), res); + } + + for (; j < width_rem_size8; j+=8) + { + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_8b + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i d0_lo = _mm_unpacklo_epi8(d0, zero_128); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0_lo, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0_lo, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); + } + + for (; j < width; j++) + { + spat_fil_accum_dtype accum = 0; + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + + ii = diff_i_halffw + fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_8b[ii * src_px_stride + j]; + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_8b[ii * src_px_stride + j]; + } + tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); + } + } + else + { + for (; j < width_rem_size128; j+=128) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + res0_64_512 = res4_64_512 = res8_64_512 = res12_64_512 = _mm512_set1_epi32(interim_rnd); + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi < epi_last_i; fi++){ + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 64)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(src_hbd + 
ii * src_px_stride + j + 96)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + __m512i mul2_lo_512 = _mm512_mullo_epi16(d2, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d2, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d3, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d3, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = _mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i d2 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 64)); + __m512i d3 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 96)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + __m512i mul2_lo_512 = _mm512_mullo_epi16(d2, coef); + __m512i mul2_hi_512 = _mm512_mulhi_epi16(d2, coef); + __m512i mul3_lo_512 = _mm512_mullo_epi16(d3, coef); + __m512i mul3_hi_512 = _mm512_mulhi_epi16(d3, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp2_lo_512 = _mm512_unpacklo_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp2_hi_512 = _mm512_unpackhi_epi16(mul2_lo_512, mul2_hi_512); + __m512i tmp3_lo_512 = _mm512_unpacklo_epi16(mul3_lo_512, mul3_hi_512); + __m512i tmp3_hi_512 = _mm512_unpackhi_epi16(mul3_lo_512, mul3_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + res0_64_512 = _mm512_add_epi32(tmp2_lo_512, res0_64_512); + res4_64_512 = _mm512_add_epi32(tmp2_hi_512, res4_64_512); + res8_64_512 = 
_mm512_add_epi32(tmp3_lo_512, res8_64_512); + res12_64_512 = _mm512_add_epi32(tmp3_hi_512, res12_64_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + res0_64_512 = _mm512_srai_epi32(res0_64_512, interim_shift); + res4_64_512 = _mm512_srai_epi32(res4_64_512, interim_shift); + res8_64_512 = _mm512_srai_epi32(res8_64_512, interim_shift); + res12_64_512 = _mm512_srai_epi32(res12_64_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + res0_64_512 = _mm512_packs_epi32(res0_64_512, res4_64_512); + res8_64_512 = _mm512_packs_epi32(res8_64_512, res12_64_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), res8_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 64), res0_64_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 96), res8_64_512); + } + + for (; j < width_rem_size64; j+=64) + { + res0_512 = res4_512 = res8_512 = res12_512 = _mm512_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi < epi_last_i; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i d1 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j + 32)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + __m512i mul1_lo_512 = _mm512_mullo_epi16(d1, coef); + __m512i mul1_hi_512 = _mm512_mulhi_epi16(d1, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp1_lo_512 = _mm512_unpacklo_epi16(mul1_lo_512, mul1_hi_512); + __m512i tmp1_hi_512 = _mm512_unpackhi_epi16(mul1_lo_512, mul1_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = 
_mm512_add_epi32(tmp0_hi_512, res4_512); + res8_512 = _mm512_add_epi32(tmp1_lo_512, res8_512); + res12_512 = _mm512_add_epi32(tmp1_hi_512, res12_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + res8_512 = _mm512_srai_epi32(res8_512, interim_shift); + res12_512 = _mm512_srai_epi32(res12_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + res8_512 = _mm512_packs_epi32(res8_512, res12_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + _mm512_storeu_si512((__m512i*)(tmp + j + 32), res8_512); + } + + for (; j < width_rem_size32; j+=32) + { + res0_512 = res4_512 = _mm512_set1_epi32(interim_rnd); + + /** + * The full loop is from fi = 0 to fwidth + * During the loop when the centre pixel is at i, + * the top part is available only till i-(fwidth/2) >= 0, + * hence padding (border mirroring) is required when i-fwidth/2 < 0 + */ + //This loop does border mirroring (ii = -(i - fwidth/2 + fi + 1)) + for (fi = 0; fi < epi_last_i; fi++) + { + ii = diff_i_halffw + fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + } + + //Here the normal loop is executed where ii = i - fwidth / 2 + fi + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m512i d0 = _mm512_loadu_si512((__m512i*)(src_hbd + ii * src_px_stride + j)); + __m512i coef = _mm512_set1_epi16(i_filter_coeffs[fi]); + + __m512i mul0_lo_512 = _mm512_mullo_epi16(d0, coef); + __m512i mul0_hi_512 = _mm512_mulhi_epi16(d0, coef); + + // regroup the 2 parts of the result + __m512i tmp0_lo_512 = _mm512_unpacklo_epi16(mul0_lo_512, mul0_hi_512); + __m512i tmp0_hi_512 = _mm512_unpackhi_epi16(mul0_lo_512, mul0_hi_512); + + res0_512 = _mm512_add_epi32(tmp0_lo_512, res0_512); + res4_512 = _mm512_add_epi32(tmp0_hi_512, res4_512); + } + + res0_512 = _mm512_srai_epi32(res0_512, interim_shift); + res4_512 = _mm512_srai_epi32(res4_512, interim_shift); + + res0_512 = _mm512_packs_epi32(res0_512, res4_512); + + _mm512_storeu_si512((__m512i*)(tmp + j), res0_512); + } + + for (; j < width_rem_size16; j+=16){ + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d1, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d1, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = 
_mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i d1 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j + 8)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + __m128i mul1_lo_128 = _mm_mullo_epi16(d1, coef); + __m128i mul1_hi_128 = _mm_mulhi_epi16(d1, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp1_lo_128 = _mm_unpacklo_epi16(mul1_lo_128, mul1_hi_128); + __m128i tmp1_hi_128 = _mm_unpackhi_epi16(mul1_lo_128, mul1_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + res8_128 = _mm_add_epi32(tmp1_lo_128, res8_128); + res12_128 = _mm_add_epi32(tmp1_hi_128, res12_128); + } + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + res8_128 = _mm_srai_epi32(res8_128, interim_shift); + res12_128 = _mm_srai_epi32(res12_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + res8_128 = _mm_packs_epi32(res8_128, res12_128); + + __m256i res = _mm256_inserti128_si256(_mm256_castsi128_si256(res0_128), res8_128, 1); + _mm256_store_si256((__m256i*)(tmp + j), res); + } + + for (; j < width_rem_size8; j+=8){ + res0_128 = res4_128 = res8_128 = res12_128 = _mm_set1_epi32(interim_rnd); + + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + + ii = diff_i_halffw + fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + + // regroup the 2 parts of the result + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + __m128i d0 = _mm_loadu_si128((__m128i*)(src_hbd + ii * src_px_stride + j)); + __m128i coef = _mm_set1_epi16(i_filter_coeffs[fi]); + + __m128i mul0_lo_128 = _mm_mullo_epi16(d0, coef); + __m128i mul0_hi_128 = _mm_mulhi_epi16(d0, coef); + + __m128i tmp0_lo_128 = _mm_unpacklo_epi16(mul0_lo_128, mul0_hi_128); + __m128i tmp0_hi_128 = _mm_unpackhi_epi16(mul0_lo_128, mul0_hi_128); + + res0_128 = _mm_add_epi32(tmp0_lo_128, res0_128); + res4_128 = _mm_add_epi32(tmp0_hi_128, res4_128); + } + + res0_128 = _mm_srai_epi32(res0_128, interim_shift); + res4_128 = _mm_srai_epi32(res4_128, interim_shift); + + res0_128 = _mm_packs_epi32(res0_128, res4_128); + _mm_store_si128((__m128i*)(tmp + j), res0_128); + } + + for (; j < width; j++){ + spat_fil_accum_dtype accum = 0; + //Here the normal loop is executed where ii = i - fwidth/2 + fi + for (fi = 0; fi < epi_last_i; fi++){ + + ii = diff_i_halffw + fi; + accum += (spat_fil_inter_dtype) 
i_filter_coeffs[fi] * src_hbd[ii * src_px_stride + j]; + } + //This loop does border mirroring (ii = 2*height - (i - fwidth/2 + fi) - 1) + for ( ; fi < fwidth; fi++) + { + ii = epi_mir_i - fi; + accum += (spat_fil_inter_dtype) i_filter_coeffs[fi] * src_hbd[ii * src_px_stride + j]; + } + tmp[j] = (spat_fil_inter_dtype) ((accum + interim_rnd) >> interim_shift); + } + } + /* Horizontal pass. common for 8bit and hbd cases */ + integer_horizontal_filter_avx512(tmp, dst, i_filter_coeffs, width, fwidth, i*dst_px_stride, half_fw); + } + + aligned_free(tmp); + + return; +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.h b/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.h new file mode 100644 index 000000000..bd22df6fc --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_filters_avx512.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include "../integer_funque_filters.h" + +void integer_spatial_filter_avx512(void *src, spat_fil_output_dtype *dst, int width, int height, int bitdepth); + +void integer_funque_dwt2_avx512(spat_fil_output_dtype *src, i_dwt2buffers *dwt2_dst, ptrdiff_t dst_stride, int width, int height); + +void integer_funque_vifdwt2_band0_avx512(dwt2_dtype *src, dwt2_dtype *band_a, ptrdiff_t dst_stride, int width, int height); \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.c b/libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.c new file mode 100644 index 000000000..63e701b73 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.c @@ -0,0 +1,884 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
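Editorial note, not part of the patch: the new header above declares the three AVX-512 entry points this translation unit provides. A minimal caller for the spatial filter is sketched here; the wrapper name and include paths are illustrative assumptions, and spat_fil_output_dtype is the output type declared in integer_funque_filters.h.

#include <stdlib.h>
#include "integer_funque_filters.h"            // spat_fil_output_dtype
#include "x86/integer_funque_filters_avx512.h" // integer_spatial_filter_avx512

// Hypothetical helper: filter one 8-bit luma plane with the AVX-512 path.
void spatial_filter_example(unsigned char *luma, int width, int height)
{
    spat_fil_output_dtype *filtered = malloc(sizeof(*filtered) * width * height);
    if (!filtered)
        return;
    // bitdepth == 8 makes the function read src as 8-bit samples;
    // any other supported bit depth selects the uint16_t (high-bit-depth) path.
    integer_spatial_filter_avx512(luma, filtered, width, height, 8);
    // ... hand `filtered` on to integer_funque_dwt2_avx512, declared in the same header ...
    free(filtered);
}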
+ * + */ + +#include +#include +#include +#include "../integer_funque_filters.h" +#include "../integer_funque_ssim.h" +#include "../funque_ssim_options.h" +#include "integer_funque_ssim_avx512.h" +#include + +#define cvt_1_16x16_to_2_32x8_512(a_16x16, r_32x8_lo, r_32x8_hi) \ +{ \ + r_32x8_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(a_16x16)); \ + r_32x8_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(a_16x16, 1)); \ +} +#define cvt_1_16x16_to_2_32x8_256(a_16x16, r_32x8_lo, r_32x8_hi) \ +{ \ + r_32x8_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_16x16)); \ + r_32x8_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a_16x16, 1)); \ +} + +#define cvt_1_32x8_to_2_64x4_512(a_32x8, r_64x4_lo, r_64x4_hi) \ +{ \ + r_64x4_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(a_32x8)); \ + r_64x4_hi = _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(a_32x8, 1)); \ +} +#define cvt_1_32x8_to_2_64x4_256(a_32x8, r_64x4_lo, r_64x4_hi) \ +{ \ + r_64x4_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(a_32x8)); \ + r_64x4_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(a_32x8, 1)); \ +} + +#define cvt_1_32x4_to_2_64x2(a_32x8, r_64x4_lo, r_64x4_hi) \ +{ \ + r_64x4_lo = _mm_cvtepi32_epi64(a_32x8); \ + r_64x4_hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(a_32x8, 0x0E)); \ +} +#define cvt_1_16x8_to_2_32x4(a_16x16, r_32x8_lo, r_32x8_hi) \ +{ \ + r_32x8_lo = _mm_cvtepi16_epi32(a_16x16); \ + r_32x8_hi = _mm_cvtepi16_epi32(_mm_shuffle_epi32(a_16x16, 0x0E)); \ +} + +#define Multiply64Bit_512(ab, cd, res) \ +{ \ + __m512i ac = _mm512_mul_epu32(ab, cd); \ + __m512i b = _mm512_srli_epi64(ab, 32); \ + __m512i bc = _mm512_mul_epu32(b, cd); \ + __m512i d = _mm512_srli_epi64(cd, 32); \ + __m512i ad = _mm512_mul_epu32(ab, d); \ + __m512i high = _mm512_add_epi64(bc, ad); \ + high = _mm512_slli_epi64(high, 32); \ + res = _mm512_add_epi64(high, ac); \ +} + +#define Multiply64Bit_256(ab, cd, res) \ +{ \ + __m256i ac = _mm256_mul_epu32(ab, cd); \ + __m256i b = _mm256_srli_epi64(ab, 32); \ + __m256i bc = _mm256_mul_epu32(b, cd); \ + __m256i d = _mm256_srli_epi64(cd, 32); \ + __m256i ad = _mm256_mul_epu32(ab, d); \ + __m256i high = _mm256_add_epi64(bc, ad); \ + high = _mm256_slli_epi64(high, 32); \ + res = _mm256_add_epi64(high, ac); \ +} + +#define Multiply64Bit_128(ab, cd, res) \ +{ \ + __m128i ac = _mm_mul_epu32(ab, cd); \ + __m128i b = _mm_srli_epi64(ab, 32); \ + __m128i bc = _mm_mul_epu32(b, cd); \ + __m128i d = _mm_srli_epi64(cd, 32); \ + __m128i ad = _mm_mul_epu32(ab, d); \ + __m128i high = _mm_add_epi64(bc, ad); \ + high = _mm_slli_epi64(high, 32); \ + res = _mm_add_epi64(high, ac); \ +} + +static inline int16_t get_best_i16_from_u64(uint64_t temp, int *power) +{ + assert(temp >= 0x20000); + int k = __builtin_clzll(temp); + k = 49 - k; + temp = temp >> k; + *power = k; + return (int16_t) temp; +} + +int integer_compute_ssim_funque_avx512(i_dwt2buffers *ref, i_dwt2buffers *dist, double *score, int max_val, float K1, float K2, int pending_div, int32_t *div_lookup) +{ + int ret = 1; + + int width = ref->width; + int height = ref->height; + + /** + * C1 is constant is added to ref^2, dist^2, + * - hence we have to multiply by pending_div^2 + * As per floating point,C1 is added to 2*(mx/win_dim)*(my/win_dim) & (mx/win_dim)*(mx/win_dim)+(my/win_dim)*(my/win_dim) + * win_dim = 1 << n_levels, where n_levels = 1 + * Since win_dim division is avoided for mx & my, C1 is left shifted by 1 + */ + ssim_inter_dtype C1 = ((K1 * max_val) * (K1 * max_val) * ((pending_div*pending_div) << (2 - SSIM_INTER_L_SHIFT))); + /** + * 
shifts are handled similar to C1 + * not shifted left because the other terms to which this is added undergoes equivalent right shift + */ + ssim_inter_dtype C2 = ((K2 * max_val) * (K2 * max_val) * ((pending_div*pending_div) >> (SSIM_INTER_VAR_SHIFTS+SSIM_INTER_CS_SHIFT-2))); + + ssim_inter_dtype var_x, var_y, cov_xy; + ssim_inter_dtype map; + ssim_accum_dtype map_num; + ssim_accum_dtype map_den; + int16_t i16_map_den; + dwt2_dtype mx, my; + ssim_inter_dtype var_x_band0, var_y_band0, cov_xy_band0; + ssim_inter_dtype l_num, l_den, cs_num, cs_den; + +#if ENABLE_MINK3POOL + ssim_accum_dtype rowcube_1minus_map = 0; + double accumcube_1minus_map = 0; + const ssim_inter_dtype const_1 = 32768; //div_Q_factor>>SSIM_SHIFT_DIV + + __m512i const_1_512 = _mm512_set1_epi64(32768); + __m512i accum_rowcube_512 = _mm512_setzero_si512(); + + __m256i const_1_256 = _mm256_set1_epi64x(32768); + __m256i accum_rowcube_256 = _mm256_setzero_si256(); + + __m128i const_1_128 = _mm_set1_epi64x(32768); + __m128i accum_rowcube_128 = _mm_setzero_si128(); +#else + ssim_accum_dtype accum_map = 0; + ssim_accum_dtype accum_map_sq = 0; + ssim_accum_dtype map_sq_insum = 0; + + __m512i accum_map_512 = _mm512_setzero_si512(); + __m512i accum_map_sq_512 = _mm512_setzero_si512(); + + __m256i accum_map_256 = _mm256_setzero_si256(); + __m256i accum_map_sq_256 = _mm256_setzero_si256(); + + __m128i accum_map_128 = _mm_setzero_si128(); + __m128i accum_map_sq_128 = _mm_setzero_si128(); + +#endif + __m512i C1_512 = _mm512_set1_epi32(C1); + __m512i C2_512 = _mm512_set1_epi32(C2); + + __m256i C1_256 = _mm256_set1_epi32(C1); + __m256i C2_256 = _mm256_set1_epi32(C2); + + __m128i C1_128 = _mm_set1_epi32(C1); + __m128i C2_128 = _mm_set1_epi32(C2); + + int64_t *numVal = (int64_t *)malloc(width * sizeof(int64_t)); + int64_t *denVal = (int64_t *)malloc(width * sizeof(int64_t)); + + int width_rem_size32 = width - (width % 32); + int width_rem_size16 = width - (width % 16); + int width_rem_size8 = width - (width % 8); + int index = 0, j; + + for (int i = 0; i < height; i++) + { + j = 0; + for (; j < width_rem_size32; j+=32) + { + index = i * width + j; + + __m512i ref_b0 = _mm512_loadu_si512((__m512i*)(ref->bands[0] + index)); + __m512i dis_b0 = _mm512_loadu_si512((__m512i*)(dist->bands[0] + index)); + + __m512i ref_b0_lo, ref_b0_hi, dis_b0_lo, dis_b0_hi; + + cvt_1_16x16_to_2_32x8_512(ref_b0, ref_b0_lo, ref_b0_hi); + cvt_1_16x16_to_2_32x8_512(dis_b0, dis_b0_lo, dis_b0_hi); + + __m512i var_x_b0_lo = _mm512_mullo_epi32(ref_b0_lo, ref_b0_lo); + __m512i var_x_b0_hi = _mm512_mullo_epi32(ref_b0_hi, ref_b0_hi); + __m512i var_y_b0_lo = _mm512_mullo_epi32(dis_b0_lo, dis_b0_lo); + __m512i var_y_b0_hi = _mm512_mullo_epi32(dis_b0_hi, dis_b0_hi); + __m512i cov_xy_b0_lo = _mm512_mullo_epi32(ref_b0_lo, dis_b0_lo); + __m512i cov_xy_b0_hi = _mm512_mullo_epi32(ref_b0_hi, dis_b0_hi); + + __m512i ref_b1 = _mm512_loadu_si512((__m512i*)(ref->bands[1] + index)); + __m512i dis_b1 = _mm512_loadu_si512((__m512i*)(dist->bands[1] + index)); + __m512i ref_b2 = _mm512_loadu_si512((__m512i*)(ref->bands[2] + index)); + __m512i dis_b2 = _mm512_loadu_si512((__m512i*)(dist->bands[2] + index)); + __m512i ref_b3 = _mm512_loadu_si512((__m512i*)(ref->bands[3] + index)); + __m512i dis_b3 = _mm512_loadu_si512((__m512i*)(dist->bands[3] + index)); + + __m512i ref_b1_lo, ref_b1_hi, dis_b1_lo, dis_b1_hi, \ + ref_b2_lo, ref_b2_hi, dis_b2_lo, dis_b2_hi, \ + ref_b3_lo, ref_b3_hi, dis_b3_lo, dis_b3_hi; + cvt_1_16x16_to_2_32x8_512(ref_b1, ref_b1_lo, ref_b1_hi); + 
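Editorial note, not part of the patch: later in this function the per-pixel division map_num / map_den is replaced by a normalize-and-multiply scheme (lzcnt, variable shift, then a gather from div_lookup). A scalar sketch of that scheme follows; it reuses get_best_i16_from_u64() defined above and assumes div_lookup stores fixed-point reciprocals indexed by the normalized denominator plus a 32768 offset, as in the existing scalar integer SSIM path.

// Hypothetical scalar model of the vectorized division-by-lookup used below.
static inline ssim_accum_dtype ssim_map_ref(ssim_accum_dtype map_num,
                                            ssim_accum_dtype map_den,
                                            const int32_t *div_lookup)
{
    int power;
    // Reduce the denominator to ~16 significant bits; power is the shift applied.
    int16_t den16 = get_best_i16_from_u64((uint64_t)map_den, &power);
    // Shifting the numerator by the same amount keeps the ratio unchanged.
    ssim_accum_dtype num_scaled = map_num >> power;
    // Multiply by the precomputed reciprocal instead of dividing.
    return (num_scaled * (ssim_accum_dtype)div_lookup[den16 + 32768]) >> SSIM_SHIFT_DIV;
}

The _mm512_lzcnt_epi64/_mm512_srav_epi64 pair further down performs the same 49 - clz normalization on eight 64-bit denominators at once, and _mm512_i64gather_epi32 fetches the eight reciprocals in a single instruction.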
cvt_1_16x16_to_2_32x8_512(dis_b1, dis_b1_lo, dis_b1_hi); + cvt_1_16x16_to_2_32x8_512(ref_b2, ref_b2_lo, ref_b2_hi); + cvt_1_16x16_to_2_32x8_512(dis_b2, dis_b2_lo, dis_b2_hi); + cvt_1_16x16_to_2_32x8_512(ref_b3, ref_b3_lo, ref_b3_hi); + cvt_1_16x16_to_2_32x8_512(dis_b3, dis_b3_lo, dis_b3_hi); + + __m512i var_x_b1_lo = _mm512_mullo_epi32(ref_b1_lo, ref_b1_lo); + __m512i var_x_b1_hi = _mm512_mullo_epi32(ref_b1_hi, ref_b1_hi); + __m512i var_y_b1_lo = _mm512_mullo_epi32(dis_b1_lo, dis_b1_lo); + __m512i var_y_b1_hi = _mm512_mullo_epi32(dis_b1_hi, dis_b1_hi); + + __m512i cov_xy_b1_lo = _mm512_mullo_epi32(ref_b1_lo, dis_b1_lo); + __m512i cov_xy_b1_hi = _mm512_mullo_epi32(ref_b1_hi, dis_b1_hi); + + __m512i var_x_b2_lo = _mm512_mullo_epi32(ref_b2_lo, ref_b2_lo); + __m512i var_x_b2_hi = _mm512_mullo_epi32(ref_b2_hi, ref_b2_hi); + __m512i var_y_b2_lo = _mm512_mullo_epi32(dis_b2_lo, dis_b2_lo); + __m512i var_y_b2_hi = _mm512_mullo_epi32(dis_b2_hi, dis_b2_hi); + __m512i cov_xy_b2_lo = _mm512_mullo_epi32(ref_b2_lo, dis_b2_lo); + __m512i cov_xy_b2_hi = _mm512_mullo_epi32(ref_b2_hi, dis_b2_hi); + + __m512i var_x_b3_lo = _mm512_mullo_epi32(ref_b3_lo, ref_b3_lo); + __m512i var_x_b3_hi = _mm512_mullo_epi32(ref_b3_hi, ref_b3_hi); + __m512i var_y_b3_lo = _mm512_mullo_epi32(dis_b3_lo, dis_b3_lo); + __m512i var_y_b3_hi = _mm512_mullo_epi32(dis_b3_hi, dis_b3_hi); + __m512i cov_xy_b3_lo = _mm512_mullo_epi32(ref_b3_lo, dis_b3_lo); + __m512i cov_xy_b3_hi = _mm512_mullo_epi32(ref_b3_hi, dis_b3_hi); + + __m512i var_x_lo = _mm512_add_epi32(var_x_b1_lo, var_x_b2_lo); + __m512i var_x_hi = _mm512_add_epi32(var_x_b1_hi, var_x_b2_hi); + __m512i var_y_lo = _mm512_add_epi32(var_y_b1_lo, var_y_b2_lo); + __m512i var_y_hi = _mm512_add_epi32(var_y_b1_hi, var_y_b2_hi); + __m512i cov_xy_lo = _mm512_add_epi32(cov_xy_b1_lo, cov_xy_b2_lo); + __m512i cov_xy_hi = _mm512_add_epi32(cov_xy_b1_hi, cov_xy_b2_hi); + var_x_lo = _mm512_add_epi32(var_x_lo, var_x_b3_lo); + var_x_hi = _mm512_add_epi32(var_x_hi, var_x_b3_hi); + var_y_lo = _mm512_add_epi32(var_y_lo, var_y_b3_lo); + var_y_hi = _mm512_add_epi32(var_y_hi, var_y_b3_hi); + cov_xy_lo = _mm512_add_epi32(cov_xy_lo, cov_xy_b3_lo); + cov_xy_hi = _mm512_add_epi32(cov_xy_hi, cov_xy_b3_hi); + + __m512i l_den_lo = _mm512_add_epi32(var_x_b0_lo, var_y_b0_lo); + __m512i l_den_hi = _mm512_add_epi32(var_x_b0_hi, var_y_b0_hi); + + var_x_lo = _mm512_srai_epi32(var_x_lo, SSIM_INTER_VAR_SHIFTS); + var_x_hi = _mm512_srai_epi32(var_x_hi, SSIM_INTER_VAR_SHIFTS); + var_y_lo = _mm512_srai_epi32(var_y_lo, SSIM_INTER_VAR_SHIFTS); + var_y_hi = _mm512_srai_epi32(var_y_hi, SSIM_INTER_VAR_SHIFTS); + cov_xy_lo = _mm512_srai_epi32(cov_xy_lo, SSIM_INTER_VAR_SHIFTS); + cov_xy_hi = _mm512_srai_epi32(cov_xy_hi, SSIM_INTER_VAR_SHIFTS); + + l_den_lo = _mm512_srai_epi32(l_den_lo, SSIM_INTER_L_SHIFT); + l_den_hi = _mm512_srai_epi32(l_den_hi, SSIM_INTER_L_SHIFT); + + __m512i l_num_lo = _mm512_add_epi32(cov_xy_b0_lo, C1_512); + __m512i l_num_hi = _mm512_add_epi32(cov_xy_b0_hi, C1_512); + + __m512i cs_den_lo = _mm512_add_epi32(var_x_lo, var_y_lo); + __m512i cs_den_hi = _mm512_add_epi32(var_x_hi, var_y_hi); + __m512i cs_num_lo = _mm512_add_epi32(cov_xy_lo, C2_512); + __m512i cs_num_hi = _mm512_add_epi32(cov_xy_hi, C2_512); + + cs_den_lo = _mm512_srai_epi32(cs_den_lo, SSIM_INTER_CS_SHIFT); + cs_den_hi = _mm512_srai_epi32(cs_den_hi, SSIM_INTER_CS_SHIFT); + + l_den_lo = _mm512_add_epi32(l_den_lo, C1_512); + l_den_hi = _mm512_add_epi32(l_den_hi, C1_512); + + cs_den_lo = _mm512_add_epi32(cs_den_lo, C2_512); + cs_den_hi = 
_mm512_add_epi32(cs_den_hi, C2_512); + + __m512i map_num_lo0, map_num_lo1, map_num_hi0, map_num_hi1; + __m512i map_den_lo0, map_den_lo1, map_den_hi0, map_den_hi1; + + map_num_lo0 = _mm512_mul_epi32(l_num_lo, cs_num_lo); + map_num_lo1 = _mm512_mul_epi32(_mm512_srai_epi64(l_num_lo, 32), _mm512_srai_epi64(cs_num_lo, 32)); + + map_num_hi0 = _mm512_mul_epi32(l_num_hi, cs_num_hi); + map_num_hi1 = _mm512_mul_epi32(_mm512_srai_epi64(l_num_hi, 32), _mm512_srai_epi64(cs_num_hi, 32)); + + map_den_lo0 = _mm512_mul_epi32(l_den_lo, cs_den_lo); + map_den_lo1 = _mm512_mul_epi32(_mm512_srai_epi64(l_den_lo, 32), _mm512_srai_epi64(cs_den_lo, 32)); + + map_den_hi0 = _mm512_mul_epi32(l_den_hi, cs_den_hi); + map_den_hi1 = _mm512_mul_epi32(_mm512_srai_epi64(l_den_hi, 32), _mm512_srai_epi64(cs_den_hi, 32)); + + __m512i zcnt_lo0 = _mm512_lzcnt_epi64(map_den_lo0); + __m512i zcnt_lo1 = _mm512_lzcnt_epi64(map_den_lo1); + __m512i zcnt_hi0 = _mm512_lzcnt_epi64(map_den_hi0); + __m512i zcnt_hi1 = _mm512_lzcnt_epi64(map_den_hi1); + + zcnt_lo0 = _mm512_sub_epi64(_mm512_set1_epi64(49), zcnt_lo0); + zcnt_lo1 = _mm512_sub_epi64(_mm512_set1_epi64(49), zcnt_lo1); + zcnt_hi0 = _mm512_sub_epi64(_mm512_set1_epi64(49), zcnt_hi0); + zcnt_hi1 = _mm512_sub_epi64(_mm512_set1_epi64(49), zcnt_hi1); + + map_den_lo0 = _mm512_srav_epi64(map_den_lo0, zcnt_lo0); + map_num_lo0 = _mm512_srav_epi64(map_num_lo0, zcnt_lo0); + map_den_lo1 = _mm512_srav_epi64(map_den_lo1, zcnt_lo1); + map_num_lo1 = _mm512_srav_epi64(map_num_lo1, zcnt_lo1); + map_den_hi0 = _mm512_srav_epi64(map_den_hi0, zcnt_hi0); + map_num_hi0 = _mm512_srav_epi64(map_num_hi0, zcnt_hi0); + map_den_hi1 = _mm512_srav_epi64(map_den_hi1, zcnt_hi1); + map_num_hi1 = _mm512_srav_epi64(map_num_hi1, zcnt_hi1); + + map_den_lo0 = _mm512_add_epi64(map_den_lo0, _mm512_set1_epi64(32768)); + map_den_lo1 = _mm512_add_epi64(map_den_lo1, _mm512_set1_epi64(32768)); + map_den_hi0 = _mm512_add_epi64(map_den_hi0, _mm512_set1_epi64(32768)); + map_den_hi1 = _mm512_add_epi64(map_den_hi1, _mm512_set1_epi64(32768)); + + __m256i div_lookup_lo0 = _mm512_i64gather_epi32(map_den_lo0, div_lookup, 4); + __m256i div_lookup_lo1 = _mm512_i64gather_epi32(map_den_lo1, div_lookup, 4); + __m256i div_lookup_hi0 = _mm512_i64gather_epi32(map_den_hi0, div_lookup, 4); + __m256i div_lookup_hi1 = _mm512_i64gather_epi32(map_den_hi1, div_lookup, 4); + __m512i map_lo0, map_lo1, map_hi0, map_hi1; + + Multiply64Bit_512(map_num_lo0, _mm512_cvtepi32_epi64(div_lookup_lo0), map_lo0); + Multiply64Bit_512(map_num_lo1, _mm512_cvtepi32_epi64(div_lookup_lo1), map_lo1); + Multiply64Bit_512(map_num_hi0, _mm512_cvtepi32_epi64(div_lookup_hi0), map_hi0); + Multiply64Bit_512(map_num_hi1, _mm512_cvtepi32_epi64(div_lookup_hi1), map_hi1); + + map_lo0 = _mm512_srai_epi64(map_lo0, SSIM_SHIFT_DIV); + map_lo1 = _mm512_srai_epi64(map_lo1, SSIM_SHIFT_DIV); + map_hi0 = _mm512_srai_epi64(map_hi0, SSIM_SHIFT_DIV); + map_hi1 = _mm512_srai_epi64(map_hi1, SSIM_SHIFT_DIV); + +#if ENABLE_MINK3POOL + __m512i const1_minus_map_lo0 = _mm512_sub_epi64(const_1_512, map_lo0); + __m512i const1_minus_map_lo1 = _mm512_sub_epi64(const_1_512, map_lo1); + __m512i const1_minus_map_hi0 = _mm512_sub_epi64(const_1_512, map_hi0); + __m512i const1_minus_map_hi1 = _mm512_sub_epi64(const_1_512, map_hi1); + + __m512i const1_minus_map_sq_lo0 = _mm512_mul_epi32(const1_minus_map_lo0, const1_minus_map_lo0); + __m512i const1_minus_map_sq_lo1 = _mm512_mul_epi32(const1_minus_map_lo1, const1_minus_map_lo1); + __m512i const1_minus_map_sq_hi0 = _mm512_mul_epi32(const1_minus_map_hi0, 
const1_minus_map_hi0); + __m512i const1_minus_map_sq_hi1 = _mm512_mul_epi32(const1_minus_map_hi1, const1_minus_map_hi1); + + __m512i rowcube_1minus_map_lo0, rowcube_1minus_map_lo1, rowcube_1minus_map_hi0, rowcube_1minus_map_hi1; + + Multiply64Bit_512(const1_minus_map_sq_lo0, const1_minus_map_lo0, rowcube_1minus_map_lo0); + Multiply64Bit_512(const1_minus_map_sq_lo1, const1_minus_map_lo1, rowcube_1minus_map_lo1); + Multiply64Bit_512(const1_minus_map_sq_hi0, const1_minus_map_hi0, rowcube_1minus_map_hi0); + Multiply64Bit_512(const1_minus_map_sq_hi1, const1_minus_map_hi1, rowcube_1minus_map_hi1); + + rowcube_1minus_map_lo0 = _mm512_add_epi64(rowcube_1minus_map_lo0, rowcube_1minus_map_lo1); + rowcube_1minus_map_hi0 = _mm512_add_epi64(rowcube_1minus_map_hi0, rowcube_1minus_map_hi1); + rowcube_1minus_map_lo0 = _mm512_add_epi64(rowcube_1minus_map_lo0, rowcube_1minus_map_hi0); + accum_rowcube_512 = _mm512_add_epi64(accum_rowcube_512, rowcube_1minus_map_lo0); +#else + __m512i map_sq_lo0, map_sq_lo1, map_sq_hi0, map_sq_hi1; + Multiply64Bit_512(map_lo0, map_lo0, map_sq_lo0); + Multiply64Bit_512(map_lo1, map_lo1, map_sq_lo1); + Multiply64Bit_512(map_hi0, map_hi0, map_sq_hi0); + Multiply64Bit_512(map_hi1, map_hi1, map_sq_hi1); + + map_lo0 = _mm512_add_epi64(map_lo0, map_lo1); + map_hi0 = _mm512_add_epi64(map_hi0, map_hi1); + map_lo0 = _mm512_add_epi64(map_lo0, map_hi0); + accum_map_512 = _mm512_add_epi64(accum_map_512, map_lo0); + + map_sq_lo0 = _mm512_add_epi64(map_sq_lo0, map_sq_lo1); + map_sq_hi0 = _mm512_add_epi64(map_sq_hi0, map_sq_hi1); + map_sq_lo0 = _mm512_add_epi64(map_sq_lo0, map_sq_hi0); + accum_map_sq_512 = _mm512_add_epi64(accum_map_sq_512, map_sq_lo0); +#endif + } + + for (; j < width_rem_size16; j+=16) + { + index = i * width + j; + + __m256i ref_b0 = _mm256_loadu_si256((__m256i*)(ref->bands[0] + index)); + __m256i dis_b0 = _mm256_loadu_si256((__m256i*)(dist->bands[0] + index)); + + __m256i ref_b0_lo, ref_b0_hi, dis_b0_lo, dis_b0_hi; + + cvt_1_16x16_to_2_32x8_256(ref_b0, ref_b0_lo, ref_b0_hi); + cvt_1_16x16_to_2_32x8_256(dis_b0, dis_b0_lo, dis_b0_hi); + + __m256i var_x_b0_lo = _mm256_mullo_epi32(ref_b0_lo, ref_b0_lo); + __m256i var_x_b0_hi = _mm256_mullo_epi32(ref_b0_hi, ref_b0_hi); + __m256i var_y_b0_lo = _mm256_mullo_epi32(dis_b0_lo, dis_b0_lo); + __m256i var_y_b0_hi = _mm256_mullo_epi32(dis_b0_hi, dis_b0_hi); + __m256i cov_xy_b0_lo = _mm256_mullo_epi32(ref_b0_lo, dis_b0_lo); + __m256i cov_xy_b0_hi = _mm256_mullo_epi32(ref_b0_hi, dis_b0_hi); + + __m256i ref_b1 = _mm256_loadu_si256((__m256i*)(ref->bands[1] + index)); + __m256i dis_b1 = _mm256_loadu_si256((__m256i*)(dist->bands[1] + index)); + __m256i ref_b2 = _mm256_loadu_si256((__m256i*)(ref->bands[2] + index)); + __m256i dis_b2 = _mm256_loadu_si256((__m256i*)(dist->bands[2] + index)); + __m256i ref_b3 = _mm256_loadu_si256((__m256i*)(ref->bands[3] + index)); + __m256i dis_b3 = _mm256_loadu_si256((__m256i*)(dist->bands[3] + index)); + + __m256i ref_b1_lo, ref_b1_hi, dis_b1_lo, dis_b1_hi, \ + ref_b2_lo, ref_b2_hi, dis_b2_lo, dis_b2_hi, \ + ref_b3_lo, ref_b3_hi, dis_b3_lo, dis_b3_hi; + cvt_1_16x16_to_2_32x8_256(ref_b1, ref_b1_lo, ref_b1_hi); + cvt_1_16x16_to_2_32x8_256(dis_b1, dis_b1_lo, dis_b1_hi); + cvt_1_16x16_to_2_32x8_256(ref_b2, ref_b2_lo, ref_b2_hi); + cvt_1_16x16_to_2_32x8_256(dis_b2, dis_b2_lo, dis_b2_hi); + cvt_1_16x16_to_2_32x8_256(ref_b3, ref_b3_lo, ref_b3_hi); + cvt_1_16x16_to_2_32x8_256(dis_b3, dis_b3_lo, dis_b3_hi); + + __m256i var_x_b1_lo = _mm256_mullo_epi32(ref_b1_lo, ref_b1_lo); + __m256i var_x_b1_hi = 
_mm256_mullo_epi32(ref_b1_hi, ref_b1_hi); + __m256i var_y_b1_lo = _mm256_mullo_epi32(dis_b1_lo, dis_b1_lo); + __m256i var_y_b1_hi = _mm256_mullo_epi32(dis_b1_hi, dis_b1_hi); + + __m256i cov_xy_b1_lo = _mm256_mullo_epi32(ref_b1_lo, dis_b1_lo); + __m256i cov_xy_b1_hi = _mm256_mullo_epi32(ref_b1_hi, dis_b1_hi); + + __m256i var_x_b2_lo = _mm256_mullo_epi32(ref_b2_lo, ref_b2_lo); + __m256i var_x_b2_hi = _mm256_mullo_epi32(ref_b2_hi, ref_b2_hi); + __m256i var_y_b2_lo = _mm256_mullo_epi32(dis_b2_lo, dis_b2_lo); + __m256i var_y_b2_hi = _mm256_mullo_epi32(dis_b2_hi, dis_b2_hi); + __m256i cov_xy_b2_lo = _mm256_mullo_epi32(ref_b2_lo, dis_b2_lo); + __m256i cov_xy_b2_hi = _mm256_mullo_epi32(ref_b2_hi, dis_b2_hi); + + __m256i var_x_b3_lo = _mm256_mullo_epi32(ref_b3_lo, ref_b3_lo); + __m256i var_x_b3_hi = _mm256_mullo_epi32(ref_b3_hi, ref_b3_hi); + __m256i var_y_b3_lo = _mm256_mullo_epi32(dis_b3_lo, dis_b3_lo); + __m256i var_y_b3_hi = _mm256_mullo_epi32(dis_b3_hi, dis_b3_hi); + __m256i cov_xy_b3_lo = _mm256_mullo_epi32(ref_b3_lo, dis_b3_lo); + __m256i cov_xy_b3_hi = _mm256_mullo_epi32(ref_b3_hi, dis_b3_hi); + + __m256i var_x_lo = _mm256_add_epi32(var_x_b1_lo, var_x_b2_lo); + __m256i var_x_hi = _mm256_add_epi32(var_x_b1_hi, var_x_b2_hi); + __m256i var_y_lo = _mm256_add_epi32(var_y_b1_lo, var_y_b2_lo); + __m256i var_y_hi = _mm256_add_epi32(var_y_b1_hi, var_y_b2_hi); + __m256i cov_xy_lo = _mm256_add_epi32(cov_xy_b1_lo, cov_xy_b2_lo); + __m256i cov_xy_hi = _mm256_add_epi32(cov_xy_b1_hi, cov_xy_b2_hi); + var_x_lo = _mm256_add_epi32(var_x_lo, var_x_b3_lo); + var_x_hi = _mm256_add_epi32(var_x_hi, var_x_b3_hi); + var_y_lo = _mm256_add_epi32(var_y_lo, var_y_b3_lo); + var_y_hi = _mm256_add_epi32(var_y_hi, var_y_b3_hi); + cov_xy_lo = _mm256_add_epi32(cov_xy_lo, cov_xy_b3_lo); + cov_xy_hi = _mm256_add_epi32(cov_xy_hi, cov_xy_b3_hi); + + __m256i l_den_lo = _mm256_add_epi32(var_x_b0_lo, var_y_b0_lo); + __m256i l_den_hi = _mm256_add_epi32(var_x_b0_hi, var_y_b0_hi); + + var_x_lo = _mm256_srai_epi32(var_x_lo, SSIM_INTER_VAR_SHIFTS); + var_x_hi = _mm256_srai_epi32(var_x_hi, SSIM_INTER_VAR_SHIFTS); + var_y_lo = _mm256_srai_epi32(var_y_lo, SSIM_INTER_VAR_SHIFTS); + var_y_hi = _mm256_srai_epi32(var_y_hi, SSIM_INTER_VAR_SHIFTS); + cov_xy_lo = _mm256_srai_epi32(cov_xy_lo, SSIM_INTER_VAR_SHIFTS); + cov_xy_hi = _mm256_srai_epi32(cov_xy_hi, SSIM_INTER_VAR_SHIFTS); + + l_den_lo = _mm256_srai_epi32(l_den_lo, SSIM_INTER_L_SHIFT); + l_den_hi = _mm256_srai_epi32(l_den_hi, SSIM_INTER_L_SHIFT); + + __m256i l_num_lo = _mm256_add_epi32(cov_xy_b0_lo, C1_256); + __m256i l_num_hi = _mm256_add_epi32(cov_xy_b0_hi, C1_256); + + __m256i cs_den_lo = _mm256_add_epi32(var_x_lo, var_y_lo); + __m256i cs_den_hi = _mm256_add_epi32(var_x_hi, var_y_hi); + __m256i cs_num_lo = _mm256_add_epi32(cov_xy_lo, C2_256); + __m256i cs_num_hi = _mm256_add_epi32(cov_xy_hi, C2_256); + + cs_den_lo = _mm256_srai_epi32(cs_den_lo, SSIM_INTER_CS_SHIFT); + cs_den_hi = _mm256_srai_epi32(cs_den_hi, SSIM_INTER_CS_SHIFT); + + l_den_lo = _mm256_add_epi32(l_den_lo, C1_256); + l_den_hi = _mm256_add_epi32(l_den_hi, C1_256); + + cs_den_lo = _mm256_add_epi32(cs_den_lo, C2_256); + cs_den_hi = _mm256_add_epi32(cs_den_hi, C2_256); + + __m256i map_num_lo0, map_num_lo1, map_num_hi0, map_num_hi1; + __m256i map_den_lo0, map_den_lo1, map_den_hi0, map_den_hi1; + + map_num_lo0 = _mm256_mul_epi32(l_num_lo, cs_num_lo); + map_num_lo1 = _mm256_mul_epi32(_mm256_srai_epi64(l_num_lo, 32), _mm256_srai_epi64(cs_num_lo, 32)); + + map_num_hi0 = _mm256_mul_epi32(l_num_hi, cs_num_hi); + map_num_hi1 = 
_mm256_mul_epi32(_mm256_srai_epi64(l_num_hi, 32), _mm256_srai_epi64(cs_num_hi, 32)); + + map_den_lo0 = _mm256_mul_epi32(l_den_lo, cs_den_lo); + map_den_lo1 = _mm256_mul_epi32(_mm256_srai_epi64(l_den_lo, 32), _mm256_srai_epi64(cs_den_lo, 32)); + + map_den_hi0 = _mm256_mul_epi32(l_den_hi, cs_den_hi); + map_den_hi1 = _mm256_mul_epi32(_mm256_srai_epi64(l_den_hi, 32), _mm256_srai_epi64(cs_den_hi, 32)); + + __m256i zcnt_lo0 = _mm256_lzcnt_epi64(map_den_lo0); + __m256i zcnt_lo1 = _mm256_lzcnt_epi64(map_den_lo1); + __m256i zcnt_hi0 = _mm256_lzcnt_epi64(map_den_hi0); + __m256i zcnt_hi1 = _mm256_lzcnt_epi64(map_den_hi1); + + zcnt_lo0 = _mm256_sub_epi64(_mm256_set1_epi64x(49), zcnt_lo0); + zcnt_lo1 = _mm256_sub_epi64(_mm256_set1_epi64x(49), zcnt_lo1); + zcnt_hi0 = _mm256_sub_epi64(_mm256_set1_epi64x(49), zcnt_hi0); + zcnt_hi1 = _mm256_sub_epi64(_mm256_set1_epi64x(49), zcnt_hi1); + + map_den_lo0 = _mm256_srav_epi64(map_den_lo0, zcnt_lo0); + map_num_lo0 = _mm256_srav_epi64(map_num_lo0, zcnt_lo0); + map_den_lo1 = _mm256_srav_epi64(map_den_lo1, zcnt_lo1); + map_num_lo1 = _mm256_srav_epi64(map_num_lo1, zcnt_lo1); + map_den_hi0 = _mm256_srav_epi64(map_den_hi0, zcnt_hi0); + map_num_hi0 = _mm256_srav_epi64(map_num_hi0, zcnt_hi0); + map_den_hi1 = _mm256_srav_epi64(map_den_hi1, zcnt_hi1); + map_num_hi1 = _mm256_srav_epi64(map_num_hi1, zcnt_hi1); + + map_den_lo0 = _mm256_add_epi64(map_den_lo0, _mm256_set1_epi64x(32768)); + map_den_lo1 = _mm256_add_epi64(map_den_lo1, _mm256_set1_epi64x(32768)); + map_den_hi0 = _mm256_add_epi64(map_den_hi0, _mm256_set1_epi64x(32768)); + map_den_hi1 = _mm256_add_epi64(map_den_hi1, _mm256_set1_epi64x(32768)); + + __m128i div_lookup_lo0 = _mm256_i64gather_epi32(div_lookup, map_den_lo0, 4); + __m128i div_lookup_lo1 = _mm256_i64gather_epi32(div_lookup, map_den_lo1, 4); + __m128i div_lookup_hi0 = _mm256_i64gather_epi32(div_lookup, map_den_hi0, 4); + __m128i div_lookup_hi1 = _mm256_i64gather_epi32(div_lookup, map_den_hi1, 4); + __m256i map_lo0, map_lo1, map_hi0, map_hi1; + + Multiply64Bit_256(map_num_lo0, _mm256_cvtepi32_epi64(div_lookup_lo0), map_lo0); + Multiply64Bit_256(map_num_lo1, _mm256_cvtepi32_epi64(div_lookup_lo1), map_lo1); + Multiply64Bit_256(map_num_hi0, _mm256_cvtepi32_epi64(div_lookup_hi0), map_hi0); + Multiply64Bit_256(map_num_hi1, _mm256_cvtepi32_epi64(div_lookup_hi1), map_hi1); + + map_lo0 = _mm256_srai_epi64(map_lo0, SSIM_SHIFT_DIV); + map_lo1 = _mm256_srai_epi64(map_lo1, SSIM_SHIFT_DIV); + map_hi0 = _mm256_srai_epi64(map_hi0, SSIM_SHIFT_DIV); + map_hi1 = _mm256_srai_epi64(map_hi1, SSIM_SHIFT_DIV); + +#if ENABLE_MINK3POOL + __m256i const1_minus_map_lo0 = _mm256_sub_epi64(const_1_256, map_lo0); + __m256i const1_minus_map_lo1 = _mm256_sub_epi64(const_1_256, map_lo1); + __m256i const1_minus_map_hi0 = _mm256_sub_epi64(const_1_256, map_hi0); + __m256i const1_minus_map_hi1 = _mm256_sub_epi64(const_1_256, map_hi1); + + __m256i const1_minus_map_sq_lo0 = _mm256_mul_epi32(const1_minus_map_lo0, const1_minus_map_lo0); + __m256i const1_minus_map_sq_lo1 = _mm256_mul_epi32(const1_minus_map_lo1, const1_minus_map_lo1); + __m256i const1_minus_map_sq_hi0 = _mm256_mul_epi32(const1_minus_map_hi0, const1_minus_map_hi0); + __m256i const1_minus_map_sq_hi1 = _mm256_mul_epi32(const1_minus_map_hi1, const1_minus_map_hi1); + + __m256i rowcube_1minus_map_lo0, rowcube_1minus_map_lo1, rowcube_1minus_map_hi0, rowcube_1minus_map_hi1; + + Multiply64Bit_256(const1_minus_map_sq_lo0, const1_minus_map_lo0, rowcube_1minus_map_lo0); + Multiply64Bit_256(const1_minus_map_sq_lo1, const1_minus_map_lo1, 
rowcube_1minus_map_lo1); + Multiply64Bit_256(const1_minus_map_sq_hi0, const1_minus_map_hi0, rowcube_1minus_map_hi0); + Multiply64Bit_256(const1_minus_map_sq_hi1, const1_minus_map_hi1, rowcube_1minus_map_hi1); + + rowcube_1minus_map_lo0 = _mm256_add_epi64(rowcube_1minus_map_lo0, rowcube_1minus_map_lo1); + rowcube_1minus_map_hi0 = _mm256_add_epi64(rowcube_1minus_map_hi0, rowcube_1minus_map_hi1); + rowcube_1minus_map_lo0 = _mm256_add_epi64(rowcube_1minus_map_lo0, rowcube_1minus_map_hi0); + accum_rowcube_256 = _mm256_add_epi64(accum_rowcube_256, rowcube_1minus_map_lo0); + +#else + __m256i map_sq_lo0, map_sq_lo1, map_sq_hi0, map_sq_hi1; + Multiply64Bit_256(map_lo0, map_lo0, map_sq_lo0); + Multiply64Bit_256(map_lo1, map_lo1, map_sq_lo1); + Multiply64Bit_256(map_hi0, map_hi0, map_sq_hi0); + Multiply64Bit_256(map_hi1, map_hi1, map_sq_hi1); + + map_lo0 = _mm256_add_epi64(map_lo0, map_lo1); + map_hi0 = _mm256_add_epi64(map_hi0, map_hi1); + map_lo0 = _mm256_add_epi64(map_lo0, map_hi0); + accum_map_256 = _mm256_add_epi64(accum_map_256, map_lo0); + + map_sq_lo0 = _mm256_add_epi64(map_sq_lo0, map_sq_lo1); + map_sq_hi0 = _mm256_add_epi64(map_sq_hi0, map_sq_hi1); + map_sq_lo0 = _mm256_add_epi64(map_sq_lo0, map_sq_hi0); + accum_map_sq_256 = _mm256_add_epi64(accum_map_sq_256, map_sq_lo0); +#endif + } + + for (; j < width_rem_size8; j+=8) + { + index = i * width + j; + + __m128i ref_b0 = _mm_loadu_si128((__m128i*)(ref->bands[0] + index)); + __m128i dis_b0 = _mm_loadu_si128((__m128i*)(dist->bands[0] + index)); + + __m128i ref_b0_lo, ref_b0_hi, dis_b0_lo, dis_b0_hi; + + cvt_1_16x8_to_2_32x4(ref_b0, ref_b0_lo, ref_b0_hi); + cvt_1_16x8_to_2_32x4(dis_b0, dis_b0_lo, dis_b0_hi); + + __m128i var_x_b0_lo = _mm_mullo_epi32(ref_b0_lo, ref_b0_lo); + __m128i var_x_b0_hi = _mm_mullo_epi32(ref_b0_hi, ref_b0_hi); + __m128i var_y_b0_lo = _mm_mullo_epi32(dis_b0_lo, dis_b0_lo); + __m128i var_y_b0_hi = _mm_mullo_epi32(dis_b0_hi, dis_b0_hi); + __m128i cov_xy_b0_lo = _mm_mullo_epi32(ref_b0_lo, dis_b0_lo); + __m128i cov_xy_b0_hi = _mm_mullo_epi32(ref_b0_hi, dis_b0_hi); + + __m128i ref_b1 = _mm_loadu_si128((__m128i*)(ref->bands[1] + index)); + __m128i dis_b1 = _mm_loadu_si128((__m128i*)(dist->bands[1] + index)); + __m128i ref_b2 = _mm_loadu_si128((__m128i*)(ref->bands[2] + index)); + __m128i dis_b2 = _mm_loadu_si128((__m128i*)(dist->bands[2] + index)); + __m128i ref_b3 = _mm_loadu_si128((__m128i*)(ref->bands[3] + index)); + __m128i dis_b3 = _mm_loadu_si128((__m128i*)(dist->bands[3] + index)); + + __m128i ref_b1_lo, ref_b1_hi, dis_b1_lo, dis_b1_hi, \ + ref_b2_lo, ref_b2_hi, dis_b2_lo, dis_b2_hi, \ + ref_b3_lo, ref_b3_hi, dis_b3_lo, dis_b3_hi; + cvt_1_16x8_to_2_32x4(ref_b1, ref_b1_lo, ref_b1_hi); + cvt_1_16x8_to_2_32x4(dis_b1, dis_b1_lo, dis_b1_hi); + cvt_1_16x8_to_2_32x4(ref_b2, ref_b2_lo, ref_b2_hi); + cvt_1_16x8_to_2_32x4(dis_b2, dis_b2_lo, dis_b2_hi); + cvt_1_16x8_to_2_32x4(ref_b3, ref_b3_lo, ref_b3_hi); + cvt_1_16x8_to_2_32x4(dis_b3, dis_b3_lo, dis_b3_hi); + + __m128i var_x_b1_lo = _mm_mullo_epi32(ref_b1_lo, ref_b1_lo); + __m128i var_x_b1_hi = _mm_mullo_epi32(ref_b1_hi, ref_b1_hi); + __m128i var_y_b1_lo = _mm_mullo_epi32(dis_b1_lo, dis_b1_lo); + __m128i var_y_b1_hi = _mm_mullo_epi32(dis_b1_hi, dis_b1_hi); + + __m128i cov_xy_b1_lo = _mm_mullo_epi32(ref_b1_lo, dis_b1_lo); + __m128i cov_xy_b1_hi = _mm_mullo_epi32(ref_b1_hi, dis_b1_hi); + + __m128i var_x_b2_lo = _mm_mullo_epi32(ref_b2_lo, ref_b2_lo); + __m128i var_x_b2_hi = _mm_mullo_epi32(ref_b2_hi, ref_b2_hi); + __m128i var_y_b2_lo = _mm_mullo_epi32(dis_b2_lo, dis_b2_lo); +
__m128i var_y_b2_hi = _mm_mullo_epi32(dis_b2_hi, dis_b2_hi); + __m128i cov_xy_b2_lo = _mm_mullo_epi32(ref_b2_lo, dis_b2_lo); + __m128i cov_xy_b2_hi = _mm_mullo_epi32(ref_b2_hi, dis_b2_hi); + + __m128i var_x_b3_lo = _mm_mullo_epi32(ref_b3_lo, ref_b3_lo); + __m128i var_x_b3_hi = _mm_mullo_epi32(ref_b3_hi, ref_b3_hi); + __m128i var_y_b3_lo = _mm_mullo_epi32(dis_b3_lo, dis_b3_lo); + __m128i var_y_b3_hi = _mm_mullo_epi32(dis_b3_hi, dis_b3_hi); + __m128i cov_xy_b3_lo = _mm_mullo_epi32(ref_b3_lo, dis_b3_lo); + __m128i cov_xy_b3_hi = _mm_mullo_epi32(ref_b3_hi, dis_b3_hi); + + __m128i var_x_lo = _mm_add_epi32(var_x_b1_lo, var_x_b2_lo); + __m128i var_x_hi = _mm_add_epi32(var_x_b1_hi, var_x_b2_hi); + __m128i var_y_lo = _mm_add_epi32(var_y_b1_lo, var_y_b2_lo); + __m128i var_y_hi = _mm_add_epi32(var_y_b1_hi, var_y_b2_hi); + __m128i cov_xy_lo = _mm_add_epi32(cov_xy_b1_lo, cov_xy_b2_lo); + __m128i cov_xy_hi = _mm_add_epi32(cov_xy_b1_hi, cov_xy_b2_hi); + var_x_lo = _mm_add_epi32(var_x_lo, var_x_b3_lo); + var_x_hi = _mm_add_epi32(var_x_hi, var_x_b3_hi); + var_y_lo = _mm_add_epi32(var_y_lo, var_y_b3_lo); + var_y_hi = _mm_add_epi32(var_y_hi, var_y_b3_hi); + cov_xy_lo = _mm_add_epi32(cov_xy_lo, cov_xy_b3_lo); + cov_xy_hi = _mm_add_epi32(cov_xy_hi, cov_xy_b3_hi); + + __m128i l_den_lo = _mm_add_epi32(var_x_b0_lo, var_y_b0_lo); + __m128i l_den_hi = _mm_add_epi32(var_x_b0_hi, var_y_b0_hi); + + var_x_lo = _mm_srai_epi32(var_x_lo, SSIM_INTER_VAR_SHIFTS); + var_x_hi = _mm_srai_epi32(var_x_hi, SSIM_INTER_VAR_SHIFTS); + var_y_lo = _mm_srai_epi32(var_y_lo, SSIM_INTER_VAR_SHIFTS); + var_y_hi = _mm_srai_epi32(var_y_hi, SSIM_INTER_VAR_SHIFTS); + cov_xy_lo = _mm_srai_epi32(cov_xy_lo, SSIM_INTER_VAR_SHIFTS); + cov_xy_hi = _mm_srai_epi32(cov_xy_hi, SSIM_INTER_VAR_SHIFTS); + + l_den_lo = _mm_srai_epi32(l_den_lo, SSIM_INTER_L_SHIFT); + l_den_hi = _mm_srai_epi32(l_den_hi, SSIM_INTER_L_SHIFT); + + __m128i l_num_lo = _mm_add_epi32(cov_xy_b0_lo, C1_128); + __m128i l_num_hi = _mm_add_epi32(cov_xy_b0_hi, C1_128); + + __m128i cs_den_lo = _mm_add_epi32(var_x_lo, var_y_lo); + __m128i cs_den_hi = _mm_add_epi32(var_x_hi, var_y_hi); + __m128i cs_num_lo = _mm_add_epi32(cov_xy_lo, C2_128); + __m128i cs_num_hi = _mm_add_epi32(cov_xy_hi, C2_128); + + cs_den_lo = _mm_srai_epi32(cs_den_lo, SSIM_INTER_CS_SHIFT); + cs_den_hi = _mm_srai_epi32(cs_den_hi, SSIM_INTER_CS_SHIFT); + + l_den_lo = _mm_add_epi32(l_den_lo, C1_128); + l_den_hi = _mm_add_epi32(l_den_hi, C1_128); + + cs_den_lo = _mm_add_epi32(cs_den_lo, C2_128); + cs_den_hi = _mm_add_epi32(cs_den_hi, C2_128); + + __m128i map_num_lo0, map_num_lo1, map_num_hi0, map_num_hi1; + __m128i map_den_lo0, map_den_lo1, map_den_hi0, map_den_hi1; + + map_num_lo0 = _mm_mul_epi32(l_num_lo, cs_num_lo); + map_num_lo1 = _mm_mul_epi32(_mm_srai_epi64(l_num_lo, 32), _mm_srai_epi64(cs_num_lo, 32)); + + map_num_hi0 = _mm_mul_epi32(l_num_hi, cs_num_hi); + map_num_hi1 = _mm_mul_epi32(_mm_srai_epi64(l_num_hi, 32), _mm_srai_epi64(cs_num_hi, 32)); + + map_den_lo0 = _mm_mul_epi32(l_den_lo, cs_den_lo); + map_den_lo1 = _mm_mul_epi32(_mm_srai_epi64(l_den_lo, 32), _mm_srai_epi64(cs_den_lo, 32)); + + map_den_hi0 = _mm_mul_epi32(l_den_hi, cs_den_hi); + map_den_hi1 = _mm_mul_epi32(_mm_srai_epi64(l_den_hi, 32), _mm_srai_epi64(cs_den_hi, 32)); + + __m128i zcnt_lo0 = _mm_lzcnt_epi64(map_den_lo0); + __m128i zcnt_lo1 = _mm_lzcnt_epi64(map_den_lo1); + __m128i zcnt_hi0 = _mm_lzcnt_epi64(map_den_hi0); + __m128i zcnt_hi1 = _mm_lzcnt_epi64(map_den_hi1); + + zcnt_lo0 = _mm_sub_epi64(_mm_set1_epi64x(49), zcnt_lo0); + zcnt_lo1 = 
_mm_sub_epi64(_mm_set1_epi64x(49), zcnt_lo1); + zcnt_hi0 = _mm_sub_epi64(_mm_set1_epi64x(49), zcnt_hi0); + zcnt_hi1 = _mm_sub_epi64(_mm_set1_epi64x(49), zcnt_hi1); + + map_den_lo0 = _mm_srav_epi64(map_den_lo0, zcnt_lo0); + map_num_lo0 = _mm_srav_epi64(map_num_lo0, zcnt_lo0); + map_den_lo1 = _mm_srav_epi64(map_den_lo1, zcnt_lo1); + map_num_lo1 = _mm_srav_epi64(map_num_lo1, zcnt_lo1); + map_den_hi0 = _mm_srav_epi64(map_den_hi0, zcnt_hi0); + map_num_hi0 = _mm_srav_epi64(map_num_hi0, zcnt_hi0); + map_den_hi1 = _mm_srav_epi64(map_den_hi1, zcnt_hi1); + map_num_hi1 = _mm_srav_epi64(map_num_hi1, zcnt_hi1); + + map_den_lo0 = _mm_add_epi64(map_den_lo0, _mm_set1_epi64x(32768)); + map_den_lo1 = _mm_add_epi64(map_den_lo1, _mm_set1_epi64x(32768)); + map_den_hi0 = _mm_add_epi64(map_den_hi0, _mm_set1_epi64x(32768)); + map_den_hi1 = _mm_add_epi64(map_den_hi1, _mm_set1_epi64x(32768)); + + __m128i div_lookup_lo0 = _mm_i64gather_epi32(div_lookup, map_den_lo0, 4); + __m128i div_lookup_lo1 = _mm_i64gather_epi32(div_lookup, map_den_lo1, 4); + __m128i div_lookup_hi0 = _mm_i64gather_epi32(div_lookup, map_den_hi0, 4); + __m128i div_lookup_hi1 = _mm_i64gather_epi32(div_lookup, map_den_hi1, 4); + __m128i map_lo0, map_lo1, map_hi0, map_hi1; + + Multiply64Bit_128(map_num_lo0, _mm_cvtepi32_epi64(div_lookup_lo0), map_lo0); + Multiply64Bit_128(map_num_lo1, _mm_cvtepi32_epi64(div_lookup_lo1), map_lo1); + Multiply64Bit_128(map_num_hi0, _mm_cvtepi32_epi64(div_lookup_hi0), map_hi0); + Multiply64Bit_128(map_num_hi1, _mm_cvtepi32_epi64(div_lookup_hi1), map_hi1); + + map_lo0 = _mm_srai_epi64(map_lo0, SSIM_SHIFT_DIV); + map_lo1 = _mm_srai_epi64(map_lo1, SSIM_SHIFT_DIV); + map_hi0 = _mm_srai_epi64(map_hi0, SSIM_SHIFT_DIV); + map_hi1 = _mm_srai_epi64(map_hi1, SSIM_SHIFT_DIV); + +#if ENABLE_MINK3POOL + __m128i const1_minus_map_lo0 = _mm_sub_epi64(const_1_128, map_lo0); + __m128i const1_minus_map_lo1 = _mm_sub_epi64(const_1_128, map_lo1); + __m128i const1_minus_map_hi0 = _mm_sub_epi64(const_1_128, map_hi0); + __m128i const1_minus_map_hi1 = _mm_sub_epi64(const_1_128, map_hi1); + + __m128i const1_minus_map_sq_lo0 = _mm_mul_epi32(const1_minus_map_lo0, const1_minus_map_lo0); + __m128i const1_minus_map_sq_lo1 = _mm_mul_epi32(const1_minus_map_lo1, const1_minus_map_lo1); + __m128i const1_minus_map_sq_hi0 = _mm_mul_epi32(const1_minus_map_hi0, const1_minus_map_hi0); + __m128i const1_minus_map_sq_hi1 = _mm_mul_epi32(const1_minus_map_hi1, const1_minus_map_hi1); + + __m128i rowcube_1minus_map_lo0, rowcube_1minus_map_lo1, rowcube_1minus_map_hi0, rowcube_1minus_map_hi1; + + Multiply64Bit_128(const1_minus_map_sq_lo0, const1_minus_map_lo0, rowcube_1minus_map_lo0); + Multiply64Bit_128(const1_minus_map_sq_lo1, const1_minus_map_lo1, rowcube_1minus_map_lo1); + Multiply64Bit_128(const1_minus_map_sq_hi0, const1_minus_map_hi0, rowcube_1minus_map_hi0); + Multiply64Bit_128(const1_minus_map_sq_hi1, const1_minus_map_hi1, rowcube_1minus_map_hi1); + + rowcube_1minus_map_lo0 = _mm_add_epi64(rowcube_1minus_map_lo0, rowcube_1minus_map_lo1); + rowcube_1minus_map_hi0 = _mm_add_epi64(rowcube_1minus_map_hi0, rowcube_1minus_map_hi1); + rowcube_1minus_map_lo0 = _mm_add_epi64(rowcube_1minus_map_lo0, rowcube_1minus_map_hi0); + accum_rowcube_128 = _mm_add_epi64(accum_rowcube_128, rowcube_1minus_map_lo0); + +#else + __m128i map_sq_lo0, map_sq_lo1, map_sq_hi0, map_sq_hi1; + Multiply64Bit_128(map_lo0, map_lo0, map_sq_lo0); + Multiply64Bit_128(map_lo1, map_lo1, map_sq_lo1); + Multiply64Bit_128(map_hi0, map_hi0, map_sq_hi0); + Multiply64Bit_128(map_hi1, map_hi1, map_sq_hi1); + + 
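All three vector widths above perform the same division that the scalar tail loop below spells out: map_den is reduced to its best 15 bits (the vector code derives the shift as 49 - lzcnt(map_den), the scalar code calls get_best_i16_from_u64()), and the quotient is then recovered from the ADM division LUT, where div_lookup[i + 32768] holds 2^30 / i; Multiply64Bit_* just assembles the low 64 bits of a 64x64 product from 32-bit halves, as the _512 macro later in this patch shows. A minimal scalar sketch of the division step, assuming a GCC/Clang __builtin_clzll and SSIM_SHIFT_DIV == 15 (the helper name is illustrative only):

    #include <stdint.h>

    /* Sketch of the LUT-based division used above; map_den is a product of two
     * positive denominators (each padded with C1/C2), so map_den > 0 is assumed. */
    static inline int64_t ssim_map_lut_div(int64_t map_num, int64_t map_den,
                                           const int32_t *div_lookup)
    {
        int power_val = 49 - __builtin_clzll((unsigned long long)map_den);
        if (power_val < 0)                 /* small denominators need no reduction */
            power_val = 0;
        int16_t i16_map_den = (int16_t)(map_den >> power_val); /* top 15 magnitude bits */
        /* map = map_num / map_den ~= (map_num >> power_val) * (2^30 / i16_map_den),
         * right-shifted by SSIM_SHIFT_DIV (15) instead of 30 to retain precision. */
        return ((map_num >> power_val) * div_lookup[i16_map_den + 32768]) >> 15;
    }
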
map_lo0 = _mm_add_epi64(map_lo0, map_lo1); + map_hi0 = _mm_add_epi64(map_hi0, map_hi1); + map_lo0 = _mm_add_epi64(map_lo0, map_hi0); + accum_map_128 = _mm_add_epi64(accum_map_128, map_lo0); + + map_sq_lo0 = _mm_add_epi64(map_sq_lo0, map_sq_lo1); + map_sq_hi0 = _mm_add_epi64(map_sq_hi0, map_sq_hi1); + map_sq_lo0 = _mm_add_epi64(map_sq_lo0, map_sq_hi0); + accum_map_sq_128 = _mm_add_epi64(accum_map_sq_128, map_sq_lo0); +#endif + } + + for (; j < width; j++) + { + index = i * width + j; + mx = ref->bands[0][index]; + my = dist->bands[0][index]; + + var_x = 0; + var_y = 0; + cov_xy = 0; + + for (int k = 1; k < 4; k++) + { + var_x += ((ssim_inter_dtype)ref->bands[k][index] * ref->bands[k][index]); + var_y += ((ssim_inter_dtype)dist->bands[k][index] * dist->bands[k][index]); + cov_xy += ((ssim_inter_dtype)ref->bands[k][index] * dist->bands[k][index]); + } + var_x_band0 = (ssim_inter_dtype)mx * mx; + var_y_band0 = (ssim_inter_dtype)my * my; + cov_xy_band0 = (ssim_inter_dtype)mx * my; + + var_x = (var_x >> SSIM_INTER_VAR_SHIFTS); + var_y = (var_y >> SSIM_INTER_VAR_SHIFTS); + cov_xy = (cov_xy >> SSIM_INTER_VAR_SHIFTS); + + l_num = (cov_xy_band0 + C1); + l_den = (((var_x_band0 + var_y_band0) >> SSIM_INTER_L_SHIFT) + C1); + cs_num = (cov_xy + C2); + cs_den = (((var_x + var_y) >> SSIM_INTER_CS_SHIFT) + C2); + + numVal[j] = (ssim_accum_dtype)l_num * cs_num; + denVal[j] = (ssim_accum_dtype)l_den * cs_den; + + map_num = (ssim_accum_dtype)l_num * cs_num; + map_den = (ssim_accum_dtype)l_den * cs_den; + + /** + * l_den & cs_den are variance terms, hence they will always be +ve + * getting best 15bits and retaining one signed bit, using get_best_i16_from_u64 + * This is done to reuse ADM division LUT, which has LUT for values from -2^15 to 2^15 + */ + int power_val; + i16_map_den = get_best_i16_from_u64((uint64_t) map_den, &power_val); + /** + * The actual equation of map is map_num/map_den + * The division is done using LUT, results of div_lookup = 2^30/i16_map_den + * map = map_num/map_den => map = map_num / (i16_map_den << power_val) + * => map = (map_num >> power_val) / i16_map_den + * => map = (map_num >> power_val) * (div_lookup[i16_map_den + 32768] >> 30) //since it has -ve vals in 1st half + * => map = ((map_num >> power_val) * div_lookup[i16_map_den + 32768]) >> 30 + * Shift by 30 might be very high even for 32 bits precision, hence shift only by 15 + */ + map = ((map_num >> power_val) * div_lookup[i16_map_den + 32768]) >> SSIM_SHIFT_DIV; + +#if ENABLE_MINK3POOL + ssim_accum_dtype const1_minus_map = const_1 - map; + rowcube_1minus_map += const1_minus_map * const1_minus_map * const1_minus_map; +#else + accum_map += map; + map_sq_insum += (ssim_accum_dtype)(((ssim_accum_dtype)map * map)); +#endif + } +#if ENABLE_MINK3POOL + __m256i r4 = _mm256_add_epi64( _mm512_castsi512_si256(accum_rowcube_512), _mm512_extracti64x4_epi64(accum_rowcube_512, 1)); + r4 = _mm256_add_epi64(r4, accum_rowcube_256); + __m128i r2 = _mm_add_epi64(_mm256_castsi256_si128(r4), _mm256_extracti128_si256(r4, 1)); + r2 = _mm_add_epi64(r2, accum_rowcube_128); + accumcube_1minus_map += (double)(_mm_extract_epi64(r2, 0) + _mm_extract_epi64(r2, 1) + rowcube_1minus_map); + accum_rowcube_512 = _mm512_setzero_si512(); + accum_rowcube_256 = _mm256_setzero_si256(); + accum_rowcube_128 = _mm_setzero_si128(); + rowcube_1minus_map = 0; +#endif + } + +#if ENABLE_MINK3POOL + double ssim_val = 1 - cbrt(accumcube_1minus_map/(width*height))/const_1; + *score = ssim_clip(ssim_val, 0, 1); +#else + __m256i r4_map = 
_mm256_add_epi64(_mm512_castsi512_si256(accum_map_512), _mm512_extracti64x4_epi64(accum_map_512, 1)); + r4_map = _mm256_add_epi64(r4_map, accum_map_256); + __m256i r4_map_sq = _mm256_add_epi64(_mm512_castsi512_si256(accum_map_sq_512), _mm512_extracti64x4_epi64(accum_map_sq_512, 1)); + r4_map_sq = _mm256_add_epi64(r4_map_sq, accum_map_sq_256); + __m128i r2_map = _mm_add_epi64(_mm256_castsi256_si128(r4_map), _mm256_extracti64x2_epi64(r4_map, 1)); + r2_map = _mm_add_epi64(r2_map, accum_map_128); + __m128i r2_map_sq = _mm_add_epi64(_mm256_castsi256_si128(r4_map_sq), _mm256_extracti64x2_epi64(r4_map_sq, 1)); + r2_map_sq = _mm_add_epi64(r2_map_sq, accum_map_sq_128); + int64_t r1_map = _mm_extract_epi64(r2_map, 0) + _mm_extract_epi64(r2_map, 1); + int64_t r1_map_sq = _mm_extract_epi64(r2_map_sq, 0) + _mm_extract_epi64(r2_map_sq, 1); + + accum_map += r1_map; + map_sq_insum += r1_map_sq; + accum_map_sq = map_sq_insum / (height * width); + double ssim_mean = (double)accum_map / (height * width); + double ssim_std; + ssim_std = sqrt(MAX(0, ((double) accum_map_sq - ssim_mean*ssim_mean))); + *score = (ssim_std / ssim_mean); + +#endif + + free(numVal); + free(denVal); + ret = 0; + return ret; +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.h b/libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.h new file mode 100644 index 000000000..fb4bf6a92 --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_ssim_avx512.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ +int integer_compute_ssim_funque_avx512(i_dwt2buffers *ref, i_dwt2buffers *dist, double *score, int max_val, float K1, float K2, int pending_div, int32_t *div_lookup); \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.c b/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.c index 5c74e00c6..543a4333b 100644 --- a/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.c +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.c @@ -33,7 +33,7 @@ #include #if USE_DYNAMIC_SIGMA_NSQ -int integer_compute_vif_funque_c(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, +int integer_compute_vif_funque_avx2(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, double* score, double* score_num, double* score_den, int k, int stride, double sigma_nsq_arg, int64_t shift_val, uint32_t* log_18, int vif_level) diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.h b/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.h index 01e799dc7..ce1205f4d 100644 --- a/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.h +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx2.h @@ -23,22 +23,11 @@ #include #include "../integer_funque_vif.h" -#define extract_and_sum(vec, a, a0, a1, a2, a3, a4, a5, a6, a7) \ -{ a0 = a + _mm256_extract_epi32(vec, 0); \ - a1 = a0 + _mm256_extract_epi32(vec, 1); \ - a2 = a1 + _mm256_extract_epi32(vec, 2); \ - a3 = a2 + _mm256_extract_epi32(vec, 3); \ - a4 = a3 + _mm256_extract_epi32(vec, 4); \ - a5 = a4 + _mm256_extract_epi32(vec, 5); \ - a6 = a5 + _mm256_extract_epi32(vec, 6); \ - a7 = a6 + _mm256_extract_epi32(vec, 7); } - - #if USE_DYNAMIC_SIGMA_NSQ int integer_compute_vif_funque_avx2(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, double* score, double* score_num, double* score_den, int k, int stride, double sigma_nsq_arg, - int64_t shift_val, uint32_t* log_18); + int64_t shift_val, uint32_t* log_18, int vif_level); #else int integer_compute_vif_funque_avx2(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, double* score, double* score_num, double* score_den, diff --git a/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx512.c b/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx512.c new file mode 100644 index 000000000..b271df50c --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/integer_funque_vif_avx512.c @@ -0,0 +1,634 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "../funque_vif_options.h" +#include "../integer_funque_filters.h" +#include "../common/macros.h" +#include "../integer_funque_vif.h" +#include "integer_funque_vif_avx512.h" +#include + +#if USE_DYNAMIC_SIGMA_NSQ +int integer_compute_vif_funque_avx512(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, + double* score, double* score_num, double* score_den, + int k, int stride, double sigma_nsq_arg, + int64_t shift_val, uint32_t* log_18, int vif_level) +#else +int integer_compute_vif_funque_avx512(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, + double* score, double* score_num, double* score_den, + int k, int stride, double sigma_nsq_arg, + int64_t shift_val, uint32_t* log_18) +#endif +{ + int ret = 1; + + int kh = k; + int kw = k; + int k_norm = kw * kh; + + int x_reflect = (int)((kh - stride) / 2); // amount for reflecting + int y_reflect = (int)((kw - stride) / 2); + size_t vif_width, vif_height; + +#if VIF_REFLECT_PAD + vif_width = width; + vif_height = height; +#else + vif_width = width - (2 * y_reflect); + vif_height = height - (2 * x_reflect); +#endif + + size_t r_width = vif_width + (2 * x_reflect); // after reflect pad + size_t r_height = vif_height + (2 * x_reflect); + + + dwt2_dtype* x_pad_t, *y_pad_t; + +#if VIF_REFLECT_PAD + x_pad_t = (dwt2_dtype*)malloc(sizeof(dwt2_dtype*) * (vif_width + (2 * x_reflect)) * (vif_height + (2 * x_reflect))); + y_pad_t = (dwt2_dtype*)malloc(sizeof(dwt2_dtype*) * (vif_width + (2 * y_reflect)) * (vif_height + (2 * y_reflect))); + integer_reflect_pad(x_t, vif_width, vif_height, x_reflect, x_pad_t); + integer_reflect_pad(y_t, vif_width, vif_height, y_reflect, y_pad_t); +#else + x_pad_t = x_t; + y_pad_t = y_t; +#endif + + int64_t exp_t = 1; // using 1 because exp in Q32 format is still 0 + int32_t sigma_nsq_t = (int64_t)((int64_t)sigma_nsq_arg*shift_val*shift_val*k_norm) >> VIF_COMPUTE_METRIC_R_SHIFT ; +#if VIF_STABILITY + double sigma_nsq_base = sigma_nsq_arg / (255.0*255.0); +#if USE_DYNAMIC_SIGMA_NSQ + sigma_nsq_base = sigma_nsq_base * (2 << (vif_level + 1)); +#endif + sigma_nsq_t = (int64_t)((int64_t)(sigma_nsq_base*shift_val*shift_val*k_norm)) >> VIF_COMPUTE_METRIC_R_SHIFT ; +#endif + int64_t score_num_t = 0; + int64_t num_power = 0; + int64_t score_den_t = 0; + int64_t den_power = 0; + + int16_t knorm_fact = 25891; // (2^21)/81 knorm factor is multiplied and shifted instead of division + int16_t knorm_shift = 21; + + { + int width_p1 = r_width + 1; + int height_p1 = r_height + 1; + int64_t *interim_2_x = (int64_t*)malloc(width_p1 * sizeof(int64_t)); + int32_t *interim_1_x = (int32_t*)malloc(width_p1 * sizeof(int32_t)); + int64_t *interim_2_y = (int64_t*)malloc(width_p1 * sizeof(int64_t)); + int32_t *interim_1_y = (int32_t*)malloc(width_p1 * sizeof(int32_t)); + int64_t *interim_x_y = (int64_t*)malloc(width_p1 * sizeof(int64_t)); + + memset(interim_2_x, 0, width_p1 * sizeof(int64_t)); + memset(interim_1_x, 0, width_p1 * sizeof(int32_t)); + memset(interim_2_y, 0, width_p1 * sizeof(int64_t)); + memset(interim_1_y, 0, width_p1 * sizeof(int32_t)); + memset(interim_x_y, 0, width_p1 * sizeof(int64_t)); + + int i = 0; + + int width_p1_32 = (width_p1) - ((width_p1) % 32); + int width_p1_16 = (width_p1) - ((width_p1) % 16); + + __m512i interim_1_x_512, interim_1_x16_512, interim_1_y_512, interim_1_y16_512, \ + interim_2_x0_512, interim_2_x8_512, interim_2_x16_512, interim_2_x24_512, \ + interim_2_y0_512, interim_2_y8_512, 
interim_2_y16_512, interim_2_y24_512, \ + interim_x_y0_512, interim_x_y8_512, interim_x_y16_512, interim_x_y24_512; + + //The height loop is broken into 2 parts, + //1st loop, prev kh row is not available to subtract during vertical summation + int j = 1; + for (; j < width_p1_32; j+=32) + { + int j_minus1 = j-1; + + interim_1_x_512 = interim_1_x16_512 = interim_1_y_512 = interim_1_y16_512 = \ + interim_2_x0_512 = interim_2_x8_512 = interim_2_x16_512 = interim_2_x24_512 = \ + interim_2_y0_512 = interim_2_y8_512 = interim_2_y16_512 = interim_2_y24_512 = \ + interim_x_y0_512 = interim_x_y8_512 = interim_x_y16_512 = interim_x_y24_512 = _mm512_setzero_si512(); + + for (i=1; i + +#include +#include "../integer_funque_vif.h" + +#define Multiply64Bit_512(ab, cd, res){ \ + __m512i ac = _mm512_mul_epu32(ab, cd); \ + __m512i b = _mm512_srli_epi64(ab, 32); \ + __m512i bc = _mm512_mul_epu32(b, cd); \ + __m512i d = _mm512_srli_epi64(cd, 32); \ + __m512i ad = _mm512_mul_epu32(ab, d); \ + __m512i high = _mm512_add_epi64(bc, ad); \ + high = _mm512_slli_epi64(high, 32); \ + res = _mm512_add_epi64(high, ac); } + +#define shift15_64b_signExt_512(a, r)\ +{ \ + r = _mm512_add_epi64( _mm512_srli_epi64(a, 6) , _mm512_and_si512(a, _mm512_set1_epi64(0xFC00000000000000)));\ +} + +#if VIF_STABILITY +static inline void vif_stats_calc_avx512(__m512i int_1_x_512, __m512i int_1_y_512, __m512i int_2_x0_512, __m512i int_2_x4_512, + __m512i int_2_y0_512, __m512i int_2_y4_512, __m512i int_x_y0_512, __m512i int_x_y4_512, + int16_t knorm_fact, int16_t knorm_shift, + int16_t exp, int32_t sigma_nsq, uint32_t *log_18, + int64_t *score_num, int64_t *num_power, + int64_t *score_den, int64_t *den_power,int64_t shift_val, int k_norm) +#else +static inline void vif_stats_calc_avx512(__m512i int_1_x_512, __m512i int_1_y_512, __m512i int_2_x0_512, __m512i int_2_x4_512, + __m512i int_2_y0_512, __m512i int_2_y4_512, __m512i int_x_y0_512, __m512i int_x_y4_512, + int16_t knorm_fact, int16_t knorm_shift, + int16_t exp, int32_t sigma_nsq, uint32_t *log_18, + int64_t *score_num, int64_t *num_power, + int64_t *score_den, int64_t *den_power) +#endif +{ + __m512i sigma_512 = _mm512_set1_epi64(sigma_nsq); + __m512i kf_512 = _mm512_set1_epi64(knorm_fact); + __m512i exp_512 = _mm512_set1_epi64(exp); + __m512i zero_512 = _mm512_setzero_si512(); + + __m512i int_1_x4 = _mm512_srli_epi64(int_1_x_512, 32); + __m512i int_1_y4 = _mm512_srli_epi64(int_1_y_512, 32); + + __m512i xx0_512, xx4_512, yy0_512, yy4_512, xy0_512, xy4_512; + __m512i kxx0_512, kxx4_512, kyy0_512, kyy4_512, kxy0_512, kxy4_512; + + xx0_512 = _mm512_mul_epi32(int_1_x_512, int_1_x_512); + xx4_512 = _mm512_mul_epi32(int_1_x4, int_1_x4); + yy0_512 = _mm512_mul_epi32(int_1_y_512, int_1_y_512); + yy4_512 = _mm512_mul_epi32(int_1_y4, int_1_y4); + xy0_512 = _mm512_mul_epi32(int_1_x_512, int_1_y_512); + xy4_512 = _mm512_mul_epi32(int_1_x4, int_1_y4); + + Multiply64Bit_512(xx0_512, kf_512, kxx0_512); + Multiply64Bit_512(xx4_512, kf_512, kxx4_512); + Multiply64Bit_512(yy0_512, kf_512, kyy0_512); + Multiply64Bit_512(yy4_512, kf_512, kyy4_512); + Multiply64Bit_512(xy0_512, kf_512, kxy0_512); + Multiply64Bit_512(xy4_512, kf_512, kxy4_512); + + __mmask8 mask_neg_xx0 = _mm512_cmpgt_epi64_mask(zero_512, kxx0_512); + __mmask8 mask_neg_xx4 = _mm512_cmpgt_epi64_mask(zero_512, kxx4_512); + __mmask8 mask_neg_yy0 = _mm512_cmpgt_epi64_mask(zero_512, kyy0_512); + __mmask8 mask_neg_yy4 = _mm512_cmpgt_epi64_mask(zero_512, kyy4_512); + __mmask8 mask_neg_xy0 = _mm512_cmpgt_epi64_mask(zero_512, kxy0_512); + 
__mmask8 mask_neg_xy4 = _mm512_cmpgt_epi64_mask(zero_512, kxy4_512); + + kxx0_512 = _mm512_srli_epi64(kxx0_512, knorm_shift); + kxx4_512 = _mm512_srli_epi64(kxx4_512, knorm_shift); + kyy0_512 = _mm512_srli_epi64(kyy0_512, knorm_shift); + kyy4_512 = _mm512_srli_epi64(kyy4_512, knorm_shift); + kxy0_512 = _mm512_srli_epi64(kxy0_512, knorm_shift); + kxy4_512 = _mm512_srli_epi64(kxy4_512, knorm_shift); + + __m512i sign_extend_xx0 = _mm512_mask_blend_epi64(mask_neg_xx0, zero_512, _mm512_set1_epi64(0xFFFFF80000000000)); + __m512i sign_extend_xx4 = _mm512_mask_blend_epi64(mask_neg_xx4, zero_512, _mm512_set1_epi64(0xFFFFF80000000000)); + __m512i sign_extend_yy0 = _mm512_mask_blend_epi64(mask_neg_yy0, zero_512, _mm512_set1_epi64(0xFFFFF80000000000)); + __m512i sign_extend_yy4 = _mm512_mask_blend_epi64(mask_neg_yy4, zero_512, _mm512_set1_epi64(0xFFFFF80000000000)); + __m512i sign_extend_xy0 = _mm512_mask_blend_epi64(mask_neg_xy0, zero_512, _mm512_set1_epi64(0xFFFFF80000000000)); + __m512i sign_extend_xy4 = _mm512_mask_blend_epi64(mask_neg_xy4, zero_512, _mm512_set1_epi64(0xFFFFF80000000000)); + + kxx0_512 = _mm512_or_epi64(kxx0_512, sign_extend_xx0); + kxx4_512 = _mm512_or_epi64(kxx4_512, sign_extend_xx4); + kyy0_512 = _mm512_or_epi64(kyy0_512, sign_extend_yy0); + kyy4_512 = _mm512_or_epi64(kyy4_512, sign_extend_yy4); + kxy0_512 = _mm512_or_epi64(kxy0_512, sign_extend_xy0); + kxy4_512 = _mm512_or_epi64(kxy4_512, sign_extend_xy4); + + kxx0_512 = _mm512_sub_epi64(int_2_x0_512, kxx0_512); + kxx4_512 = _mm512_sub_epi64(int_2_x4_512, kxx4_512); + kyy0_512 = _mm512_sub_epi64(int_2_y0_512, kyy0_512); + kyy4_512 = _mm512_sub_epi64(int_2_y4_512, kyy4_512); + kxy0_512 = _mm512_sub_epi64(int_x_y0_512, kxy0_512); + kxy4_512 = _mm512_sub_epi64(int_x_y4_512, kxy4_512); + + __m512i var_x0_512, var_x4_512, var_y0_512, var_y4_512, cov_xy0_512, cov_xy4_512; + + shift15_64b_signExt_512(kxx0_512, var_x0_512); + shift15_64b_signExt_512(kxx4_512, var_x4_512); + shift15_64b_signExt_512(kyy0_512, var_y0_512); + shift15_64b_signExt_512(kyy4_512, var_y4_512); + shift15_64b_signExt_512(kxy0_512, cov_xy0_512); + shift15_64b_signExt_512(kxy4_512, cov_xy4_512); + + __mmask8 mask_x0 = _mm512_cmpgt_epi64_mask(exp_512, var_x0_512); + __mmask8 mask_x4 = _mm512_cmpgt_epi64_mask(exp_512, var_x4_512); + __mmask8 mask_y0 = _mm512_cmpgt_epi64_mask(exp_512, var_y0_512); + __mmask8 mask_y4 = _mm512_cmpgt_epi64_mask(exp_512, var_y4_512); + + __mmask8 mask_xy0 = _kor_mask8(mask_x0, mask_y0); + __mmask8 mask_xy4 = _kor_mask8(mask_x4, mask_y4); + + var_x0_512 = _mm512_mask_blend_epi64(mask_x0, var_x0_512, zero_512); + var_x4_512 = _mm512_mask_blend_epi64(mask_x4, var_x4_512, zero_512); + var_y0_512 = _mm512_mask_blend_epi64(mask_y0, var_y0_512, zero_512); + var_y4_512 = _mm512_mask_blend_epi64(mask_y4, var_y4_512, zero_512); + + cov_xy0_512 = _mm512_mask_blend_epi64(mask_xy0, cov_xy0_512, zero_512); + cov_xy4_512 = _mm512_mask_blend_epi64(mask_xy4, cov_xy4_512, zero_512); + + __m512i g_den0_512 = _mm512_add_epi64(var_x0_512, exp_512); + __m512i g_den4_512 = _mm512_add_epi64(var_x4_512, exp_512); + __m512i sv_sq_0 = _mm512_mul_epi32(cov_xy0_512, cov_xy0_512); + __m512i sv_sq_4 = _mm512_mul_epi32(cov_xy4_512, cov_xy4_512); + + sv_sq_0[0] /= g_den0_512[0]; + sv_sq_0[1] /= g_den0_512[1]; + sv_sq_0[2] /= g_den0_512[2]; + sv_sq_0[3] /= g_den0_512[3]; + sv_sq_0[4] /= g_den0_512[4]; + sv_sq_0[5] /= g_den0_512[5]; + sv_sq_0[6] /= g_den0_512[6]; + sv_sq_0[7] /= g_den0_512[7]; + + sv_sq_4[0] /= g_den4_512[0]; + sv_sq_4[1] /= g_den4_512[1]; + sv_sq_4[2] /= 
g_den4_512[2]; + sv_sq_4[3] /= g_den4_512[3]; + sv_sq_4[4] /= g_den4_512[4]; + sv_sq_4[5] /= g_den4_512[5]; + sv_sq_4[6] /= g_den4_512[6]; + sv_sq_4[7] /= g_den4_512[7]; + + // 0 2 4 6 8 10 12 14 + sv_sq_0 = _mm512_sub_epi64(var_y0_512, sv_sq_0); + // 1 3 5 7 9 11 13 15 + sv_sq_4 = _mm512_sub_epi64(var_y4_512, sv_sq_4); + + // g_num < 0 + __mmask8 maskz_g_num0 = _mm512_cmpgt_epi64_mask(cov_xy0_512, _mm512_setzero_si512()); + __mmask8 maskz_g_num4 = _mm512_cmpgt_epi64_mask(cov_xy4_512, _mm512_setzero_si512()); + // g_den > 0 + __mmask8 maskz_g_den0 = _mm512_cmpgt_epi64_mask(_mm512_setzero_si512(), g_den0_512); + __mmask8 maskz_g_den4 = _mm512_cmpgt_epi64_mask(_mm512_setzero_si512(), g_den4_512); + + // if((g_num < 0 && g_den > 0) || (g_den < 0 && g_num > 0)) + __mmask8 cond_0 = _kxnor_mask8(maskz_g_num0, maskz_g_den0); + __mmask8 cond_4 = _kxnor_mask8(maskz_g_num4, maskz_g_den4); + + __m512i g_num_0 = _mm512_mask_blend_epi64(cond_0, cov_xy0_512, zero_512); + __m512i g_num_4 = _mm512_mask_blend_epi64(cond_4, cov_xy4_512, zero_512); + + sv_sq_0 = _mm512_mask_blend_epi64(cond_0, sv_sq_0, var_x0_512); + sv_sq_4 = _mm512_mask_blend_epi64(cond_4, sv_sq_4, var_x4_512); + + // if (sv_sq < exp) + __mmask8 mask_sv0 = _mm512_cmpgt_epi64_mask(exp_512, sv_sq_0); + __mmask8 mask_sv4 = _mm512_cmpgt_epi64_mask(exp_512, sv_sq_4); + sv_sq_0 = _mm512_mask_blend_epi64(mask_sv0, sv_sq_0, exp_512); + sv_sq_4 = _mm512_mask_blend_epi64(mask_sv4, sv_sq_4, exp_512); + + // ((int64_t)g_num * g_num) + __m512i p1_0 = _mm512_mul_epi32(g_num_0, g_num_0); + __m512i p1_4 = _mm512_mul_epi32(g_num_4, g_num_4); + + // ((int64_t)g_num * g_num)/g_den; + p1_0[0] /= g_den0_512[0]; + p1_0[1] /= g_den0_512[1]; + p1_0[2] /= g_den0_512[2]; + p1_0[3] /= g_den0_512[3]; + p1_0[4] /= g_den0_512[4]; + p1_0[5] /= g_den0_512[5]; + p1_0[6] /= g_den0_512[6]; + p1_0[7] /= g_den0_512[7]; + + p1_4[0] /= g_den4_512[0]; + p1_4[1] /= g_den4_512[1]; + p1_4[2] /= g_den4_512[2]; + p1_4[3] /= g_den4_512[3]; + p1_4[4] /= g_den4_512[4]; + p1_4[5] /= g_den4_512[5]; + p1_4[6] /= g_den4_512[6]; + p1_4[7] /= g_den4_512[7]; + + // (p1 * p2) + __m512i p1_mul_p2_0 = _mm512_mul_epi32(p1_0, var_x0_512); + __m512i p1_mul_p2_4 = _mm512_mul_epi32(p1_4, var_x4_512); + + // ((int64_t) sv_sq + sigma_nsq) + __m512i n2_0 = _mm512_add_epi64(sv_sq_0, sigma_512); + __m512i n2_4 = _mm512_add_epi64(sv_sq_4, sigma_512); + // g_den * ((int64_t) sv_sq + sigma_nsq) + n2_0 = _mm512_mul_epi32(n2_0, g_den0_512); + n2_4 = _mm512_mul_epi32(n2_4, g_den4_512); + // n2 + (p1 * p2) + __m512i n1_0 = _mm512_add_epi64(n2_0, p1_mul_p2_0); + __m512i n1_4 = _mm512_add_epi64(n2_4, p1_mul_p2_4); + + __m512i log_in_num_1_0, log_in_num_1_4, log_in_num_2_0, log_in_num_2_4; + __m512i x1_0, x1_4, x2_0, x2_4; + + x1_0 = _mm512_lzcnt_epi64(n1_0); + x1_4 = _mm512_lzcnt_epi64(n1_4); + x2_0 = _mm512_lzcnt_epi64(n2_0); + x2_4 = _mm512_lzcnt_epi64(n2_4); + + x1_0 = _mm512_sub_epi64(_mm512_set1_epi64(46), x1_0); + x1_4 = _mm512_sub_epi64(_mm512_set1_epi64(46), x1_4); + x2_0 = _mm512_sub_epi64(_mm512_set1_epi64(46), x2_0); + x2_4 = _mm512_sub_epi64(_mm512_set1_epi64(46), x2_4); + + x1_0 = _mm512_max_epi64(x1_0, _mm512_setzero_si512()); + x1_4 = _mm512_max_epi64(x1_4, _mm512_setzero_si512()); + x2_0 = _mm512_max_epi64(x2_0, _mm512_setzero_si512()); + x2_4 = _mm512_max_epi64(x2_4, _mm512_setzero_si512()); + + log_in_num_1_0 = _mm512_srav_epi64(n1_0, x1_0); + log_in_num_1_4 = _mm512_srav_epi64(n1_4, x1_4); + log_in_num_2_0 = _mm512_srav_epi64(n2_0, x2_0); + log_in_num_2_4 = _mm512_srav_epi64(n2_4, x2_4); + + 
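The lzcnt/srav sequence above reduces n1 and n2 to at most 18 significant bits so the gathers that follow can index the log_18 table directly; the power of two removed by the shift is carried separately through x1/x2 into num_power/den_power. A scalar sketch of that split for one value, assuming log_18 is a 2^18-entry fixed-point log table and a GCC/Clang __builtin_clzll (the helper name is illustrative):

    #include <stdint.h>

    /* log(n) is taken as LUT(top 18 bits of n) plus the power of two removed
     * by the shift; n > 0 is assumed. */
    static inline void log18_accumulate(uint64_t n, const uint32_t *log_18,
                                        int64_t *log_sum, int64_t *power_sum)
    {
        int x = 46 - __builtin_clzll(n);   /* bits beyond the 18 kept for the LUT */
        if (x < 0)
            x = 0;                         /* mirrors the _mm512_max_epi64 clamp above */
        *log_sum += log_18[n >> x];        /* fixed-point log of the reduced value */
        *power_sum += x;                   /* exact power of two dropped by the shift */
    }
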
__m512i log_18_1_0 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_num_1_0, log_18, 4)); + __m512i log_18_1_4 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_num_1_4, log_18, 4)); + __m512i log_18_2_0 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_num_2_0, log_18, 4)); + __m512i log_18_2_4 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_num_2_4, log_18, 4)); + + __m512i log_18_0 = _mm512_sub_epi64(log_18_1_0, log_18_2_0); + __m512i log_18_4 = _mm512_sub_epi64(log_18_1_4, log_18_2_4); + + __m512i x1 = _mm512_sub_epi64(x1_0, x2_0); + __m512i x2 = _mm512_sub_epi64(x1_4, x2_4); + +#if VIF_STABILITY + __mmask8 cov_xy0_lt_z = _mm512_cmplt_epi64_mask(g_num_0, zero_512); + __mmask8 cov_xy4_lt_z = _mm512_cmplt_epi64_mask(g_num_4, zero_512); + + log_18_0 = _mm512_mask_blend_epi32(cov_xy0_lt_z, log_18_0, zero_512); + log_18_4 = _mm512_mask_blend_epi32(cov_xy4_lt_z, log_18_4, zero_512); + x1 = _mm512_mask_blend_epi64(cov_xy0_lt_z, x1, zero_512); + x2 = _mm512_mask_blend_epi64(cov_xy4_lt_z, x2, zero_512); + + __m512i shift_val_512 = _mm512_set1_epi32(shift_val); + __m512i k_norm_512 = _mm512_set1_epi32(k_norm); + __mmask8 mask_var_x0_lt_sigma = _mm512_cmplt_epi64_mask(var_x0_512, sigma_512); + __mmask8 mask_var_x4_lt_sigma = _mm512_cmplt_epi64_mask(var_x4_512, sigma_512); + + // shift_val*shift_val + __m512i shift_val_sq = _mm512_mullo_epi32(shift_val_512, shift_val_512); + // (shift_val*shift_val*k_norm) + __m512i shift_val_sq_mul_k_norm_512 = _mm512_mul_epi32(shift_val_sq, k_norm_512); + // ((shift_val*shift_val*k_norm)>> VIF_COMPUTE_METRIC_R_SHIFT) + shift_val_sq_mul_k_norm_512 = _mm512_srli_epi64(shift_val_sq_mul_k_norm_512, VIF_COMPUTE_METRIC_R_SHIFT); + // ((int32_t)((var_y * sigma_max_inv))) + __m512i var_y0_mul_4 = _mm512_slli_epi64(var_y0_512, 2); + __m512i var_y4_mul_4 = _mm512_slli_epi64(var_y4_512, 2); + // ((shift_val*shift_val*k_norm)>> VIF_COMPUTE_METRIC_R_SHIFT) - ((int32_t)((var_y * sigma_max_inv))) + __m512i tmp_num_0 = _mm512_sub_epi64(shift_val_sq_mul_k_norm_512, var_y0_mul_4); + __m512i tmp_num_4 = _mm512_sub_epi64(shift_val_sq_mul_k_norm_512, var_y4_mul_4); + + log_18_0 = _mm512_mask_blend_epi64(mask_var_x0_lt_sigma, log_18_0, tmp_num_0); + log_18_4 = _mm512_mask_blend_epi64(mask_var_x4_lt_sigma, log_18_4, tmp_num_4); + x1 = _mm512_mask_blend_epi64(mask_var_x0_lt_sigma, x1, zero_512); + x2 = _mm512_mask_blend_epi64(mask_var_x4_lt_sigma, x2, zero_512); +#endif + + log_18_0 = _mm512_add_epi64(log_18_0, log_18_4); + __m256i r4 = _mm256_add_epi64(_mm512_castsi512_si256(log_18_0), _mm512_extracti64x4_epi64(log_18_0, 1)); + __m128i r2 = _mm_add_epi64(_mm256_castsi256_si128(r4), _mm256_extracti64x2_epi64(r4, 1)); + int64_t temp_num = _mm_extract_epi64(r2, 0) + _mm_extract_epi64(r2, 1); + + __m512i temp_power_num_0 = _mm512_add_epi64(x1, x2); + __m256i r4_x = _mm256_add_epi64(_mm512_castsi512_si256(temp_power_num_0), _mm512_extracti64x4_epi64(temp_power_num_0, 1)); + __m128i r2_x = _mm_add_epi64(_mm256_castsi256_si128(r4_x), _mm256_extracti64x2_epi64(r4_x, 1)); + int32_t temp_power_num = _mm_extract_epi64(r2_x, 0) + _mm_extract_epi64(r2_x, 1); + *score_num += temp_num; + *num_power += temp_power_num; + + __m512i d1_0 = _mm512_add_epi64(sigma_512, var_x0_512); + __m512i d1_4 = _mm512_add_epi64(sigma_512, var_x4_512); + + __m512i log_in_den_1_0, log_in_den_1_4, log_in_den_2_0; + __m512i y1_0, y1_4, y2_0; + + y1_0 = _mm512_lzcnt_epi64(d1_0); + y1_4 = _mm512_lzcnt_epi64(d1_4); + y2_0 = _mm512_lzcnt_epi64(sigma_512); + + y1_0 = _mm512_sub_epi64(_mm512_set1_epi64(46), 
y1_0); + y1_4 = _mm512_sub_epi64(_mm512_set1_epi64(46), y1_4); + y2_0 = _mm512_sub_epi64(_mm512_set1_epi64(46), y2_0); + + __m512i y1_0_pos = _mm512_max_epi64(y1_0, zero_512); + __m512i y1_4_pos = _mm512_max_epi64(y1_4, zero_512); + __m512i y2_0_pos = _mm512_max_epi64(y2_0, zero_512); + + __m512i y1_0_neg = _mm512_min_epi64(y1_0, zero_512); + __m512i y1_4_neg = _mm512_min_epi64(y1_4, zero_512); + __m512i y2_0_neg = _mm512_min_epi64(y2_0, zero_512); + y1_0_neg = _mm512_abs_epi64(y1_0_neg); + y1_4_neg = _mm512_abs_epi64(y1_4_neg); + y2_0_neg = _mm512_abs_epi64(y2_0_neg); + + log_in_den_1_0 = _mm512_srav_epi64(d1_0, y1_0_pos); + log_in_den_1_4 = _mm512_srav_epi64(d1_4, y1_4_pos); + log_in_den_2_0 = _mm512_srav_epi64(sigma_512, y2_0_pos); + + log_in_den_1_0 = _mm512_sllv_epi64(log_in_den_1_0, y1_0_neg); + log_in_den_1_4 = _mm512_sllv_epi64(log_in_den_1_4, y1_4_neg); + log_in_den_2_0 = _mm512_sllv_epi64(log_in_den_2_0, y2_0_neg); + + log_18_1_0 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_den_1_0, log_18, 4)); + log_18_1_4 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_den_1_4, log_18, 4)); + log_18_2_0 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_den_2_0, log_18, 4)); + log_18_2_4 = _mm512_cvtepi32_epi64(_mm512_i64gather_epi32(log_in_den_2_0, log_18, 4)); + + log_18_0 = _mm512_sub_epi64(log_18_1_0, log_18_2_0); + log_18_4 = _mm512_sub_epi64(log_18_1_4, log_18_2_4); + + __m512i y1 = _mm512_sub_epi64(y1_0, y2_0); + __m512i y2 = _mm512_sub_epi64(y1_4, y2_0); +#if VIF_STABILITY + log_18_0 = _mm512_mask_blend_epi64(mask_var_x0_lt_sigma, log_18_0, shift_val_sq_mul_k_norm_512); + log_18_4 = _mm512_mask_blend_epi64(mask_var_x4_lt_sigma, log_18_4, shift_val_sq_mul_k_norm_512); + y1 = _mm512_mask_blend_epi64(mask_var_x0_lt_sigma, y1, zero_512); + y2 = _mm512_mask_blend_epi64(mask_var_x4_lt_sigma, y2, zero_512); +#endif + + log_18_0 = _mm512_add_epi64(log_18_0, log_18_4); + r4 = _mm256_add_epi64(_mm512_castsi512_si256(log_18_0), _mm512_extracti64x4_epi64(log_18_0, 1)); + r2 = _mm_add_epi64(_mm256_castsi256_si128(r4), _mm256_extracti64x2_epi64(r4, 1)); + int64_t temp_den = _mm_extract_epi64(r2, 0) + _mm_extract_epi64(r2, 1); + + __m512i temp_power_den_0 = _mm512_add_epi64(y1, y2); + __m256i r4_y = _mm256_add_epi64(_mm512_castsi512_si256(temp_power_den_0), _mm512_extracti64x4_epi64(temp_power_den_0, 1)); + __m128i r2_y = _mm_add_epi64(_mm256_castsi256_si128(r4_y), _mm256_extracti64x2_epi64(r4_y, 1)); + int32_t temp_power_den = _mm_extract_epi64(r2_y, 0) + _mm_extract_epi64(r2_y, 1); + + *score_den += temp_den; + *den_power += temp_power_den; +} + +#if USE_DYNAMIC_SIGMA_NSQ +int integer_compute_vif_funque_avx512(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, + double* score, double* score_num, double* score_den, + int k, int stride, double sigma_nsq_arg, + int64_t shift_val, uint32_t* log_18, int vif_level); +#else +int integer_compute_vif_funque_avx512(const dwt2_dtype* x_t, const dwt2_dtype* y_t, size_t width, size_t height, + double* score, double* score_num, double* score_den, + int k, int stride, double sigma_nsq_arg, + int64_t shift_val, uint32_t* log_18); +#endif + +#if VIF_STABILITY +static inline void vif_horz_integralsum_avx512(int kw, int width_p1, + int16_t knorm_fact, int16_t knorm_shift, + int16_t exp, int32_t sigma_nsq, uint32_t *log_18, + int32_t *interim_1_x, int32_t *interim_1_y, + int64_t *interim_2_x, int64_t *interim_2_y, int64_t *interim_x_y, + int64_t *score_num, int64_t *num_power, + int64_t *score_den, int64_t *den_power, int64_t 
shift_val, int k_norm) +#else +static inline void vif_horz_integralsum_avx512(int kw, int width_p1, + int16_t knorm_fact, int16_t knorm_shift, + int16_t exp, int32_t sigma_nsq, uint32_t *log_18, + int32_t *interim_1_x, int32_t *interim_1_y, + int64_t *interim_2_x, int64_t *interim_2_y, int64_t *interim_x_y, + int64_t *score_num, int64_t *num_power, + int64_t *score_den, int64_t *den_power) +#endif +{ + int32_t int_1_x, int_1_y; + int64_t int_2_x, int_2_y, int_x_y; + int width_p1_16 = (width_p1) - ((width_p1 - kw - 1) % 16); + //1st column vals are 0, hence intialising to 0 + int_1_x = 0; + int_1_y = 0; + int_2_x = 0; + int_2_y = 0; + int_x_y = 0; + /** + * The horizontal accumulation similar to vertical accumulation + * metric_sum = prev_col_metric_sum + interim_metric_vertical_sum + * The previous kw col interim metric sum is not subtracted since it is not available here + */ + + __m512i interim_1_x0_512 = _mm512_loadu_si512((__m512i*)(interim_1_x + 1)); + __m512i interim_1_y0_512 = _mm512_loadu_si512((__m512i*)(interim_1_y + 1)); + + __m256i interim_1_x0_256 = _mm512_castsi512_si256(interim_1_x0_512); + __m256i interim_1_y0_256 = _mm512_castsi512_si256(interim_1_y0_512); + + __m512i interim_2_x0_512 = _mm512_loadu_si512((__m512i*)(interim_2_x + 1)); + __m512i interim_2_y0_512 = _mm512_loadu_si512((__m512i*)(interim_2_y + 1)); + __m512i interim_x_y0_512 = _mm512_loadu_si512((__m512i*)(interim_x_y + 1)); + __m512i interim_2_x8_512 = _mm512_loadu_si512((__m512i*)(interim_2_x + 9)); + __m512i interim_2_y8_512 = _mm512_loadu_si512((__m512i*)(interim_2_y + 9)); + __m512i interim_x_y8_512 = _mm512_loadu_si512((__m512i*)(interim_x_y + 9)); + + __m128i int_1_x_r4 = _mm_add_epi32(_mm256_castsi256_si128(interim_1_x0_256), _mm256_extracti128_si256(interim_1_x0_256, 1)); + __m128i int_1_y_r4 = _mm_add_epi32(_mm256_castsi256_si128(interim_1_y0_256), _mm256_extracti128_si256(interim_1_y0_256, 1)); + + __m256i sum_x04 = _mm256_add_epi64(_mm512_castsi512_si256(interim_2_x0_512), _mm512_extracti64x4_epi64(interim_2_x0_512, 1)); + __m256i sum_y04 = _mm256_add_epi64(_mm512_castsi512_si256(interim_2_y0_512), _mm512_extracti64x4_epi64(interim_2_y0_512, 1)); + __m256i sum_xy04 = _mm256_add_epi64(_mm512_castsi512_si256(interim_x_y0_512), _mm512_extracti64x4_epi64(interim_x_y0_512, 1)); + + __m128i int_1_x_r2 = _mm_hadd_epi32(int_1_x_r4, int_1_x_r4); + __m128i int_1_y_r2 = _mm_hadd_epi32(int_1_y_r4, int_1_y_r4); + __m128i int_2_x_r2 = _mm_add_epi64(_mm256_castsi256_si128(sum_x04), _mm256_extracti128_si256(sum_x04, 1)); + __m128i int_2_y_r2 = _mm_add_epi64(_mm256_castsi256_si128(sum_y04), _mm256_extracti128_si256(sum_y04, 1)); + __m128i int_x_y_r2 = _mm_add_epi64(_mm256_castsi256_si128(sum_xy04), _mm256_extracti128_si256(sum_xy04, 1)); + + __m128i int_1_x_r1 = _mm_hadd_epi32(int_1_x_r2, int_1_x_r2); + __m128i int_1_y_r1 = _mm_hadd_epi32(int_1_y_r2, int_1_y_r2); + __m128i int_2_x_r1 = _mm_add_epi64(int_2_x_r2, _mm_unpackhi_epi64(int_2_x_r2, _mm_setzero_si128())); + __m128i int_2_y_r1 = _mm_add_epi64(int_2_y_r2, _mm_unpackhi_epi64(int_2_y_r2, _mm_setzero_si128())); + __m128i int_x_y_r1 = _mm_add_epi64(int_x_y_r2, _mm_unpackhi_epi64(int_x_y_r2, _mm_setzero_si128())); + + int32_t int_1_x0 = _mm_extract_epi32(int_1_x_r1, 0); + int32_t int_1_y0 = _mm_extract_epi32(int_1_y_r1, 0); + int64_t int_2_x0 = _mm_extract_epi64(int_2_x_r1, 0); + int64_t int_2_y0 = _mm_extract_epi64(int_2_y_r1, 0); + int64_t int_x_y0 = _mm_extract_epi64(int_x_y_r1, 0); + + int_1_x = interim_1_x[kw] + int_1_x0; + int_1_y = interim_1_y[kw] + 
int_1_y0; + int_2_x = interim_2_x[kw] + int_2_x0; + int_2_y = interim_2_y[kw] + int_2_y0; + int_x_y = interim_x_y[kw] + int_x_y0; + + /** + * The score needs to be calculated for kw column as well, + * whose interim result calc is different from rest of the columns, + * hence calling vif_stats_calc for kw column separately + */ + +#if VIF_STABILITY + vif_stats_calc(int_1_x, int_1_y, int_2_x, int_2_y, int_x_y, + knorm_fact, knorm_shift, + exp, sigma_nsq, log_18, + score_num, num_power, score_den, den_power, shift_val, k_norm); +#else + vif_stats_calc(int_1_x, int_1_y, int_2_x, int_2_y, int_x_y, + knorm_fact, knorm_shift, + exp, sigma_nsq, log_18, + score_num, num_power, score_den, den_power); +#endif + + __m512i interim_1_x9_512, interim_2_x9_512, interim_2_x17_512, interim_1_y9_512, \ + interim_2_y9_512, interim_2_y17_512, interim_x_y9_512, interim_x_y17_512; + //Similar to prev loop, but previous kw col interim metric sum is subtracted + int j; + for (j = kw+1; j +#include +#include +#include +#if ARCH_AARCH64 +#include +#endif + +#include "resizer_avx512.h" +#include +#include + +#if !OPTIMISED_COEFF +static void interpolateCubic(float x, float *coeffs) +{ + const float A = -0.75f; + + coeffs[0] = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A; + coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; + coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} +#endif + +#if OPTIMISED_COEFF +void hresize_avx512(const unsigned char **src, int **dst, int count, + const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#else +void hresize_avx512(const unsigned char **src, int **dst, int count, + const int *xofs, const short *alpha, + int swidth, int dwidth, int cn, int xmin, int xmax) +#endif +{ + int xmax_64 = xmax - (xmax % 64); + int xmax_32 = xmax - (xmax % 32); + int xmax_16 = xmax - (xmax % 16); + int xmax_8 = xmax - (xmax % 8); + int xmax_4 = xmax - (xmax % 4); + + __m512i coef0_512 = _mm512_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m512i coef2_512 = _mm512_set1_epi32(alpha[2] + (alpha[3] << 16)); + __m512i permlo_512 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); + __m512i permhi_512 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i coef0_256 = _mm256_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m256i coef2_256 = _mm256_set1_epi32(alpha[2] + (alpha[3] << 16)); + + __m128i coef0_128 = _mm_set1_epi32(alpha[0] + (alpha[1] << 16) + (1 << 16)); + __m128i coef2_128 = _mm_set1_epi32(alpha[2] + (alpha[3] << 16)); + + for (int k = 0; k < count; k++) + { + const unsigned char *S = src[k]; + int *D = dst[k]; + int dx = 0, limit = xmin; + for (;;) + { +#if OPTIMISED_COEFF + for (; dx < limit; dx++) + { + int j; + int sx = (dx * 2) - cn; +#else + for (; dx < limit; dx++, alpha += 4) + { + int j; + int sx = xofs[dx] - cn; +#endif + int v = 0; + for (j = 0; j < 4; j++) + { + int sxj = sx + j * cn; + if ((unsigned)sxj >= (unsigned)swidth) + { + while (sxj < 0) + sxj += cn; + while (sxj >= swidth) + sxj -= cn; + } + v += S[sxj] * alpha[j]; + } + D[dx] = v; + } + if (limit == dwidth) + break; +#if OPTIMISED_COEFF + for (; dx < xmax_64; dx+=64) + { + int sx = dx * 2; +#else + for (; dx < xmax; dx++, alpha += 4) + { + int sx = xofs[dx]; // sx - 2, 4, 6, 8.... 
+#endif + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + __m512i val64 = _mm512_loadu_si512((__m512i*)(S + sx - 1 + 64)); + __m512i val66 = _mm512_loadu_si512((__m512i*)(S + sx + 1 + 64)); + + __m512i val0_lo = _mm512_unpacklo_epi8(val0, zero_512); + __m512i val0_hi = _mm512_unpackhi_epi8(val0, zero_512); + __m512i val2_lo = _mm512_unpacklo_epi8(val2, zero_512); + __m512i val2_hi = _mm512_unpackhi_epi8(val2, zero_512); + + __m512i val64_lo = _mm512_unpacklo_epi8(val64, zero_512); + __m512i val64_hi = _mm512_unpackhi_epi8(val64, zero_512); + __m512i val66_lo = _mm512_unpacklo_epi8(val66, zero_512); + __m512i val66_hi = _mm512_unpackhi_epi8(val66, zero_512); + + __m512i res0_lo = _mm512_madd_epi16(val0_lo, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0_hi, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2_lo, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2_hi, coef2_512); + + __m512i res64_lo = _mm512_madd_epi16(val64_lo, coef0_512); + __m512i res64_hi = _mm512_madd_epi16(val64_hi, coef0_512); + __m512i res66_lo = _mm512_madd_epi16(val66_lo, coef2_512); + __m512i res66_hi = _mm512_madd_epi16(val66_hi, coef2_512); + + __m512i r0_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i r0_hi = _mm512_add_epi32(res0_hi, res2_hi); + __m512i r1_lo = _mm512_add_epi32(res64_lo, res66_lo); + __m512i r1_hi = _mm512_add_epi32(res64_hi, res66_hi); + __m512i tmp0 = r0_lo; + __m512i tmp1 = r1_lo; + + r0_lo = _mm512_permutex2var_epi64(r0_lo, permlo_512, r0_hi); + r0_hi = _mm512_permutex2var_epi64(tmp0, permhi_512, r0_hi); + r1_lo = _mm512_permutex2var_epi64(r1_lo, permlo_512, r1_hi); + r1_hi = _mm512_permutex2var_epi64(tmp1, permhi_512, r1_hi); + + _mm512_storeu_si512((__m512i*)(D + dx), r0_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), r0_hi); + _mm512_storeu_si512((__m512i*)(D + dx + 32), r1_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 48), r1_hi); + } + for (; dx < xmax_32; dx+=32) + { + int sx = dx * 2; + __m512i val0 = _mm512_loadu_si512((__m512i*)(S + sx - 1)); + __m512i val2 = _mm512_loadu_si512((__m512i*)(S + sx + 1)); + + __m512i val0_lo = _mm512_unpacklo_epi8(val0, zero_512); + __m512i val0_hi = _mm512_unpackhi_epi8(val0, zero_512); + + __m512i val2_lo = _mm512_unpacklo_epi8(val2, zero_512); + __m512i val2_hi = _mm512_unpackhi_epi8(val2, zero_512); + + __m512i res0_lo = _mm512_madd_epi16(val0_lo, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0_hi, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2_lo, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2_hi, coef2_512); + + __m512i res_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i res_hi = _mm512_add_epi32(res0_hi, res2_hi); + __m512i tmp = res_lo; + + res_lo = _mm512_permutex2var_epi64(res_lo, permlo_512, res_hi); + res_hi = _mm512_permutex2var_epi64(tmp, permhi_512, res_hi); + + _mm512_storeu_si512((__m512i*)(D + dx), res_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_hi); + } + for (; dx < xmax_16; dx+=16) + { + int sx = dx * 2; + __m512i val0 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(S + sx - 1))); + __m512i val2 = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(S + sx + 1))); + + __m512i res0_lo = _mm512_madd_epi16(val0, coef0_512); + __m512i res0_hi = _mm512_madd_epi16(val0, coef0_512); + __m512i res2_lo = _mm512_madd_epi16(val2, coef2_512); + __m512i res2_hi = _mm512_madd_epi16(val2, coef2_512); + + __m512i res_lo = _mm512_add_epi32(res0_lo, res2_lo); + __m512i res_hi = _mm512_add_epi32(res0_hi, res2_hi); 
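/*
 * Illustrative aside (not part of the patch): the coef0_*/coef2_* constants
 * above pack two adjacent 16-bit taps into each 32-bit lane so that a single
 * madd produces a two-tap partial sum per output pixel; adding the coef0 and
 * coef2 results gives the full 4-tap horizontal filter. A tiny standalone
 * demonstration of that pairing using SSE2 and made-up tap values:
 */
#include <emmintrin.h> /* SSE2 */
#include <stdio.h>

int main(void)
{
    /* two 16-bit taps packed into each 32-bit lane: low half = a0, high half = a1 */
    const short a0 = -9, a1 = 111;
    __m128i coef = _mm_set1_epi32((unsigned short)a0 | ((unsigned short)a1 << 16));

    short s[8] = {10, 20, 30, 40, 50, 60, 70, 80};
    __m128i v = _mm_loadu_si128((__m128i *)s);

    /* madd_epi16: each 32-bit result equals s[2k]*a0 + s[2k+1]*a1 */
    __m128i r = _mm_madd_epi16(v, coef);

    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    /* prints 2130 4170 6210 8250 */
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}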
+ + _mm512_storeu_si512((__m512i*)(D + dx), res_lo); + _mm512_storeu_si512((__m512i*)(D + dx + 16), res_hi); + } + for (; dx < xmax_8; dx+=8) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m256i val0_16 = _mm256_cvtepu8_epi16(val0); + __m256i val2_16 = _mm256_cvtepu8_epi16(val2); + + __m256i res0 = _mm256_madd_epi16(val0_16, coef0_256); + __m256i res2 = _mm256_madd_epi16(val2_16, coef2_256); + + __m256i res = _mm256_add_epi32(res0, res2); + _mm256_storeu_si256((__m256i*)(D + dx), res); + } + for (; dx < xmax_4; dx+=4) + { + int sx = dx * 2; + + __m128i val0 = _mm_loadu_si128((__m128i*)(S + sx - 1)); + __m128i val2 = _mm_loadu_si128((__m128i*)(S + sx + 1)); + + __m128i val0_16 = _mm_cvtepu8_epi16(val0); + __m128i val2_16 = _mm_cvtepu8_epi16(val2); + + __m128i res0 = _mm_madd_epi16(val0_16, coef0_128); + __m128i res2 = _mm_madd_epi16(val2_16, coef2_128); + + __m128i res = _mm_add_epi32(res0, res2); + _mm_storeu_si128((__m128i*)(D + dx), res); + } + for (; dx < xmax; dx++) + { + int sx = dx * 2; + D[dx] = S[sx - 1] * alpha[0] + S[sx] * alpha[1] + S[sx + 1] * alpha[2] + S[sx + 2] * alpha[3]; + } + limit = dwidth; + } +#if !OPTIMISED_COEFF + alpha -= dwidth * 4; +#endif + } +} + +void vresize_avx512(const int **src, unsigned char *dst, const short *beta, int width) +{ + int b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int bits = 22; + + __m512i sh_32_to_8_512 = _mm512_set_epi64(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m512i perm0_512 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 8, 4, 0); + __m512i perm8_512 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 12, 8, 4, 0, 1, 1, 1, 1); + __m512i coef0_512 = _mm512_set1_epi32(beta[0]); + __m512i coef1_512 = _mm512_set1_epi32(beta[1]); + __m512i delta_512 = _mm512_set1_epi32(1 << (bits - 1)); + __m512i max_char_512 = _mm512_set1_epi32(255); + __m512i zero_512 = _mm512_setzero_si512(); + + __m256i sh_32_to_8_256 = _mm256_set_epi64x(0x8080808080808080, 0x808080800C080400, 0x8080808080808080, 0x808080800C080400); + __m256i perm0_256 = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 4, 0); + __m256i perm8_256 = _mm256_set_epi32(1, 1, 1, 1, 4, 0, 1, 1); + __m256i coef0_256 = _mm256_set1_epi32(beta[0]); + __m256i coef1_256 = _mm256_set1_epi32(beta[1]); + __m256i delta_256 = _mm256_set1_epi32(1 << (bits - 1)); + __m256i max_char_256 = _mm256_set1_epi32(255); + __m256i zero_256 = _mm256_setzero_si256(); + + __m128i sh_32_to_8_128 = _mm_set_epi64x(0x8080808080808080, 0x808080800C080400); + __m128i coef0_128 = _mm_set1_epi32(beta[0]); + __m128i coef1_128 = _mm_set1_epi32(beta[1]); + __m128i delta_128 = _mm_set1_epi32(1 << (bits - 1)); + __m128i max_char_128 = _mm_set1_epi32(255); + __m128i zero_128 = _mm_setzero_si128(); + + int width_32 = width - (width % 32); + int width_16 = width - (width % 16); + int width_8 = width - (width % 8); + int width_4 = width - (width % 4); + int x = 0; + + for (; x < width_32; x+=32) + { + __m512i src0_0 = _mm512_loadu_si512((__m512i*)(S0 + x)); + __m512i src1_0 = _mm512_loadu_si512((__m512i*)(S1 + x)); + __m512i src2_0 = _mm512_loadu_si512((__m512i*)(S2 + x)); + __m512i src3_0 = _mm512_loadu_si512((__m512i*)(S3 + x)); + + __m512i src0_16 = _mm512_loadu_si512((__m512i*)(S0 + x + 16)); + __m512i src1_16 = 
_mm512_loadu_si512((__m512i*)(S1 + x + 16)); + __m512i src2_16 = _mm512_loadu_si512((__m512i*)(S2 + x + 16)); + __m512i src3_16 = _mm512_loadu_si512((__m512i*)(S3 + x + 16)); + + __m512i mul0_0 = _mm512_mullo_epi32(src0_0, coef0_512); + __m512i mul1_0 = _mm512_mullo_epi32(src1_0, coef1_512); + __m512i mul2_0 = _mm512_mullo_epi32(src2_0, coef1_512); + __m512i mul3_0 = _mm512_mullo_epi32(src3_0, coef0_512); + + __m512i mul0_8 = _mm512_mullo_epi32(src0_16, coef0_512); + __m512i mul1_8 = _mm512_mullo_epi32(src1_16, coef1_512); + __m512i mul2_8 = _mm512_mullo_epi32(src2_16, coef1_512); + __m512i mul3_8 = _mm512_mullo_epi32(src3_16, coef0_512); + + __m512i accum_01_0 = _mm512_add_epi32(mul0_0, mul1_0); + __m512i accum_23_0 = _mm512_add_epi32(mul2_0, mul3_0); + __m512i accum_01_8 = _mm512_add_epi32(mul0_8, mul1_8); + __m512i accum_23_8 = _mm512_add_epi32(mul2_8, mul3_8); + __m512i accum_0123_0 = _mm512_add_epi32(accum_01_0, accum_23_0); + __m512i accum_0123_8 = _mm512_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm512_add_epi32(accum_0123_0, delta_512); + accum_0123_8 = _mm512_add_epi32(accum_0123_8, delta_512); + accum_0123_0 = _mm512_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm512_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm512_max_epi32(accum_0123_0, zero_512); + accum_0123_8 = _mm512_max_epi32(accum_0123_8, zero_512); + accum_0123_0 = _mm512_min_epi32(accum_0123_0, max_char_512); + accum_0123_8 = _mm512_min_epi32(accum_0123_8,max_char_512); + + accum_0123_0 = _mm512_shuffle_epi8(accum_0123_0, sh_32_to_8_512); + accum_0123_8 = _mm512_shuffle_epi8(accum_0123_8, sh_32_to_8_512); + + accum_0123_0 = _mm512_permutexvar_epi32(perm0_512, accum_0123_0); + accum_0123_8 = _mm512_permutexvar_epi32(perm8_512, accum_0123_8); + __m256i accum = _mm512_extracti32x8_epi32(_mm512_or_si512(accum_0123_0, accum_0123_8), 0); + _mm256_storeu_si256((__m256i*)(dst + x), accum); + } + for (; x < width_16; x+=16) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i src0_8 = _mm256_loadu_si256((__m256i*)(S0 + x + 8)); + __m256i src1_8 = _mm256_loadu_si256((__m256i*)(S1 + x + 8)); + __m256i src2_8 = _mm256_loadu_si256((__m256i*)(S2 + x + 8)); + __m256i src3_8 = _mm256_loadu_si256((__m256i*)(S3 + x + 8)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i mul0_8 = _mm256_mullo_epi32(src0_8, coef0_256); + __m256i mul1_8 = _mm256_mullo_epi32(src1_8, coef1_256); + __m256i mul2_8 = _mm256_mullo_epi32(src2_8, coef1_256); + __m256i mul3_8 = _mm256_mullo_epi32(src3_8, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_01_8 = _mm256_add_epi32(mul0_8, mul1_8); + __m256i accum_23_8 = _mm256_add_epi32(mul2_8, mul3_8); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + __m256i accum_0123_8 = _mm256_add_epi32(accum_01_8, accum_23_8); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_8 = _mm256_add_epi32(accum_0123_8, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + accum_0123_8 = _mm256_srai_epi32(accum_0123_8, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, 
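/*
 * Illustrative aside (not part of the patch): the sh_32_to_8_* masks above
 * rely on the byte-shuffle rule that an index with its high bit set writes
 * zero, so the mask 0x808080800C080400 gathers the low byte of each clamped
 * 32-bit lane into the first four output bytes (the permute steps then
 * collect those 4-byte groups across 128-bit halves). A minimal SSSE3
 * demonstration of the same mask on one 128-bit lane (build with -mssse3):
 */
#include <tmmintrin.h> /* SSSE3 */
#include <stdio.h>

int main(void)
{
    /* four already-clamped 32-bit pixels; only the low byte of each lane matters */
    __m128i px = _mm_set_epi32(40, 30, 20, 10);

    /* 0x80 entries write zero; 0x00, 0x04, 0x08, 0x0C pick the low byte of
     * each 32-bit lane into output bytes 0..3 */
    __m128i mask = _mm_set_epi64x(0x8080808080808080LL, 0x808080800C080400LL);
    __m128i packed = _mm_shuffle_epi8(px, mask);

    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, packed);
    /* prints 10 20 30 40 */
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}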
zero_256); + accum_0123_8 = _mm256_max_epi32(accum_0123_8, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + accum_0123_8 = _mm256_min_epi32(accum_0123_8, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_8 = _mm256_shuffle_epi8(accum_0123_8, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + accum_0123_8 = _mm256_permutevar8x32_epi32(accum_0123_8, perm8_256); + + __m128i accum = _mm256_extracti128_si256(_mm256_or_si256(accum_0123_0, accum_0123_8), 0); + _mm_storeu_si128((__m128i*)(dst + x), accum); + } + for (; x < width_8; x+=8) + { + __m256i src0_0 = _mm256_loadu_si256((__m256i*)(S0 + x)); + __m256i src1_0 = _mm256_loadu_si256((__m256i*)(S1 + x)); + __m256i src2_0 = _mm256_loadu_si256((__m256i*)(S2 + x)); + __m256i src3_0 = _mm256_loadu_si256((__m256i*)(S3 + x)); + + __m256i mul0_0 = _mm256_mullo_epi32(src0_0, coef0_256); + __m256i mul1_0 = _mm256_mullo_epi32(src1_0, coef1_256); + __m256i mul2_0 = _mm256_mullo_epi32(src2_0, coef1_256); + __m256i mul3_0 = _mm256_mullo_epi32(src3_0, coef0_256); + + __m256i accum_01_0 = _mm256_add_epi32(mul0_0, mul1_0); + __m256i accum_23_0 = _mm256_add_epi32(mul2_0, mul3_0); + __m256i accum_0123_0 = _mm256_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm256_add_epi32(accum_0123_0, delta_256); + accum_0123_0 = _mm256_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm256_max_epi32(accum_0123_0, zero_256); + accum_0123_0 = _mm256_min_epi32(accum_0123_0, max_char_256); + + accum_0123_0 = _mm256_shuffle_epi8(accum_0123_0, sh_32_to_8_256); + accum_0123_0 = _mm256_permutevar8x32_epi32(accum_0123_0, perm0_256); + + __m128i accum = _mm256_castsi256_si128(accum_0123_0); + _mm_storel_epi64((__m128i*)(dst + x), accum); + } + for (; x < width_4; x+=4) + { + __m128i src0_0 = _mm_loadu_si128((__m128i*)(S0 + x)); + __m128i src1_0 = _mm_loadu_si128((__m128i*)(S1 + x)); + __m128i src2_0 = _mm_loadu_si128((__m128i*)(S2 + x)); + __m128i src3_0 = _mm_loadu_si128((__m128i*)(S3 + x)); + + __m128i mul0_0 = _mm_mullo_epi32(src0_0, coef0_128); + __m128i mul1_0 = _mm_mullo_epi32(src1_0, coef1_128); + __m128i mul2_0 = _mm_mullo_epi32(src2_0, coef1_128); + __m128i mul3_0 = _mm_mullo_epi32(src3_0, coef0_128); + + __m128i accum_01_0 = _mm_add_epi32(mul0_0, mul1_0); + __m128i accum_23_0 = _mm_add_epi32(mul2_0, mul3_0); + __m128i accum_0123_0 = _mm_add_epi32(accum_01_0, accum_23_0); + + accum_0123_0 = _mm_add_epi32(accum_0123_0, delta_128); + accum_0123_0 = _mm_srai_epi32(accum_0123_0, bits); + + accum_0123_0 = _mm_max_epi32(accum_0123_0, zero_128); + accum_0123_0 = _mm_min_epi32(accum_0123_0, max_char_128); + + accum_0123_0 = _mm_shuffle_epi8(accum_0123_0, sh_32_to_8_128); + _mm_maskstore_epi32((int*)(dst + x), _mm_set_epi32(0, 0, 0, 0x80000000), accum_0123_0); + } + + for (; x < width; x++) + dst[x] = castOp(S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3); +} + +static int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? 
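/*
 * Illustrative aside (not part of the patch): every SIMD width above performs
 * the same per-pixel operation as the scalar tail, i.e. a 4-tap weighted sum
 * of the four intermediate rows followed by rounding, an arithmetic shift by
 * 22 and a clamp to [0, 255]. A scalar sketch of one output pixel, assuming
 * bits == 22 and castOp() == round-shift-clamp as in the SIMD path (64-bit
 * intermediates are used here for clarity; the SIMD path stays in 32 bits):
 */
static unsigned char vresize_one_px_sketch(const int *S0, const int *S1,
                                           const int *S2, const int *S3,
                                           const short *beta, int x)
{
    const int bits = 22;
    long long v = (long long)S0[x] * beta[0] + (long long)S1[x] * beta[1]
                + (long long)S2[x] * beta[2] + (long long)S3[x] * beta[3];
    v = (v + (1LL << (bits - 1))) >> bits;  /* add rounding delta, shift down */
    if (v < 0)   v = 0;                     /* clamp to the 8-bit output range */
    if (v > 255) v = 255;
    return (unsigned char)v;
}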
x : b - 1) : a; +} + +#if OPTIMISED_COEFF +void step_avx512(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax) +#else +void step_avx512(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax) +#endif +{ + int dy, cn = channels; + + int bufstep = (int)((dwidth + 16 - 1) & -16); + int *_buffer = (int *)malloc(bufstep * ksize * sizeof(int)); + if (_buffer == NULL) + { + printf("resizer: malloc fails\n"); + return; + } + const unsigned char *srows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *rows[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int prev_sy[MAX_ESIZE]; + + for (int k = 0; k < ksize; k++) + { + prev_sy[k] = -1; + rows[k] = _buffer + bufstep * k; + } + +#if !OPTIMISED_COEFF + const short *beta = _beta + ksize * start; +#endif + +#if OPTIMISED_COEFF + for (dy = start; dy < end; dy++) + { + int sy0 = dy * 2; +#else + for (dy = start; dy < end; dy++, beta += ksize) + { + int sy0 = yofs[dy]; +#endif + int k0 = ksize, k1 = 0, ksize2 = ksize / 2; + + for (int k = 0; k < ksize; k++) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, iheight); + for (k1 = MAX(k1, k); k1 < ksize; k1++) + { + if (k1 < MAX_ESIZE && sy == prev_sy[k1]) // if the sy-th row has been computed already, reuse it. + { + if (k1 > k) + memcpy(rows[k], rows[k1], bufstep * sizeof(rows[0][0])); + break; + } + } + if (k1 == ksize) + k0 = MIN(k0, k); // remember the first row that needs to be computed + srows[k] = _src + (sy * iwidth); + prev_sy[k] = sy; + } + + + + // regular c +#if OPTIMISED_COEFF + if (k0 < ksize) + { + hresize_avx512((srows + k0), (rows + k0), ksize - k0, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx512((const int **)rows, (_dst + dwidth * dy), _beta, dwidth); +#else + if (k0 < ksize) + { + hresize_avx512((srows + k0), (rows + k0), ksize - k0, xofs, _alpha, + iwidth, dwidth, cn, xmin, xmax); + } + vresize_avx512((const int **)rows, (_dst + dwidth * dy), beta, dwidth); +#endif + } + free(_buffer); +} \ No newline at end of file diff --git a/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h new file mode 100644 index 000000000..b021ad1ce --- /dev/null +++ b/libvmaf/src/feature/third_party/funque/x86/resizer_avx512.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: BSD-3-Clause +* Copyright (C) 2022 Intel Corporation. +*/ +/** + * + * Copyright 2016-2020 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
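/*
 * Illustrative aside (not part of the patch): step_avx512 above caches the
 * horizontally resized rows and, via prev_sy, reuses any row that was already
 * produced for the previous destination row, so hresize_avx512 only runs for
 * rows entering the 4-tap vertical window. A simplified sketch of that reuse
 * decision (hypothetical helper name; the real code carries k1 across
 * iterations and tracks k0 with MIN, which amounts to the same first-missing
 * index). Requires <string.h> for memcpy.
 */
static int first_row_to_compute_sketch(const int *needed_sy, const int *prev_sy,
                                       int *rows[], int bufstep, int ksize)
{
    int k0 = ksize; /* ksize means "everything already cached" */
    for (int k = 0; k < ksize; k++) {
        int reused = 0;
        for (int k1 = k; k1 < ksize; k1++) {
            if (needed_sy[k] == prev_sy[k1]) {   /* row already in the cache */
                if (k1 > k)                       /* move it into slot k */
                    memcpy(rows[k], rows[k1], bufstep * sizeof(int));
                reused = 1;
                break;
            }
        }
        if (!reused && k0 == ksize)
            k0 = k;                               /* first row needing hresize */
    }
    return k0;
}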
+ * + */ + +#include "../resizer.h" + +void vresize_avx512(const int **src, unsigned char *dst, const short *beta, int width); +#if OPTIMISED_COEFF +void step_avx512(const unsigned char *_src, unsigned char *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#else +void step_avx512(const unsigned char *_src, unsigned char *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax); +void hbd_step_avx512(const unsigned short *_src, unsigned short *_dst, const int *xofs, const int *yofs, const short *_alpha, const short *_beta, int iwidth, int iheight, int dwidth, int dheight, int channels, int ksize, int start, int end, int xmin, int xmax, int bitdepth); +#endif \ No newline at end of file diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build index 374c23740..f36591b30 100644 --- a/libvmaf/src/meson.build +++ b/libvmaf/src/meson.build @@ -305,6 +305,17 @@ if is_asm_enabled feature_src_dir + 'x86/vif_avx512.c', ] + if funque_fixed_enabled + x86_avx512_sources += [ + funque_feature_dir + 'x86/integer_funque_filters_avx512.c', + funque_feature_dir + 'x86/resizer_avx512.c', + funque_feature_dir + 'x86/hbd_resizer_avx512.c', + funque_feature_dir + 'x86/integer_funque_ssim_avx512.c', + funque_feature_dir + 'x86/integer_funque_adm_avx512.c', + funque_feature_dir + 'x86/integer_funque_vif_avx512.c', + ] + endif + x86_avx512_static_lib = static_library( 'x86_avx512', x86_avx512_sources,