From 9f64b9ff26ea7338fcade34cccac7e167dbe1f2c Mon Sep 17 00:00:00 2001
From: Johannes Demel <demel@uni-bremen.de>
Date: Sun, 22 Oct 2023 17:14:40 +0200
Subject: [PATCH] kernel: Refactor 32f_s32f_multiply_32f kernel

This kernel should now be easier to read and is free of redundant kernel
definitions. Several small code improvements are included:

1. Move the generic kernel to the top
2. Remove the a_generic kernel; it was just a copy of the generic one
3. Move loop counter declarations into the loops
4. Combine variable declaration and initialization
5. Fix the closing #endif comment of the ORC section (LV_HAVE_ORC)

Tests indicate that the generic kernel is faster in most cases.

Signed-off-by: Johannes Demel <demel@uni-bremen.de>
---
 kernels/volk/volk_32f_s32f_multiply_32f.h | 117 ++++++++--------------
 1 file changed, 40 insertions(+), 77 deletions(-)

diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h
index 28dc14eee..290210c12 100644
--- a/kernels/volk/volk_32f_s32f_multiply_32f.h
+++ b/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -59,6 +59,18 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_GENERIC /* plain-C kernel, no SIMD intrinsics */
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
+                                                      const float* aVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
+{
+    for (unsigned int number = 0; number < num_points; number++) {
+        *cVector++ = (*aVector++) * scalar; // scale one sample, advance both pointers
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 
@@ -67,18 +79,16 @@ static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                     const float scalar,
                                                     unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int quarterPoints = num_points / 4;
 
     float* cPtr = cVector;
     const float* aPtr = aVector;
 
-    __m128 aVal, bVal, cVal;
-    bVal = _mm_set_ps1(scalar);
-    for (; number < quarterPoints; number++) {
-        aVal = _mm_loadu_ps(aPtr);
+    const __m128 bVal = _mm_set_ps1(scalar);
+    for (unsigned int number = 0; number < quarterPoints; number++) {
+        __m128 aVal = _mm_loadu_ps(aPtr);
 
-        cVal = _mm_mul_ps(aVal, bVal);
+        __m128 cVal = _mm_mul_ps(aVal, bVal);
 
         _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
 
@@ -86,8 +96,7 @@ static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
         cPtr += 4;
     }
 
-    number = quarterPoints * 4;
-    for (; number < num_points; number++) {
+    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
         *cPtr++ = (*aPtr++) * scalar;
     }
 }
@@ -101,19 +110,16 @@ static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                     const float scalar,
                                                     unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int eighthPoints = num_points / 8;
 
     float* cPtr = cVector;
     const float* aPtr = aVector;
 
-    __m256 aVal, bVal, cVal;
-    bVal = _mm256_set1_ps(scalar);
-    for (; number < eighthPoints; number++) {
+    const __m256 bVal = _mm256_set1_ps(scalar);
+    for (unsigned int number = 0; number < eighthPoints; number++) {
+        __m256 aVal = _mm256_loadu_ps(aPtr);
 
-        aVal = _mm256_loadu_ps(aPtr);
-
-        cVal = _mm256_mul_ps(aVal, bVal);
+        __m256 cVal = _mm256_mul_ps(aVal, bVal);
 
         _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
 
@@ -121,8 +127,7 @@ static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
         cPtr += 8;
     }
 
-    number = eighthPoints * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
         *cPtr++ = (*aPtr++) * scalar;
     }
 }
@@ -135,22 +140,6 @@ extern void volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
                                                   unsigned int num_points);
 #endif /* LV_HAVE_RISCV64 */
 
-#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
-                                                      const float* aVector,
-                                                      const float scalar,
-                                                      unsigned int num_points)
-{
-    unsigned int number = 0;
-    const float* inputPtr = aVector;
-    float* outputPtr = cVector;
-    for (number = 0; number < num_points; number++) {
-        *outputPtr = (*inputPtr) * scalar;
-        inputPtr++;
-        outputPtr++;
-    }
-}
-#endif /* LV_HAVE_GENERIC */
 
 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
 
@@ -169,18 +158,16 @@ static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                     const float scalar,
                                                     unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int quarterPoints = num_points / 4;
 
     float* cPtr = cVector;
     const float* aPtr = aVector;
 
-    __m128 aVal, bVal, cVal;
-    bVal = _mm_set_ps1(scalar);
-    for (; number < quarterPoints; number++) {
-        aVal = _mm_load_ps(aPtr);
+    const __m128 bVal = _mm_set_ps1(scalar);
+    for (unsigned int number = 0; number < quarterPoints; number++) {
+        __m128 aVal = _mm_load_ps(aPtr);
 
-        cVal = _mm_mul_ps(aVal, bVal);
+        __m128 cVal = _mm_mul_ps(aVal, bVal);
 
         _mm_store_ps(cPtr, cVal); // Store the results back into the C container
 
@@ -188,8 +175,7 @@ static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
         cPtr += 4;
     }
 
-    number = quarterPoints * 4;
-    for (; number < num_points; number++) {
+    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
         *cPtr++ = (*aPtr++) * scalar;
     }
 }
@@ -203,18 +189,16 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                     const float scalar,
                                                     unsigned int num_points)
 {
-    unsigned int number = 0;
     const unsigned int eighthPoints = num_points / 8;
 
     float* cPtr = cVector;
     const float* aPtr = aVector;
 
-    __m256 aVal, bVal, cVal;
-    bVal = _mm256_set1_ps(scalar);
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_load_ps(aPtr);
+    const __m256 bVal = _mm256_set1_ps(scalar);
+    for (unsigned int number = 0; number < eighthPoints; number++) {
+        __m256 aVal = _mm256_load_ps(aPtr);
 
-        cVal = _mm256_mul_ps(aVal, bVal);
+        __m256 cVal = _mm256_mul_ps(aVal, bVal);
 
         _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
 
@@ -222,8 +206,7 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
         cPtr += 8;
     }
 
-    number = eighthPoints * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
         *cPtr++ = (*aPtr++) * scalar;
     }
 }
@@ -237,46 +220,26 @@ static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                      const float scalar,
                                                      unsigned int num_points)
 {
-    unsigned int number = 0;
-    const float* inputPtr = aVector;
-    float* outputPtr = cVector;
     const unsigned int quarterPoints = num_points / 4;
 
-    float32x4_t aVal, cVal;
+    const float* inputPtr = aVector;
+    float* outputPtr = cVector;
 
-    for (number = 0; number < quarterPoints; number++) {
-        aVal = vld1q_f32(inputPtr);       // Load into NEON regs
-        cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
+    for (unsigned int number = 0; number < quarterPoints; number++) {
+        float32x4_t aVal = vld1q_f32(inputPtr);       // Load into NEON regs
+        float32x4_t cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
         vst1q_f32(outputPtr, cVal);       // Store results back to output
         inputPtr += 4;
         outputPtr += 4;
     }
-    for (number = quarterPoints * 4; number < num_points; number++) {
+
+    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
         *outputPtr++ = (*inputPtr++) * scalar;
     }
 }
 #endif /* LV_HAVE_NEON */
 
 
-#ifdef LV_HAVE_GENERIC
-
-static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
-                                                        const float* aVector,
-                                                        const float scalar,
-                                                        unsigned int num_points)
-{
-    unsigned int number = 0;
-    const float* inputPtr = aVector;
-    float* outputPtr = cVector;
-    for (number = 0; number < num_points; number++) {
-        *outputPtr = (*inputPtr) * scalar;
-        inputPtr++;
-        outputPtr++;
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
 #ifdef LV_HAVE_ORC
 
 extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
@@ -292,6 +255,6 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
     volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
 }
 
-#endif /* LV_HAVE_GENERIC */
+#endif /* LV_HAVE_ORC */
 
 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */