From df7fa97a76044b3ad2844a7cbab953469ad36ed1 Mon Sep 17 00:00:00 2001
From: Emmanuele Bassi <ebassi@gnome.org>
Date: Mon, 12 Aug 2024 11:53:36 +0100
Subject: [PATCH 1/2] Add AVX detection

All supported compilers define `__AVX__` when building with the AVX
instruction set enabled.
---
 include/graphene-config.h.meson | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/graphene-config.h.meson b/include/graphene-config.h.meson
index 1600830..cba1741 100644
--- a/include/graphene-config.h.meson
+++ b/include/graphene-config.h.meson
@@ -57,6 +57,7 @@ extern "C" {
 # if defined(GRAPHENE_USE_SSE)
 #  include <xmmintrin.h>
 #  include <emmintrin.h>
+#
 #  if defined(_M_IX86_FP)
 #   if _M_IX86_FP >= 2
 #    define GRAPHENE_USE_SSE4_1
@@ -66,9 +67,18 @@ extern "C" {
 #  elif defined(_MSC_VER)
 #   define GRAPHENE_USE_SSE4_1
 #  endif
+#
+#  if defined(__AVX__)
+#   define GRAPHENE_USE_AVX /* the compiler is targeting AVX (e.g. -mavx, /arch:AVX) */
+#  endif
+#
 #  if defined(GRAPHENE_USE_SSE4_1)
 #   include <smmintrin.h>
 #  endif
+#
+#  if defined(GRAPHENE_USE_AVX)
+#   include <immintrin.h> /* AVX-and-later intrinsics; also declares _mm_fmadd_ps() */
+#  endif
 typedef __m128 graphene_simd4f_t;
 # elif defined(GRAPHENE_USE_ARM_NEON)
 #  if defined (_MSC_VER) && (_MSC_VER < 1920) && defined (_M_ARM64)

From b185f556ae51d937b753249690f0b465236fe0fe Mon Sep 17 00:00:00 2001
From: Emmanuele Bassi <ebassi@gnome.org>
Date: Mon, 12 Aug 2024 11:55:44 +0100
Subject: [PATCH 2/2] Use madd() intrinsic if available

The _mm_fmadd_ps() intrinsic comes from the FMA extension, which builds
on AVX, so we can use it if a suitable instruction set is available when
building Graphene.

There is no functional difference in this commit if AVX is not
available, except that we moved from a generic static inline
implementation to a SIMD-specific one.
---
 include/graphene-simd4f.h | 73 ++++++++++++++++++++++++++++-----------
 src/graphene-simd4f.c     | 28 +++++++++++++++
 2 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h
index 758343d..9029d51 100644
--- a/include/graphene-simd4f.h
+++ b/include/graphene-simd4f.h
@@ -179,6 +179,11 @@ graphene_simd4f_t       graphene_simd4f_ceil            (const graphene_simd4f_t
 GRAPHENE_AVAILABLE_IN_1_12
 graphene_simd4f_t       graphene_simd4f_floor           (const graphene_simd4f_t s);
 
+GRAPHENE_AVAILABLE_IN_1_0
+graphene_simd4f_t       graphene_simd4f_madd            (const graphene_simd4f_t a,
+                                                         const graphene_simd4f_t b,
+                                                         const graphene_simd4f_t c);
+
 #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
 
 /* SSE2 implementation of SIMD 4f */
@@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
   }))
 #  endif
 
+#  if defined(GRAPHENE_USE_AVX) && defined(__FMA__) /* FMA is a separate extension: AVX alone does not provide _mm_fmadd_ps() */
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
+  }))
+#  else /* no FMA: compute a * b + c as a separate multiply and add */
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
+  }))
+#  endif /* GRAPHENE_USE_AVX && __FMA__ */
+
 /* On MSVC, we use static inlines */
 # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */
 
@@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
 #endif
 }
 
+#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+#if defined(GRAPHENE_USE_AVX) && (defined(__FMA__) || defined(__AVX2__)) /* MSVC has no __FMA__ macro; __AVX2__ (/arch:AVX2) is its FMA-capable target */
+  return _mm_fmadd_ps (a, b, c);
+#else /* no FMA: compute a * b + c as a separate multiply and add */
+  return _mm_add_ps (_mm_mul_ps (a, b), c);
+#endif
+}
+
 #else /* SSE intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
@@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
 
 /* ARM Neon implementation of SIMD4f */
@@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif defined _MSC_VER /* Visual Studio ARM */
 
 # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
@@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
   return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
 }
 
+# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
+}
+
 #else /* ARM NEON intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
@@ -1956,6 +2007,8 @@ _simd4f_floor (const graphene_simd4f_t s)
   (graphene_simd4f_ceil ((s)))
 #define graphene_simd4f_floor(s) \
   (graphene_simd4f_floor ((s)))
+#define graphene_simd4f_madd(a,b,c) \
+  (graphene_simd4f_madd ((a), (b), (c)))
 
 #else
 # error "Unsupported simd4f implementation."
@@ -1963,26 +2016,6 @@ _simd4f_floor (const graphene_simd4f_t s)
 
 /* Generic operations, inlined */
 
-/**
- * graphene_simd4f_madd:
- * @m1: a #graphene_simd4f_t
- * @m2: a #graphene_simd4f_t
- * @a: a #graphene_simd4f_t
- *
- * Adds @a to the product of @m1 and @m2.
- *
- * Returns: the result vector
- *
- * Since: 1.0
- */
-static inline graphene_simd4f_t
-graphene_simd4f_madd (const graphene_simd4f_t m1,
-                      const graphene_simd4f_t m2,
-                      const graphene_simd4f_t a)
-{
-  return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
-}
-
 /**
  * graphene_simd4f_sum:
  * @v: a #graphene_simd4f_t
diff --git a/src/graphene-simd4f.c b/src/graphene-simd4f.c
index 00c545b..d9f7e99 100644
--- a/src/graphene-simd4f.c
+++ b/src/graphene-simd4f.c
@@ -1073,6 +1073,26 @@ graphene_simd4f_t
   return graphene_simd4f_floor (s);
 }
 
+/**
+ * graphene_simd4f_madd:
+ * @a: a #graphene_simd4f_t
+ * @b: a #graphene_simd4f_t
+ * @c: a #graphene_simd4f_t
+ *
+ * Adds @c to the product of @a and @b.
+ *
+ * Returns: the result vector
+ *
+ * Since: 1.0
+ */
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_madd (a, b, c);
+}
+
 #else /* GRAPHENE_USE_SCALAR */
 
 graphene_simd4f_t
@@ -1516,4 +1536,12 @@ graphene_simd4f_t
   return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
 }
 
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
+}
+
 #endif /* GRAPHENE_USE_SCALAR */