Skip to content

Commit

Permalink
Use madd() intrinsic if available
Browse files Browse the repository at this point in the history
AVX introduced the _mm_fmadd_ps() intrinsic, so we can use it if AVX (or
an equivalent instruction set) is available when building Graphene.

There is no functional difference in this commit if AVX is not
available, except that we moved from a generic static inline
implementation to a SIMD-specific one.
  • Loading branch information
ebassi committed Aug 12, 2024
1 parent df7fa97 commit ac3f9a2
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 20 deletions.
73 changes: 53 additions & 20 deletions include/graphene-simd4f.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ graphene_simd4f_t graphene_simd4f_ceil (const graphene_simd4f_t
GRAPHENE_AVAILABLE_IN_1_12
graphene_simd4f_t graphene_simd4f_floor (const graphene_simd4f_t s);

/* Out-of-line entry point for the multiply-add operation. The implementation
 * sections of this header also define a function-like macro with the same
 * name, so callers normally go through the macro.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_madd (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c);

#if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)

/* SSE2 implementation of SIMD 4f */
Expand Down Expand Up @@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
}))
# endif

/* graphene_simd4f_madd(a,b,c) evaluates to (a * b) + c.
 *
 * _mm_fmadd_ps() belongs to the FMA instruction-set extension, not to AVX
 * itself: GCC-compatible compilers only allow it when FMA code generation
 * is enabled, and they advertise that through the __FMA__ predefined macro.
 * The fused form is therefore used only when both AVX and FMA are enabled;
 * every other configuration uses a separate multiply and add.
 */
# if defined(GRAPHENE_USE_AVX) && defined(__FMA__)
#  define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
  }))
# else
#  define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
  }))
# endif

/* On MSVC, we use static inlines */
# elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */

Expand Down Expand Up @@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
#endif
}

/* The public madd macro forwards to the static inline helper below. */
#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)

/* Returns (a * b) + c.
 *
 * NOTE(review): _mm_fmadd_ps is an FMA-extension instruction; confirm that
 * GRAPHENE_USE_AVX is only defined for targets that also support FMA.
 */
static inline graphene_simd4f_t
_simd4f_madd (const graphene_simd4f_t a,
              const graphene_simd4f_t b,
              const graphene_simd4f_t c)
{
#if !defined(GRAPHENE_USE_AVX)
  const graphene_simd4f_t product = _mm_mul_ps (a, b);

  return _mm_add_ps (product, c);
#else
  return _mm_fmadd_ps (a, b, c);
#endif
}

#else /* SSE intrinsics-not GCC or Visual Studio */

# error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
Expand Down Expand Up @@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
(graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
}))

/* (a * b) + c, kept as a single expression built from the mul and add macros. */
# define graphene_simd4f_madd(a,b,c) \
  ((graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)))

#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)

/* ARM Neon implementation of SIMD4f */
Expand Down Expand Up @@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
(graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
}))

/* (a * b) + c, kept as a single expression built from the mul and add macros. */
# define graphene_simd4f_madd(a,b,c) \
  ((graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)))

#elif defined _MSC_VER /* Visual Studio ARM */

# define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
Expand Down Expand Up @@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
}

/* As with the other Visual Studio ARM operations, the public macro forwards
 * to a static inline helper. The target must be _simd4f_madd: there is no
 * _simd4d_madd, so the previous spelling failed to compile on first use.
 */
# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)

/* Returns (a * b) + c, computed with graphene_simd4f_mul() followed by
 * graphene_simd4f_add().
 */
static inline graphene_simd4f_t
_simd4f_madd (const graphene_simd4f_t a,
              const graphene_simd4f_t b,
              const graphene_simd4f_t c)
{
  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
}

#else /* ARM NEON intrinsics-not GCC or Visual Studio */

# error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
Expand Down Expand Up @@ -1956,33 +2007,15 @@ _simd4f_floor (const graphene_simd4f_t s)
(graphene_simd4f_ceil ((s)))
#define graphene_simd4f_floor(s) \
(graphene_simd4f_floor ((s)))
/* A macro is not re-expanded inside its own replacement list, so this
 * resolves to a plain call of the graphene_simd4f_madd() function.
 */
#define graphene_simd4f_madd(a,b,c) \
(graphene_simd4f_madd ((a), (b), (c)))

#else
# error "Unsupported simd4f implementation."
#endif

/* Generic operations, inlined */

/**
 * graphene_simd4f_madd:
 * @m1: a #graphene_simd4f_t
 * @m2: a #graphene_simd4f_t
 * @a: a #graphene_simd4f_t
 *
 * Adds @a to the product of @m1 and @m2.
 *
 * The product is computed with graphene_simd4f_mul() and the sum with
 * graphene_simd4f_add().
 *
 * Returns: the result vector
 *
 * Since: 1.0
 */
static inline graphene_simd4f_t
graphene_simd4f_madd (const graphene_simd4f_t m1,
const graphene_simd4f_t m2,
const graphene_simd4f_t a)
{
return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
}

/**
* graphene_simd4f_sum:
* @v: a #graphene_simd4f_t
Expand Down
28 changes: 28 additions & 0 deletions src/graphene-simd4f.c
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,26 @@ graphene_simd4f_t
return graphene_simd4f_floor (s);
}

/**
 * graphene_simd4f_madd:
 * @a: a #graphene_simd4f_t
 * @b: a #graphene_simd4f_t
 * @c: a #graphene_simd4f_t
 *
 * Adds @c to the product of @a and @b.
 *
 * Returns: the result vector
 *
 * Since: 1.0
 */
/* The parentheses around the function name stop a function-like macro of
 * the same name from expanding in the definition; the unparenthesised call
 * in the body does go through that macro when one is defined.
 */
graphene_simd4f_t
(graphene_simd4f_madd) (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c)
{
return graphene_simd4f_madd (a, b, c);
}

#else /* GRAPHENE_USE_SCALAR */

graphene_simd4f_t
Expand Down Expand Up @@ -1516,4 +1536,12 @@ graphene_simd4f_t
return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
}

graphene_simd4f_t
(graphene_simd4f_madd) (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c)
{
return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
}

#endif /* GRAPHENE_USE_SCALAR */

0 comments on commit ac3f9a2

Please sign in to comment.