From 728a3ce5cc8b0652ab0acc0083f4e989eed4d932 Mon Sep 17 00:00:00 2001 From: Cameron Cawley Date: Mon, 6 Apr 2020 00:10:25 +0100 Subject: [PATCH] [librpcpu] Improve ifunc dispatch functions --- src/gtk/GdkImageConv.hpp | 47 +++-- src/libromdata/utils/SuperMagicDrive.hpp | 84 ++++---- src/librpcpu/byteswap.h | 5 +- src/librpcpu/cpu_dispatch.h | 14 -- src/librptexture/decoder/ImageDecoder.hpp | 227 +++++++++++----------- 5 files changed, 177 insertions(+), 200 deletions(-) diff --git a/src/gtk/GdkImageConv.hpp b/src/gtk/GdkImageConv.hpp index 4e6463af6..2ff3e333d 100644 --- a/src/gtk/GdkImageConv.hpp +++ b/src/gtk/GdkImageConv.hpp @@ -21,6 +21,9 @@ namespace LibRpTexture { #if defined(RP_CPU_I386) || defined(RP_CPU_AMD64) # include "librpcpu/cpuflags_x86.h" +# ifdef RP_HAS_IFUNC +# define GDKIMAGECONV_HAS_IFUNC 1 +# endif # define GDKIMAGECONV_HAS_SSSE3 1 #endif @@ -51,36 +54,38 @@ class GdkImageConv static GdkPixbuf *rp_image_to_GdkPixbuf_ssse3(const LibRpTexture::rp_image *img); #endif /* GDKIMAGECONV_HAS_SSSE3 */ +#ifdef GDKIMAGECONV_HAS_IFUNC + /* System has IFUNC. Use it for dispatching. */ + /** * Convert an rp_image to GdkPixbuf. * @param img [in] rp_image. * @return GdkPixbuf, or nullptr on error. */ - static IFUNC_INLINE GdkPixbuf *rp_image_to_GdkPixbuf(const LibRpTexture::rp_image *img); -}; - -#if !defined(RP_HAS_IFUNC) || (!defined(RP_CPU_I386) && !defined(RP_CPU_AMD64)) + static GdkPixbuf *rp_image_to_GdkPixbuf(const LibRpTexture::rp_image *img); -// System does not support IFUNC, or we don't have optimizations for these CPUs. -// Use standard inline dispatch. +#else + // System does not support IFUNC, or we don't have optimizations for these CPUs. + // Use standard inline dispatch. -/** - * Convert an rp_image to GdkPixbuf. - * @param img rp_image. - * @return GdkPixbuf, or nullptr on error. - */ -inline GdkPixbuf *GdkImageConv::rp_image_to_GdkPixbuf(const LibRpTexture::rp_image *img) -{ + /** + * Convert an rp_image to GdkPixbuf.
+ * @param img [in] rp_image. + * @return GdkPixbuf, or nullptr on error. + */ + static inline GdkPixbuf *rp_image_to_GdkPixbuf(const LibRpTexture::rp_image *img) + { #ifdef GDKIMAGECONV_HAS_SSSE3 - if (RP_CPU_HasSSSE3()) { - return rp_image_to_GdkPixbuf_ssse3(img); - } else + if (RP_CPU_HasSSSE3()) { + return rp_image_to_GdkPixbuf_ssse3(img); + } else #endif /* GDKIMAGECONV_HAS_SSSE3 */ - { - return rp_image_to_GdkPixbuf_cpp(img); - } -} + { + return rp_image_to_GdkPixbuf_cpp(img); + } + } -#endif /* !defined(RP_HAS_IFUNC) || (!defined(RP_CPU_I386) && !defined(RP_CPU_AMD64)) */ +#endif +}; #endif /* __ROMPROPERTIES_GTK_GDKIMAGECONV_HPP__ */ diff --git a/src/libromdata/utils/SuperMagicDrive.hpp b/src/libromdata/utils/SuperMagicDrive.hpp index 6bd988504..311ea6373 100644 --- a/src/libromdata/utils/SuperMagicDrive.hpp +++ b/src/libromdata/utils/SuperMagicDrive.hpp @@ -27,6 +27,10 @@ #endif #ifdef RP_CPU_AMD64 # define SMD_ALWAYS_HAS_SSE2 1 +#else +# if defined(RP_HAS_IFUNC) && defined(RP_CPU_I386) +# define SMD_HAS_IFUNC 1 +# endif #endif namespace LibRomData { @@ -77,70 +81,54 @@ class SuperMagicDrive // SMD block size. static const unsigned int SMD_BLOCK_SIZE = 16384; + // TODO: Use gcc target-specific function attributes if available? + // (IFUNC dispatcher, etc.) + +#ifdef SMD_HAS_IFUNC + /* System has IFUNC. Use it for dispatching. */ + /** * Decode a Super Magic Drive interleaved block. * NOTE: Pointers must be 16-byte aligned if using SSE2. * @param pDest [out] Destination block. (Must be 16 KB.) * @param pSrc [in] Source block. (Must be 16 KB.) */ - static IFUNC_SSE2_INLINE void decodeBlock(uint8_t *RESTRICT pDest, const uint8_t *RESTRICT pSrc); -}; - -// TODO: Use gcc target-specific function attributes if available? -// (IFUNC dispatcher, etc.) - -/** Dispatch functions. **/ - -#if defined(RP_HAS_IFUNC) && defined(SMD_ALWAYS_HAS_SSE2) - -// System does support IFUNC, but it's always guaranteed to have SSE2. -// Eliminate the IFUNC dispatch on this system.
- -/** - * Decode a Super Magic Drive interleaved block. - * NOTE: Pointers must be 16-byte aligned if using SSE2. - * @param dest [out] Destination block. (Must be 16 KB.) - * @param src [in] Source block. (Must be 16 KB.) - */ -inline void SuperMagicDrive::decodeBlock(uint8_t *RESTRICT pDest, const uint8_t *RESTRICT pSrc) -{ - // amd64 always has SSE2. - decodeBlock_sse2(pDest, pSrc); -} - -#endif /* defined(RP_HAS_IFUNC) && defined(SMD_ALWAYS_HAS_SSE2) */ + static void decodeBlock(uint8_t *RESTRICT pDest, const uint8_t *RESTRICT pSrc); -#if !defined(RP_HAS_IFUNC) || (!defined(RP_CPU_I386) && !defined(RP_CPU_AMD64)) +#else + // System does not support IFUNC, or we don't have optimizations for these CPUs. + // Use standard inline dispatch. -/** - * Decode a Super Magic Drive interleaved block. - * NOTE: Pointers must be 16-byte aligned if using SSE2. - * @param dest [out] Destination block. (Must be 16 KB.) - * @param src [in] Source block. (Must be 16 KB.) - */ -inline void SuperMagicDrive::decodeBlock(uint8_t *RESTRICT pDest, const uint8_t *RESTRICT pSrc) -{ + /** + * Decode a Super Magic Drive interleaved block. + * NOTE: Pointers must be 16-byte aligned if using SSE2. + * @param pDest [out] Destination block. (Must be 16 KB.) + * @param pSrc [in] Source block. (Must be 16 KB.) + */ + static inline void decodeBlock(uint8_t *RESTRICT pDest, const uint8_t *RESTRICT pSrc) + { #ifdef SMD_ALWAYS_HAS_SSE2 - // amd64 always has SSE2. - decodeBlock_sse2(pDest, pSrc); + // amd64 always has SSE2.
+ decodeBlock_sse2(pDest, pSrc); #else /* SMD_ALWAYS_HAS_SSE2 */ # ifdef SMD_HAS_SSE2 - if (RP_CPU_HasSSE2()) { - decodeBlock_sse2(pDest, pSrc); - } else + if (RP_CPU_HasSSE2()) { + decodeBlock_sse2(pDest, pSrc); + } else # endif /* SMD_HAS_SSE2 */ # ifdef SMD_HAS_MMX - if (RP_CPU_HasMMX()) { - decodeBlock_mmx(pDest, pSrc); - } else + if (RP_CPU_HasMMX()) { + decodeBlock_mmx(pDest, pSrc); + } else #endif /* SMD_HAS_MMX */ - { - decodeBlock_cpp(pDest, pSrc); - } + { + decodeBlock_cpp(pDest, pSrc); + } #endif /* SMD_ALWAYS_HAS_SSE2 */ -} + } -#endif /* !defined(RP_HAS_IFUNC) || (!defined(RP_CPU_I386) && !defined(RP_CPU_AMD64)) */ +#endif +}; } diff --git a/src/librpcpu/byteswap.h b/src/librpcpu/byteswap.h index a51c018d2..d600ec796 100644 --- a/src/librpcpu/byteswap.h +++ b/src/librpcpu/byteswap.h @@ -21,6 +21,9 @@ #if defined(RP_CPU_I386) || defined(RP_CPU_AMD64) # include "cpuflags_x86.h" +# ifdef RP_HAS_IFUNC +# define BYTESWAP_HAS_IFUNC 1 +# endif /* MSVC does not support MMX intrinsics in 64-bit builds. */ /* Reference: https://msdn.microsoft.com/en-us/library/08x3t697(v=vs.110).aspx */ /* In addition, amd64 CPUs all support SSE2 as a minimum, */ @@ -186,7 +189,7 @@ void __byte_swap_16_array_ssse3(uint16_t *ptr, size_t n); void __byte_swap_32_array_ssse3(uint32_t *ptr, size_t n); #endif /* BYTESWAP_HAS_SSSE3 */ -#if defined(RP_HAS_IFUNC) +#ifdef BYTESWAP_HAS_IFUNC /* System has IFUNC. Use it for dispatching. */ /** diff --git a/src/librpcpu/cpu_dispatch.h b/src/librpcpu/cpu_dispatch.h index 97e90ffbf..70016620f 100644 --- a/src/librpcpu/cpu_dispatch.h +++ b/src/librpcpu/cpu_dispatch.h @@ -65,23 +65,9 @@ #endif // IFUNC attribute. -// - IFUNC_SSE2_INLINE: inline if CPU always has SSE2. 
#ifdef RP_HAS_IFUNC -# define IFUNC_INLINE -# define IFUNC_STATIC_INLINE -# ifdef RP_CPU_AMD64 -# define IFUNC_SSE2_INLINE inline -# define IFUNC_SSE2_STATIC_INLINE static inline -# else -# define IFUNC_SSE2_INLINE -# define IFUNC_SSE2_STATIC_INLINE -# endif # define IFUNC_ATTR(func) __attribute__((ifunc(#func))) #else -# define IFUNC_INLINE inline -# define IFUNC_STATIC_INLINE static inline -# define IFUNC_SSE2_INLINE inline -# define IFUNC_SSE2_STATIC_INLINE static inline # define IFUNC_ATTR(func) #endif diff --git a/src/librptexture/decoder/ImageDecoder.hpp b/src/librptexture/decoder/ImageDecoder.hpp index 618573dce..a08ea623b 100644 --- a/src/librptexture/decoder/ImageDecoder.hpp +++ b/src/librptexture/decoder/ImageDecoder.hpp @@ -19,6 +19,9 @@ #if defined(RP_CPU_I386) || defined(RP_CPU_AMD64) # include "librpcpu/cpuflags_x86.h" +# ifdef RP_HAS_IFUNC +# define IMAGEDECODER_HAS_IFUNC 1 +# endif # define IMAGEDECODER_HAS_SSE2 1 # define IMAGEDECODER_HAS_SSSE3 1 #endif @@ -227,6 +230,9 @@ rp_image *fromLinear16_sse2(PixelFormat px_format, const uint16_t *RESTRICT img_buf, int img_siz, int stride = 0); #endif /* IMAGEDECODER_HAS_SSE2 */ +#if defined(IMAGEDECODER_HAS_IFUNC) && !defined(IMAGEDECODER_ALWAYS_HAS_SSE2) +/* System has IFUNC. Use it for dispatching. */ + /** * Convert a linear 16-bit RGB image to rp_image. * @param px_format [in] 16-bit pixel format. @@ -237,10 +243,46 @@ rp_image *fromLinear16_sse2(PixelFormat px_format, * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. * @return rp_image, or nullptr on error. */ -IFUNC_SSE2_STATIC_INLINE rp_image *fromLinear16(PixelFormat px_format, +rp_image *fromLinear16(PixelFormat px_format, int width, int height, const uint16_t *RESTRICT img_buf, int img_siz, int stride = 0); +#else + +// System does not support IFUNC, or we aren't guaranteed to have +// optimizations for these CPUs. Use standard inline dispatch. + +/** + * Convert a linear 16-bit RGB image to rp_image. 
+ * @param px_format [in] 16-bit pixel format. + * @param width [in] Image width. + * @param height [in] Image height. + * @param img_buf [in] Image buffer. + * @param img_siz [in] Size of image data. [must be >= (w*h)*2] + * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. + * @return rp_image, or nullptr on error. + */ +static inline rp_image *fromLinear16(PixelFormat px_format, + int width, int height, + const uint16_t *RESTRICT img_buf, int img_siz, int stride = 0) +{ +#ifdef IMAGEDECODER_ALWAYS_HAS_SSE2 + // amd64 always has SSE2. + return fromLinear16_sse2(px_format, width, height, img_buf, img_siz, stride); +#else /* !IMAGEDECODER_ALWAYS_HAS_SSE2 */ +# ifdef IMAGEDECODER_HAS_SSE2 + if (RP_CPU_HasSSE2()) { + return fromLinear16_sse2(px_format, width, height, img_buf, img_siz, stride); + } else +# endif /* IMAGEDECODER_HAS_SSE2 */ + { + return fromLinear16_cpp(px_format, width, height, img_buf, img_siz, stride); + } +#endif /* IMAGEDECODER_ALWAYS_HAS_SSE2 */ +} + +#endif + /** 24-bit **/ /** @@ -275,6 +317,9 @@ rp_image *fromLinear24_ssse3(PixelFormat px_format, const uint8_t *RESTRICT img_buf, int img_siz, int stride = 0); #endif /* IMAGEDECODER_HAS_SSSE3 */ +#ifdef IMAGEDECODER_HAS_IFUNC +/* System has IFUNC. Use it for dispatching. */ + /** * Convert a linear 24-bit RGB image to rp_image. * @param px_format [in] 24-bit pixel format. @@ -285,10 +330,40 @@ rp_image *fromLinear24_ssse3(PixelFormat px_format, * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. * @return rp_image, or nullptr on error. */ -IFUNC_STATIC_INLINE rp_image *fromLinear24(PixelFormat px_format, +rp_image *fromLinear24(PixelFormat px_format, int width, int height, const uint8_t *RESTRICT img_buf, int img_siz, int stride = 0); +#else +// System does not support IFUNC, or we aren't guaranteed to have +// optimizations for these CPUs. Use standard inline dispatch. + +/** + * Convert a linear 24-bit RGB image to rp_image.
+ * @param px_format [in] 24-bit pixel format. + * @param width [in] Image width. + * @param height [in] Image height. + * @param img_buf [in] Image buffer. (must be byte-addressable) + * @param img_siz [in] Size of image data. [must be >= (w*h)*3] + * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. + * @return rp_image, or nullptr on error. + */ +static inline rp_image *fromLinear24(PixelFormat px_format, + int width, int height, + const uint8_t *RESTRICT img_buf, int img_siz, int stride = 0) +{ +#ifdef IMAGEDECODER_HAS_SSSE3 + if (RP_CPU_HasSSSE3()) { + return fromLinear24_ssse3(px_format, width, height, img_buf, img_siz, stride); + } else +#endif /* IMAGEDECODER_HAS_SSSE3 */ + { + return fromLinear24_cpp(px_format, width, height, img_buf, img_siz, stride); + } +} + +#endif + /** 32-bit **/ /** @@ -323,6 +398,9 @@ rp_image *fromLinear32_ssse3(PixelFormat px_format, const uint32_t *RESTRICT img_buf, int img_siz, int stride = 0); #endif /* IMAGEDECODER_HAS_SSSE3 */ +#ifdef IMAGEDECODER_HAS_IFUNC +/* System has IFUNC. Use it for dispatching. */ + /** * Convert a linear 32-bit RGB image to rp_image. * @param px_format [in] 32-bit pixel format. @@ -333,10 +411,40 @@ rp_image *fromLinear32_ssse3(PixelFormat px_format, * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. * @return rp_image, or nullptr on error. */ -IFUNC_STATIC_INLINE rp_image *fromLinear32(PixelFormat px_format, +rp_image *fromLinear32(PixelFormat px_format, int width, int height, const uint32_t *RESTRICT img_buf, int img_siz, int stride = 0); +#else +// System does not support IFUNC, or we aren't guaranteed to have +// optimizations for these CPUs. Use standard inline dispatch. + +/** + * Convert a linear 32-bit RGB image to rp_image. + * @param px_format [in] 32-bit pixel format. + * @param width [in] Image width. + * @param height [in] Image height. + * @param img_buf [in] 32-bit image buffer. + * @param img_siz [in] Size of image data. 
[must be >= (w*h)*4] + * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. + * @return rp_image, or nullptr on error. + */ +static inline rp_image *fromLinear32(PixelFormat px_format, + int width, int height, + const uint32_t *RESTRICT img_buf, int img_siz, int stride = 0) +{ +#ifdef IMAGEDECODER_HAS_SSSE3 + if (RP_CPU_HasSSSE3()) { + return fromLinear32_ssse3(px_format, width, height, img_buf, img_siz, stride); + } else +#endif /* IMAGEDECODER_HAS_SSSE3 */ + { + return fromLinear32_cpp(px_format, width, height, img_buf, img_siz, stride); + } +} + +#endif + /** GameCube **/ /** @@ -724,119 +832,6 @@ rp_image *fromPVRTCII(int width, int height, rp_image *fromBC7(int width, int height, const uint8_t *img_buf, int img_siz); -/************************* - ** Dispatch functions. ** - *************************/ - -#if defined(RP_HAS_IFUNC) && defined(IMAGEDECODER_ALWAYS_HAS_SSE2) - -// System does support IFUNC, but it's always guaranteed to have SSE2. -// Eliminate the IFUNC dispatch on this system.
- -/** - * Convert a linear 16-bit RGB image to rp_image. - * @param px_format [in] 16-bit pixel format. - * @param width [in] Image width. - * @param height [in] Image height. - * @param img_buf [in] Image buffer. - * @param img_siz [in] Size of image data. [must be >= (w*h)*3] - * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. - * @return rp_image, or nullptr on error. - */ -static inline rp_image *fromLinear16(PixelFormat px_format, - int width, int height, - const uint16_t *RESTRICT img_buf, int img_siz, int stride) -{ -#ifdef IMAGEDECODER_ALWAYS_HAS_SSE2 - // amd64 always has SSE2. - return fromLinear16_sse2(px_format, width, height, img_buf, img_siz, stride); -#else /* !IMAGEDECODER_ALWAYS_HAS_SSE2 */ -# ifdef IMAGEDECODER_HAS_SSE2 - if (RP_CPU_HasSSE2()) { - return fromLinear16_sse2(px_format, width, height, img_buf, img_siz, stride); - } else -# endif /* IMAGEDECODER_HAS_SSE2 */ - { - return fromLinear16_cpp(px_format, width, height, img_buf, img_siz, stride); - } -#endif /* IMAGEDECODER_ALWAYS_HAS_SSE2 */ -} - -/** - * Convert a linear 24-bit RGB image to rp_image. - * @param px_format [in] 24-bit pixel format. - * @param width [in] Image width. - * @param height [in] Image height. - * @param img_buf [in] Image buffer. (must be byte-addressable) - * @param img_siz [in] Size of image data. [must be >= (w*h)*3] - * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. - * @return rp_image, or nullptr on error. - */ -static inline rp_image *fromLinear24(PixelFormat px_format, - int width, int height, - const uint8_t *RESTRICT img_buf, int img_siz, int stride) -{ -#ifdef IMAGEDECODER_HAS_SSSE3 - if (RP_CPU_HasSSSE3()) { - return fromLinear24_ssse3(px_format, width, height, img_buf, img_siz, stride); - } else -#endif /* IMAGEDECODER_HAS_SSSE3 */ - { - return fromLinear24_cpp(px_format, width, height, img_buf, img_siz, stride); - } -} - -/** - * Convert a linear 32-bit RGB image to rp_image. 
- * @param px_format [in] 32-bit pixel format. - * @param width [in] Image width. - * @param height [in] Image height. - * @param img_buf [in] 32-bit image buffer. - * @param img_siz [in] Size of image data. [must be >= (w*h)*2] - * @param stride [in,opt] Stride, in bytes. If 0, assumes width*bytespp. - * @return rp_image, or nullptr on error. - */ -static rp_image *fromLinear32(PixelFormat px_format, - int width, int height, - const uint32_t *RESTRICT img_buf, int img_siz, int stride) -{ -#ifdef IMAGEDECODER_HAS_SSSE3 - if (RP_CPU_HasSSSE3()) { - return fromLinear32_ssse3(px_format, width, height, img_buf, img_siz, stride); - } else -#endif /* IMAGEDECODER_HAS_SSSE3 */ - { - return fromLinear32_cpp(px_format, width, height, img_buf, img_siz, stride); - } -} - -#endif /* !defined(RP_HAS_IFUNC) || (!defined(RP_CPU_I386) && !defined(RP_CPU_AMD64)) */ - } } #endif /* __ROMPROPERTIES_LIBRPTEXTURE_DECODER_IMAGEDECODER_HPP__ */