Skip to content

Commit

Permalink
Merge 'aocl-fftw-3.0/amd-staging-milan-3.0' into amd-fftw
Browse files Browse the repository at this point in the history
  • Loading branch information
BiplabRaut committed Mar 15, 2021
2 parents 2a05028 + 7033737 commit 94199c3
Show file tree
Hide file tree
Showing 16 changed files with 560 additions and 29 deletions.
10 changes: 9 additions & 1 deletion README_AMD.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan),
improved 256-bit kernels selection by Planner and an optional in-place
transpose for large problem sizes. AMD Optimized FFTW improves the performance
of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI
transpose function.
transpose function. As of AMD FFTW 3.0, a new fast planner is added as an
extension to the original planner that improves planning time of various
planning modes in general and PATIENT mode in particular.

FFTW is a free collection of fast C routines for computing the
Discrete Fourier Transform and various special cases thereof in one or more
Expand Down Expand Up @@ -51,6 +53,12 @@ configure option "--enable-generic-simd128" or "--enable-generic-simd256".
The optional configure option "--enable-amd-mpifft" enables the MPI FFT
related optimizations.

The new fast planner can be enabled using the optional configure option
"--enable-amd-fast-planner". It is supported for single and double precision only.

An optional configure option "AMD_ARCH" is also supported; it can be set to a CPU
architecture value such as "auto", "znver1" or "znver2" for AMD EPYC processors.

An optional configure option "--enable-amd-trans" is provided that may improve
the performance of transpose operations for very large FFT problem sizes.
It is disabled by default and is provided as an experimental switch.
Expand Down
7 changes: 7 additions & 0 deletions api/configure.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
* Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
Expand All @@ -25,7 +26,13 @@

/*
 * Apply the standard DFT, RDFT and REODFT configurations to a planner.
 *
 * All three conf_standard routines are always invoked on plnr; only the
 * order of the calls depends on the build.  When AMD_FAST_PLANNER is
 * defined the order is reodft, rdft, dft -- the exact reverse of the
 * stock order.
 *
 * NOTE(review): presumably the call order determines the order in which
 * solvers get registered with the planner -- confirm against the
 * conf_standard implementations.
 * NOTE(review): the configure script defines AMD_OPT_FAST_PLANNER; the
 * mapping to AMD_FAST_PLANNER is not visible here -- verify that it exists.
 */
void X(configure_planner)(planner *plnr)
{
#if !defined(AMD_FAST_PLANNER)
     /* stock order: dft, rdft, reodft */
     X(dft_conf_standard)(plnr);
     X(rdft_conf_standard)(plnr);
     X(reodft_conf_standard)(plnr);
#else
     /* AMD fast planner: reodft, rdft, dft */
     X(reodft_conf_standard)(plnr);
     X(rdft_conf_standard)(plnr);
     X(dft_conf_standard)(plnr);
#endif
}
4 changes: 3 additions & 1 deletion api/fftw3.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
* Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
*
* The following statement of license applies *only* to this header file,
* and *not* to the other files distributed with FFTW or derived therefrom:
Expand Down Expand Up @@ -448,6 +449,7 @@ FFTW_EXTERN int \
FFTW_CDECL X(alignment_of)(R *p); \
\
FFTW_EXTERN const char X(version)[]; \
FFTW_EXTERN const char X(aoclversion)[]; \
FFTW_EXTERN const char X(cc)[]; \
FFTW_EXTERN const char X(codelet_optim)[];

Expand All @@ -460,7 +462,7 @@ FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)

/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || (__clang__ && __clang_major__ >= 10)) \
&& !(defined(__ICC) || defined(__INTEL_COMPILER) || defined(__CUDACC__) || defined(__PGI)) \
&& (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
Expand Down
3 changes: 3 additions & 0 deletions api/version.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
* Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -28,6 +29,8 @@ const char X(cc)[] = FFTW_CC;
of the ABI */
const char X(codelet_optim)[] = "";

/* AOCL release identifier string.  AOCL_FFTW_VERSION is a build-time macro
   (declared in config.h.in and emitted by configure). */
const char X(aoclversion)[] = AOCL_FFTW_VERSION;

const char X(version)[] = PACKAGE "-" PACKAGE_VERSION

#if HAVE_FMA
Expand Down
6 changes: 6 additions & 0 deletions config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,18 @@
/* Define to enable AMD cpu specific optimizations. */
#undef AMD_OPT_ALL

/* Define to enable AMD Fast Planner for AMD cpus. */
#undef AMD_OPT_FAST_PLANNER

/* Define to enable AMD cpu optimized MPI FFT. */
#undef AMD_OPT_MPIFFT

/* Define to enable AMD cpu optimized Transpose. */
#undef AMD_OPT_TRANS

/* AOCL Version of AMD-FFTW */
#undef AOCL_FFTW_VERSION

/* Define if the machine architecture "naturally" prefers fused multiply-add
instructions */
#undef ARCH_PREFERS_FMA
Expand Down
129 changes: 119 additions & 10 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,7 @@ enable_mpi
enable_amd_opt
enable_amd_trans
enable_amd_mpifft
enable_amd_fast_planner
enable_fortran
with_g77_wrappers
enable_openmp
Expand Down Expand Up @@ -1586,6 +1587,9 @@ Optional Features:
--enable-amd-opt enable AMD cpu specific optimizations
--enable-amd-trans enable AMD cpu optimized Transpose
--enable-amd-mpifft enable AMD cpu optimized MPI FFT
--enable-amd-fast-planner
enable AMD Fast Planner for a faster planning time
on AMD cpus
--disable-fortran don't include Fortran-callable wrappers
--enable-openmp use OpenMP directives for parallelism
--enable-threads compile FFTW SMP threads library
Expand Down Expand Up @@ -3676,7 +3680,9 @@ else
ok=no
fi

long_double_supported=no;
if test "$ok" = "yes"; then
long_double_supported=yes;
if test "$PRECISION" = "s"; then
as_fn_error $? "--enable-single/--enable-long-double conflict" "$LINENO" 5
fi
Expand Down Expand Up @@ -3704,7 +3710,9 @@ else
ok=no
fi

quad_precision_supported=no;
if test "$ok" = "yes"; then
quad_precision_supported=yes;
if test "$PRECISION" != "d"; then
as_fn_error $? "conflicting precisions specified" "$LINENO" 5
fi
Expand Down Expand Up @@ -17039,11 +17047,40 @@ fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_4_6_0" >&5
$as_echo "$ax_cv_gcc_4_6_0" >&6; }
if test "$ax_cv_gcc_4_6_0" = yes; then
:
gcc_supported=1
else
as_fn_error $? "gcc 4.6 or later required for quad precision support" "$LINENO" 5
gcc_supported=0
fi

# Probe for clang 10 or later.  The test program written below hits an
# #error unless __clang__ is defined and __clang_major__ is at least 10,
# so a successful compile sets clang_supported=1 and a failure sets it to 0.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using clang 10 or later" >&5
$as_echo_n "checking whether we are using clang 10 or later... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

#ifdef __clang__
#if __clang_major__ < 10
#error Clang 10 or later is recommended for Quad precision
#endif
#else
#error Other compiler unsupported
#endif

_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
clang_supported=1
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
clang_supported=0
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
# Abort configuration unless at least one compiler probe passed.
# gcc_supported is set by the gcc 4.6 check that precedes this probe.
if test "$gcc_supported" = "1" || test "$clang_supported" = "1"; then
:
else
as_fn_error $? "gcc 4.6 or later OR clang 10 or later required for quad precision support" "$LINENO" 5
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for sinq in -lquadmath" >&5
$as_echo_n "checking for sinq in -lquadmath... " >&6; }
if ${ac_cv_lib_quadmath_sinq+:} false; then :
Expand Down Expand Up @@ -17448,17 +17485,74 @@ else
have_amd_opt=no
fi

if test "$have_amd_opt" = yes; then
if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then
AMDZENFAMILY=$(expr `cat /proc/cpuinfo | grep -m1 family|cut -f2 -d:`)
AMDZENMODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:`)
if test "$ac_test_CFLAGS" != "set"; then
if test "$CC" = "clang"; then
if [ "$AMD_ARCH" = "auto" ]; then
AMD_ARCH=""
fi
SUBSTRCLANG='clang'
SUBSTRGCC='gcc'
if grep -q "$SUBSTRCLANG" <<<"$CC"; then
CFLAGS="$CFLAGS -mavx2 -mfma"
else
GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9)
AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48)
if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then
CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
elif grep -q "$SUBSTRGCC" <<<"$CC"; then
GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`)
case "$AMDZENFAMILY" in
"23")
if [ -z "${AMD_ARCH}" ]; then
if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -ge "48" ]; then
CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
else
CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
fi
else
CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
fi
;;
"25")
if [ -z "${AMD_ARCH}" ]; then
if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -le "15" ]; then
CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
elif [ "$GCCVERSION" -lt "9" ] && [ "$AMDZENMODEL" -le "15" ]; then
CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
else
CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
fi
else
CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
fi
;;
*)
if [ -z "${AMD_ARCH}" ]; then
CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
else
CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
fi
;;
esac
fi
else
if [ -n "${AMD_ARCH}" ]; then
if [ "$AMD_ARCH" = "auto" ]; then
case "$AMDZENFAMILY" in
"23")
if [ "$AMDZENMODEL" -ge "48" ]; then
CFLAGS="$CFLAGS -march=znver2"
else
CFLAGS="$CFLAGS -march=znver1"
fi
;;
"25")
if [ "$AMDZENMODEL" -le "15" ]; then
CFLAGS="$CFLAGS -march=znver2"
else
CFLAGS="$CFLAGS -mavx2"
fi
;;
esac
else
CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
CFLAGS="$CFLAGS -march=$AMD_ARCH"
fi
fi
fi
Expand Down Expand Up @@ -17491,6 +17585,21 @@ $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h

fi

# Unconditionally record the AOCL release string in confdefs.h.
$as_echo "#define AOCL_FFTW_VERSION \"AOCL-3.0\"" >>confdefs.h

# Check whether --enable-amd-fast-planner was given.
# have_amd_fast_planner takes the option's value, or "no" when it is absent.
if test "${enable_amd_fast_planner+set}" = set; then :
enableval=$enable_amd_fast_planner; have_amd_fast_planner=$enableval
else
have_amd_fast_planner=no
fi

# AMD_OPT_FAST_PLANNER is defined only when the option value is "yes" and
# both quad_precision_supported and long_double_supported are "no".
# Otherwise the request is dropped here without any message.
# NOTE(review): those two variables appear to be set to "yes" when the
# corresponding precision is selected earlier in this script -- confirm.
if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then

$as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h

fi

if test "$USE_MAINTAINER_MODE" = yes; then
# Extract the first word of "indent", so it can be a program name with args.
set dummy indent; ac_word=$2
Expand Down
Loading

0 comments on commit 94199c3

Please sign in to comment.