Merge 'aocl-fftw-3.0/amd-staging-milan-3.0' into amd-fftw

amd · Mar 15, 2021 · 94199c3 · 94199c3
2 parents 2a05028 + 7033737
commit 94199c3
Show file tree

Hide file tree

Showing 16 changed files with 560 additions and 29 deletions.
diff --git a/README_AMD.md b/README_AMD.md
@@ -12,7 +12,9 @@ functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan),
 improved 256-bit kernels selection by Planner and an optional in-place 
 transpose for large problem sizes. AMD Optimized FFTW improves the performance
 of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI
-transpose function.
+transpose function. As of AMD FFTW 3.0, a new fast planner is added as an
+extension to the original planner that improves planning time of various
+planning modes in general and PATIENT mode in particular.
 
 FFTW is a free collection of fast C routines for computing the
 Discrete Fourier Transform and various special cases thereof in one or more
@@ -51,6 +53,12 @@ configure option "--enable-generic-simd128" or "--enable-generic-simd256".
 The optional configure option "--enable-amd-mpifft" enables the MPI FFT
 related optimizations.
 
+The new fast planner can be enabled using the optional configure option
+"--enable-amd-fast-planner". It is supported for single and double precision.
+
+An optional configure option "AMD_ARCH" is also supported; it can be set to a CPU
+architecture value such as "auto", "znver1" or "znver2" for AMD EPYC processors.
+
 An optional configure option "--enable-amd-trans" is provided that may benefit
 the performance of transpose operations in case of very large FFT problem sizes.
 This is by default not enabled and provided as an experimental optional switch. 

diff --git a/api/configure.c b/api/configure.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,7 +26,13 @@
 
 void X(configure_planner)(planner *plnr)
 {
+#ifdef AMD_FAST_PLANNER /* fast-planner build: call the reodft, rdft, dft standard configs in that order */
+     X(reodft_conf_standard)(plnr);
+     X(rdft_conf_standard)(plnr);
+     X(dft_conf_standard)(plnr);
+#else /* default build: call the dft, rdft, reodft standard configs in that order */
      X(dft_conf_standard)(plnr);
      X(rdft_conf_standard)(plnr);
      X(reodft_conf_standard)(plnr);
+#endif /* AMD_FAST_PLANNER */
 }
diff --git a/api/fftw3.h b/api/fftw3.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * The following statement of license applies *only* to this header file,
  * and *not* to the other files distributed with FFTW or derived therefrom:
@@ -448,6 +449,7 @@ FFTW_EXTERN int                                                         \
 FFTW_CDECL X(alignment_of)(R *p);                                       \
                                                                         \
 FFTW_EXTERN const char X(version)[];                                    \
+FFTW_EXTERN const char X(aoclversion)[];                                \
 FFTW_EXTERN const char X(cc)[];                                         \
 FFTW_EXTERN const char X(codelet_optim)[];
 
@@ -460,7 +462,7 @@ FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
 
 /* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
    for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
-#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
+#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || (__clang__ && __clang_major__ >= 10)) \
  && !(defined(__ICC) || defined(__INTEL_COMPILER) || defined(__CUDACC__) || defined(__PGI)) \
  && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
 #  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)

diff --git a/api/version.c b/api/version.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -28,6 +29,8 @@ const char X(cc)[] = FFTW_CC;
    of the ABI */
 const char X(codelet_optim)[] = "";
 
+const char X(aoclversion)[] = AOCL_FFTW_VERSION;
+
 const char X(version)[] = PACKAGE "-" PACKAGE_VERSION
 
 #if HAVE_FMA

diff --git a/config.h.in b/config.h.in
@@ -3,12 +3,18 @@
 /* Define to enable AMD cpu specific optimizations. */
 #undef AMD_OPT_ALL
 
+/* Define to enable AMD Fast Planner for AMD cpus. */
+#undef AMD_OPT_FAST_PLANNER
+
 /* Define to enable AMD cpu optimized MPI FFT. */
 #undef AMD_OPT_MPIFFT
 
 /* Define to enable AMD cpu optimized Transpose. */
 #undef AMD_OPT_TRANS
 
+/* AOCL Version of AMD-FFTW */
+#undef AOCL_FFTW_VERSION
+
 /* Define if the machine architecture "naturally" prefers fused multiply-add
    instructions */
 #undef ARCH_PREFERS_FMA

diff --git a/configure b/configure
@@ -877,6 +877,7 @@ enable_mpi
 enable_amd_opt
 enable_amd_trans
 enable_amd_mpifft
+enable_amd_fast_planner
 enable_fortran
 with_g77_wrappers
 enable_openmp
@@ -1586,6 +1587,9 @@ Optional Features:
   --enable-amd-opt        enable AMD cpu specific optimizations
   --enable-amd-trans      enable AMD cpu optimized Transpose
   --enable-amd-mpifft     enable AMD cpu optimized MPI FFT
+  --enable-amd-fast-planner
+                          enable AMD Fast Planner for a faster planning time
+                          on AMD cpus
   --disable-fortran       don't include Fortran-callable wrappers
   --enable-openmp         use OpenMP directives for parallelism
   --enable-threads        compile FFTW SMP threads library
@@ -3676,7 +3680,9 @@ else
   ok=no
 fi
 
+long_double_supported=no;
 if test "$ok" = "yes"; then
+	long_double_supported=yes;
 	if test "$PRECISION" = "s"; then
 		as_fn_error $? "--enable-single/--enable-long-double conflict" "$LINENO" 5
 	fi
@@ -3704,7 +3710,9 @@ else
   ok=no
 fi
 
+quad_precision_supported=no;
 if test "$ok" = "yes"; then
+	quad_precision_supported=yes;
 	if test "$PRECISION" != "d"; then
 		as_fn_error $? "conflicting precisions specified" "$LINENO" 5
 	fi
@@ -17039,11 +17047,40 @@ fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_4_6_0" >&5
 $as_echo "$ax_cv_gcc_4_6_0" >&6; }
 if test "$ax_cv_gcc_4_6_0" = yes; then
-	:
+	gcc_supported=1
 else
-	as_fn_error $? "gcc 4.6 or later required for quad precision support" "$LINENO" 5
+	gcc_supported=0
 fi
 
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using clang 10 or later" >&5
+$as_echo_n "checking whether we are using clang 10 or later... " >&6; }
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+		      #ifdef __clang__
+		      #if __clang_major__ < 10
+		      #error Clang 10 or later is recommended for Quad precision
+		      #endif
+		      #else
+		      #error Other compiler unsupported
+		      #endif
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+		       clang_supported=1
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+		       clang_supported=0
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   if test "$gcc_supported" = "1" || test "$clang_supported" = "1"; then
+	   :
+   else
+	as_fn_error $? "gcc 4.6 or later OR clang 10 or later required for quad precision support" "$LINENO" 5
+   fi
    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sinq in -lquadmath" >&5
 $as_echo_n "checking for sinq in -lquadmath... " >&6; }
 if ${ac_cv_lib_quadmath_sinq+:} false; then :
@@ -17448,17 +17485,74 @@ else
   have_amd_opt=no
 fi
 
-if test "$have_amd_opt" = yes; then
+if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then
+	AMDZENFAMILY=$(expr `cat /proc/cpuinfo | grep -m1 family|cut -f2 -d:`)
+	AMDZENMODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:`)
 	if test "$ac_test_CFLAGS" != "set"; then
-		if test "$CC" = "clang"; then
+		if [ "$AMD_ARCH" = "auto" ]; then
+			AMD_ARCH=""
+		fi
+		SUBSTRCLANG='clang'
+		SUBSTRGCC='gcc'
+		if grep -q "$SUBSTRCLANG" <<<"$CC"; then
 			CFLAGS="$CFLAGS -mavx2 -mfma"
-		else
-			GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9)
-			AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48)
-			if test "$GCCVERSION" = "1"  && test "$AMDZEN2MODEL" = "1"; then
-				CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+		elif grep -q "$SUBSTRGCC" <<<"$CC"; then
+			GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`)
+			case "$AMDZENFAMILY" in
+				"23")
+				if [ -z "${AMD_ARCH}" ]; then
+				if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -ge "48" ]; then
+					CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				else
+					CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				fi
+				else
+					CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				fi
+				;;
+				"25")
+				if [ -z "${AMD_ARCH}" ]; then
+				if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -le "15" ]; then
+					CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				elif [ "$GCCVERSION" -lt "9" ] && [ "$AMDZENMODEL" -le "15" ]; then
+					CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				else
+					CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				fi
+				else
+					CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				fi
+				;;
+				*)
+				if [ -z "${AMD_ARCH}" ]; then
+				CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				else
+				CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				fi
+				;;
+			esac
+		fi
+	else
+		if [ -n "${AMD_ARCH}" ]; then
+			if [ "$AMD_ARCH" = "auto" ]; then
+			case "$AMDZENFAMILY" in
+				"23")
+				if [ "$AMDZENMODEL" -ge "48" ]; then
+					CFLAGS="$CFLAGS -march=znver2"
+				else
+					CFLAGS="$CFLAGS -march=znver1"
+				fi
+				;;
+				"25")
+				if [ "$AMDZENMODEL" -le "15" ]; then
+					CFLAGS="$CFLAGS -march=znver2"
+				else
+					CFLAGS="$CFLAGS -mavx2"
+				fi
+				;;
+			esac
 			else
-				CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma"
+				CFLAGS="$CFLAGS -march=$AMD_ARCH"
 			fi
 		fi
 	fi
@@ -17491,6 +17585,21 @@ $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h
 
 fi
 
+$as_echo "#define AOCL_FFTW_VERSION \"AOCL-3.0\"" >>confdefs.h
+
+# Check whether --enable-amd-fast-planner was given.
+if test "${enable_amd_fast_planner+set}" = set; then :
+  enableval=$enable_amd_fast_planner; have_amd_fast_planner=$enableval
+else
+  have_amd_fast_planner=no
+fi
+
+if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+
+$as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h
+
+fi
+
 if test "$USE_MAINTAINER_MODE" = yes; then
         # Extract the first word of "indent", so it can be a program name with args.
 set dummy indent; ac_word=$2