diff --git a/README_AMD.md b/README_AMD.md index 0362dcab..c5b52b05 100644 --- a/README_AMD.md +++ b/README_AMD.md @@ -12,7 +12,9 @@ functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan), improved 256-bit kernels selection by Planner and an optional in-place transpose for large problem sizes. AMD Optimized FFTW improves the performance of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI -transpose function. +transpose function. As of AMD FFTW 3.0, a new fast planner is added as an +extension to the original planner that improves planning time of various +planning modes in general and PATIENT mode in particular. FFTW is a free collection of fast C routines for computing the Discrete Fourier Transform and various special cases thereof in one or more @@ -51,6 +53,12 @@ configure option "--enable-generic-simd128" or "--enable-generic-simd256". The optional configure option "--enable-amd-mpifft" enables the MPI FFT related optimizations. +The new fast planner can be enabled using the optional configure option +"--enable-amd-fast-planner". It is supported for single and double precision only. + +An optional configure variable "AMD_ARCH" is supported that can be set to a CPU +architecture value such as "auto", "znver1" or "znver2" for AMD EPYC processors. + An optional configure option "--enable-amd-trans" is provided that may benefit the performance of transpose operations in case of very large FFT problem sizes. This is by default not enabled and provided as an experimental optional switch. diff --git a/api/configure.c b/api/configure.c index 08b074cd..c7ad44e2 100644 --- a/api/configure.c +++ b/api/configure.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,7 +26,13 @@ void X(configure_planner)(planner *plnr) { +#ifdef AMD_FAST_PLANNER + X(reodft_conf_standard)(plnr); + X(rdft_conf_standard)(plnr); + X(dft_conf_standard)(plnr); +#else X(dft_conf_standard)(plnr); X(rdft_conf_standard)(plnr); X(reodft_conf_standard)(plnr); +#endif } diff --git a/api/fftw3.h b/api/fftw3.h index 7bd4c6e5..5c5ffa0d 100644 --- a/api/fftw3.h +++ b/api/fftw3.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. * * The following statement of license applies *only* to this header file, * and *not* to the other files distributed with FFTW or derived therefrom: @@ -448,6 +449,7 @@ FFTW_EXTERN int \ FFTW_CDECL X(alignment_of)(R *p); \ \ FFTW_EXTERN const char X(version)[]; \ +FFTW_EXTERN const char X(aoclversion)[]; \ FFTW_EXTERN const char X(cc)[]; \ FFTW_EXTERN const char X(codelet_optim)[]; @@ -460,7 +462,7 @@ FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex) /* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64 for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */ -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || (__clang__ && __clang_major__ >= 10)) \ && !(defined(__ICC) || defined(__INTEL_COMPILER) || defined(__CUDACC__) || defined(__PGI)) \ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__)) # if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) diff --git a/api/version.c b/api/version.c index 4f14de15..2dfe94a4 100644 --- a/api/version.c +++ b/api/version.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts 
Institute of Technology + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,6 +29,8 @@ const char X(cc)[] = FFTW_CC; of the ABI */ const char X(codelet_optim)[] = ""; +const char X(aoclversion)[] = AOCL_FFTW_VERSION; + const char X(version)[] = PACKAGE "-" PACKAGE_VERSION #if HAVE_FMA diff --git a/config.h.in b/config.h.in index 61791074..a7d84765 100644 --- a/config.h.in +++ b/config.h.in @@ -3,12 +3,18 @@ /* Define to enable AMD cpu specific optimizations. */ #undef AMD_OPT_ALL +/* Define to enable AMD Fast Planner for AMD cpus. */ +#undef AMD_OPT_FAST_PLANNER + /* Define to enable AMD cpu optimized MPI FFT. */ #undef AMD_OPT_MPIFFT /* Define to enable AMD cpu optimized Transpose. */ #undef AMD_OPT_TRANS +/* AOCL Version of AMD-FFTW */ +#undef AOCL_FFTW_VERSION + /* Define if the machine architecture "naturally" prefers fused multiply-add instructions */ #undef ARCH_PREFERS_FMA diff --git a/configure b/configure index 075d5745..26737ea2 100755 --- a/configure +++ b/configure @@ -877,6 +877,7 @@ enable_mpi enable_amd_opt enable_amd_trans enable_amd_mpifft +enable_amd_fast_planner enable_fortran with_g77_wrappers enable_openmp @@ -1586,6 +1587,9 @@ Optional Features: --enable-amd-opt enable AMD cpu specific optimizations --enable-amd-trans enable AMD cpu optimized Transpose --enable-amd-mpifft enable AMD cpu optimized MPI FFT + --enable-amd-fast-planner + enable AMD Fast Planner for a faster planning time + on AMD cpus --disable-fortran don't include Fortran-callable wrappers --enable-openmp use OpenMP directives for parallelism --enable-threads compile FFTW SMP threads library @@ -3676,7 +3680,9 @@ else ok=no fi +long_double_supported=no; if test "$ok" = "yes"; then + long_double_supported=yes; if test "$PRECISION" = "s"; then as_fn_error $? 
"--enable-single/--enable-long-double conflict" "$LINENO" 5 fi @@ -3704,7 +3710,9 @@ else ok=no fi +quad_precision_supported=no; if test "$ok" = "yes"; then + quad_precision_supported=yes; if test "$PRECISION" != "d"; then as_fn_error $? "conflicting precisions specified" "$LINENO" 5 fi @@ -17039,11 +17047,40 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_4_6_0" >&5 $as_echo "$ax_cv_gcc_4_6_0" >&6; } if test "$ax_cv_gcc_4_6_0" = yes; then - : + gcc_supported=1 else - as_fn_error $? "gcc 4.6 or later required for quad precision support" "$LINENO" 5 + gcc_supported=0 fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using clang 10 or later" >&5 +$as_echo_n "checking whether we are using clang 10 or later... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + #ifdef __clang__ + #if __clang_major__ < 10 + #error Clang 10 or later is recommended for Quad precision + #endif + #else + #error Other compiler unsupported + #endif + +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + clang_supported=1 +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + clang_supported=0 +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if test "$gcc_supported" = "1" || test "$clang_supported" = "1"; then + : + else + as_fn_error $? "gcc 4.6 or later OR clang 10 or later required for quad precision support" "$LINENO" 5 + fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sinq in -lquadmath" >&5 $as_echo_n "checking for sinq in -lquadmath... 
" >&6; } if ${ac_cv_lib_quadmath_sinq+:} false; then : @@ -17448,17 +17485,74 @@ else have_amd_opt=no fi -if test "$have_amd_opt" = yes; then +if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then + AMDZENFAMILY=$(expr `cat /proc/cpuinfo | grep -m1 family|cut -f2 -d:`) + AMDZENMODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:`) if test "$ac_test_CFLAGS" != "set"; then - if test "$CC" = "clang"; then + if [ "$AMD_ARCH" = "auto" ]; then + AMD_ARCH="" + fi + SUBSTRCLANG='clang' + SUBSTRGCC='gcc' + if grep -q "$SUBSTRCLANG" <<<"$CC"; then CFLAGS="$CFLAGS -mavx2 -mfma" - else - GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) - AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) - if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif grep -q "$SUBSTRGCC" <<<"$CC"; then + GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`) + case "$AMDZENFAMILY" in + "23") + if [ -z "${AMD_ARCH}" ]; then + if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -ge "48" ]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + "25") + if [ -z "${AMD_ARCH}" ]; then + if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -le "15" ]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif [ "$GCCVERSION" -lt "9" ] && [ "$AMDZENMODEL" -le "15" ]; then + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store 
-mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + *) + if [ -z "${AMD_ARCH}" ]; then + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + esac + fi + else + if [ -n "${AMD_ARCH}" ]; then + if [ "$AMD_ARCH" = "auto" ]; then + case "$AMDZENFAMILY" in + "23") + if [ "$AMDZENMODEL" -ge "48" ]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -march=znver1" + fi + ;; + "25") + if [ "$AMDZENMODEL" -le "15" ]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -mavx2" + fi + ;; + esac else - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + CFLAGS="$CFLAGS -march=$AMD_ARCH" fi fi fi @@ -17491,6 +17585,21 @@ $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h fi +$as_echo "#define AOCL_FFTW_VERSION \"AOCL-3.0\"" >>confdefs.h + +# Check whether --enable-amd-fast-planner was given. +if test "${enable_amd_fast_planner+set}" = set; then : + enableval=$enable_amd_fast_planner; have_amd_fast_planner=$enableval +else + have_amd_fast_planner=no +fi + +if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then + +$as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h + +fi + if test "$USE_MAINTAINER_MODE" = yes; then # Extract the first word of "indent", so it can be a program name with args. 
set dummy indent; ac_word=$2 diff --git a/configure.ac b/configure.ac index 70d8a707..5f636da6 100644 --- a/configure.ac +++ b/configure.ac @@ -74,7 +74,9 @@ fi AM_CONDITIONAL(SINGLE, test "$ok" = "yes") AC_ARG_ENABLE(long-double, [AC_HELP_STRING([--enable-long-double],[compile fftw in long-double precision])], ok=$enableval, ok=no) +long_double_supported=no; if test "$ok" = "yes"; then + long_double_supported=yes; if test "$PRECISION" = "s"; then AC_MSG_ERROR([--enable-single/--enable-long-double conflict]) fi @@ -85,7 +87,9 @@ fi AM_CONDITIONAL(LDOUBLE, test "$ok" = "yes") AC_ARG_ENABLE(quad-precision, [AC_HELP_STRING([--enable-quad-precision],[compile fftw in quadruple precision if available])], ok=$enableval, ok=no) +quad_precision_supported=no; if test "$ok" = "yes"; then + quad_precision_supported=yes; if test "$PRECISION" != "d"; then AC_MSG_ERROR([conflicting precisions specified]) fi @@ -563,7 +567,26 @@ AC_FUNC_VPRINTF AC_CHECK_LIB(m, sin) if test $PRECISION = q; then - AX_GCC_VERSION(4,6,0,[],[AC_MSG_ERROR([gcc 4.6 or later required for quad precision support])]) + AX_GCC_VERSION(4,6,0,[gcc_supported=1],[gcc_supported=0]) + AC_MSG_CHECKING([whether we are using clang 10 or later]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ + #ifdef __clang__ + #if __clang_major__ < 10 + #error Clang 10 or later is recommended for Quad precision + #endif + #else + #error Other compiler unsupported + #endif + ]])], + [AC_MSG_RESULT([yes]) + clang_supported=1], + [AC_MSG_RESULT([no]) + clang_supported=0]) + if test "$gcc_supported" = "1" || test "$clang_supported" = "1"; then + : + else + AC_MSG_ERROR([gcc 4.6 or later OR clang 10 or later required for quad precision support]) + fi AC_CHECK_LIB(quadmath, sinq, [], [AC_MSG_ERROR([quad precision requires libquadmath for quad-precision trigonometric routines])]) LIBQUADMATH=-lquadmath fi @@ -616,17 +639,74 @@ fi dnl amd optimization switch and CFLAGS setting based on config arg option --enable-amd-opt AC_ARG_ENABLE(amd-opt, 
[AC_HELP_STRING([--enable-amd-opt],[enable AMD cpu specific optimizations])], have_amd_opt=$enableval, have_amd_opt=no) -if test "$have_amd_opt" = yes; then +if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then + AMDZENFAMILY=$(expr `cat /proc/cpuinfo | grep -m1 family|cut -f2 -d:`) + AMDZENMODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:`) if test "$ac_test_CFLAGS" != "set"; then - if test "$CC" = "clang"; then + if [[ "$AMD_ARCH" = "auto" ]]; then + AMD_ARCH="" + fi + SUBSTRCLANG='clang' + SUBSTRGCC='gcc' + if grep -q "$SUBSTRCLANG" <<<"$CC"; then CFLAGS="$CFLAGS -mavx2 -mfma" - else - GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) - AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) - if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif grep -q "$SUBSTRGCC" <<<"$CC"; then + GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`) + case "$AMDZENFAMILY" in + "23") + if [[ -z "${AMD_ARCH}" ]]; then + if [[ "$GCCVERSION" -ge "9" ]] && [[ "$AMDZENMODEL" -ge "48" ]]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + "25") + if [[ -z "${AMD_ARCH}" ]]; then + if [[ "$GCCVERSION" -ge "9" ]] && [[ "$AMDZENMODEL" -le "15" ]]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif [[ "$GCCVERSION" -lt "9" ]] && [[ "$AMDZENMODEL" -le "15" ]]; then + CFLAGS="$CFLAGS -march=znver1 -mavx2 
-mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + *) + if [[ -z "${AMD_ARCH}" ]]; then + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + esac + fi + else + if [[ -n "${AMD_ARCH}" ]]; then + if [[ "$AMD_ARCH" = "auto" ]]; then + case "$AMDZENFAMILY" in + "23") + if [[ "$AMDZENMODEL" -ge "48" ]]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -march=znver1" + fi + ;; + "25") + if [[ "$AMDZENMODEL" -le "15" ]]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -mavx2" + fi + ;; + esac else - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + CFLAGS="$CFLAGS -march=$AMD_ARCH" fi fi fi @@ -642,6 +722,13 @@ AC_ARG_ENABLE(amd-mpifft, [AC_HELP_STRING([--enable-amd-mpifft],[enable AMD cpu if test "$have_amd_mpifft" = yes; then AC_DEFINE(AMD_OPT_MPIFFT,1,[Define to enable AMD cpu optimized MPI FFT.]) fi +dnl aocl version number of amd-fftw +AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-3.0",[AOCL Version of AMD-FFTW]) +dnl amd optimization switch to enable AMD Fast Planner for AMD cpus --enable-amd-fast-planner +AC_ARG_ENABLE(amd-fast-planner, [AC_HELP_STRING([--enable-amd-fast-planner],[enable AMD Fast Planner for a faster planning time on AMD cpus])], have_amd_fast_planner=$enableval, have_amd_fast_planner=no) +if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then + 
AC_DEFINE(AMD_OPT_FAST_PLANNER,1,[Define to enable AMD Fast Planner for AMD cpus.]) +fi dnl check for a proper indent in maintainer mode if test "$USE_MAINTAINER_MODE" = yes; then diff --git a/dft/dftw-direct.c b/dft/dftw-direct.c index 952cf1ee..53178938 100644 --- a/dft/dftw-direct.c +++ b/dft/dftw-direct.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -244,7 +245,11 @@ static int applicable(const S *ego, v, m * r, r)) return 0; +#ifdef AMD_FAST_PLANNER + if ((m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr)) || v > 4096) +#else if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr)) +#endif return 0; return 1; diff --git a/dft/problem.c b/dft/problem.c index 4e9640aa..56610089 100644 --- a/dft/problem.c +++ b/dft/problem.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,6 +21,9 @@ #include "dft/dft.h" +#ifdef AMD_FAST_PLANNER +#include "simd-support/simd-common.h" +#endif #include static void destroy(problem *ego_) @@ -40,8 +44,63 @@ static void hash(const problem *p_, md5 *m) X(md5int)(m, X(ialignment_of)(p->ii)); X(md5int)(m, X(ialignment_of)(p->ro)); X(md5int)(m, X(ialignment_of)(p->io)); + +#ifdef AMD_FAST_PLANNER + X(md5int)(m, p->sz->rnk); + if (FINITE_RNK(p->sz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + for (int i = 0; i < p->sz->rnk; ++i) { + X(md5INT)(m, p->sz->dims[i].n); + x1 = (p->sz->dims[i].is == p->sz->dims[i].os); + x2 = (p->sz->dims[i].is == 2); + x3 = (p->sz->dims[i].os == 2); + x4 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENTA); +#ifdef AMD_FAST_PLANNING_HASH_V1 + if (i == 0) + { + if (FINITE_RNK(p->vecsz->rnk) && p->vecsz->rnk > 0) /* only index vecsz->dims when its rank is a finite, positive count, as the vecsz loop below does */ + x8 = (p->sz->dims[i].is <= p->vecsz->dims[i].is); + else + x8 = (p->sz->dims[i].is <= 0); + } + else + x8 = 0; + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; +#else //AMD_FAST_PLANNING_HASH_V2 + x9 = (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; +#endif + X(md5INT)(m, x9); + } + } + int max_ind = X(tensor_max_index)(p->sz); + X(md5int)(m, p->vecsz->rnk); + if (FINITE_RNK(p->vecsz->rnk)) { + int x1=0, x2=0, x3=0, x4, x5, x6, x7, x8, x9, x10=0; + for (int i = 0; i < p->vecsz->rnk; ++i) { + X(md5INT)(m, p->vecsz->dims[i].n); + x1 = (p->vecsz->dims[i].is == p->vecsz->dims[i].os); + x2 = (p->vecsz->dims[i].is == 2); + x3 = (p->vecsz->dims[i].os == 2); + if (x1) + { + x10 = (X(iabs)(p->vecsz->dims[i].is) < max_ind); + } + x4 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 =
!((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENTA); + x9 = (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + X(md5int)(m, x10); + } +#else X(tensor_md5)(m, p->sz); X(tensor_md5)(m, p->vecsz); +#endif } static void print(const problem *ego_, printer *p) diff --git a/kernel/ifftw.h b/kernel/ifftw.h index dd7c3159..7b8a2977 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -87,7 +87,8 @@ extern "C" //-------------------------------- //disables 128-bit AVX2 versions of kernels and prefers only 256-bit AVX2 kernels support -#define AMD_OPT_PREFER_256BIT_FPU +//This optimization switch is disabled by default. If it is enabled, WISDOM feature is not to be used. +//#define AMD_OPT_PREFER_256BIT_FPU #define AMD_OPT_128BIT_KERNELS_THRESHOLD 1024//Below this SIZE, 128-bit AVX2 kernels allowed //-------------------------------- //CPY2d related optimizations :- enable Either (i)C switch Or (ii)INTRIN switch @@ -127,6 +128,19 @@ extern "C" //#define AMD_MPI_TRANSPOSE_LOGS #endif //-------------------------------- +//NEW FAST PLANNER for AMD CPUs can be enabled with the below switch AMD_FAST_PLANNER. +//A new generalized Hash key based Planner is implemented that achieves high reuse of solvers among similar problems. +//A minor variation is available in the generalized Hash key method controlled by :- +//AMD_FAST_PLANNING_HASH_V1 and AMD_FAST_PLANNING_HASH_V2. +//AMD_FAST_PLANNING_HASH_V1 is by default turned on. +//The UNBLESSED HASH table is kept alive for the life of the process/thread, like the BLESSED HASH table. +//Since the UNBLESSED HASH table keeps growing, it is cleared once it exceeds AMD_HASH_UNBLESS_MAX_SIZE bytes; the BLESSED table is left intact.
+#ifdef AMD_OPT_FAST_PLANNER +#define AMD_FAST_PLANNER +#define AMD_FAST_PLANNING_HASH_V1 +//#define AMD_FAST_PLANNING_HASH_V2 +#define AMD_HASH_UNBLESS_MAX_SIZE 10485760 +#endif #endif//#ifdef AMD_OPT_ALL //============================================================ diff --git a/kernel/planner.c b/kernel/planner.c index 9c712905..2d15b2ba 100644 --- a/kernel/planner.c +++ b/kernel/planner.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2000 Matteo Frigo * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,6 +20,9 @@ */ #include "kernel/ifftw.h" +#ifdef AMD_FAST_PLANNER +#include "dft/dft.h" +#endif #include /* GNU Coding Standards, Sec. 5.2: "Please write the comments in a GNU @@ -619,12 +623,11 @@ static plan *search(planner *ego, const problem *p, unsigned *slvndx, (ego->wisdom_state = ego->bogosity_hook(ego->wisdom_state, p)) \ : ego->wisdom_state) == WISDOM_IS_BOGUS) \ goto wisdom_is_bogus; - static plan *mkplan(planner *ego, const problem *p) { plan *pln; md5 m; - unsigned slvndx; + unsigned slvndx=0; flags_t flags_of_solution; solution *sol; solver *s; @@ -663,7 +666,6 @@ static plan *mkplan(planner *ego, const problem *p) goto do_search; /* ignore not-ok wisdom */ slvndx = SLVNDX(sol); - if (slvndx == INFEASIBLE_SLVNDX) { if (ego->wisdom_state == WISDOM_IGNORE_INFEASIBLE) goto do_search; @@ -676,8 +678,13 @@ static plan *mkplan(planner *ego, const problem *p) /* inherit blessing either from wisdom or from the planner */ flags_of_solution.hash_info |= BLISS(ego->flags); - + +#ifdef AMD_FAST_PLANNER + if (ego->wisdom_state != WISDOM_ONLY) + ego->wisdom_state = WISDOM_NORMAL; +#else ego->wisdom_state = WISDOM_ONLY; +#endif s = ego->slvdescs[slvndx].slv; if (p->adt->problem_kind != s->adt->problem_kind) @@ -692,7 +699,16 @@ static plan
*mkplan(planner *ego, const problem *p) reuse it. */ if (!pln) + { +#ifdef AMD_FAST_PLANNER + if (ego->wisdom_state == WISDOM_ONLY) goto wisdom_is_bogus; + else + goto do_search; +#else + goto wisdom_is_bogus; +#endif + } ego->wisdom_state = owisdom_state; @@ -709,8 +725,8 @@ static plan *mkplan(planner *ego, const problem *p) flags_of_solution = ego->flags; pln = search(ego, p, &slvndx, &flags_of_solution); - CHECK_FOR_BOGOSITY; /* catch error in child solvers */ + CHECK_FOR_BOGOSITY; /* catch error in child solvers */ if (ego->timed_out) { A(!pln); if (PLNR_TIMELIMIT_IMPATIENCE(ego) != 0) { @@ -737,7 +753,6 @@ static plan *mkplan(planner *ego, const problem *p) hinsert(ego, m.s, &flags_of_solution, INFEASIBLE_SLVNDX); } } - return pln; wisdom_is_bogus: @@ -766,6 +781,36 @@ static void mkhashtab(hashtab *ht) /* destroy hash table entries. If FORGET_EVERYTHING, destroy the whole table. If FORGET_ACCURSED, then destroy entries that are not blessed. */ +#ifdef AMD_FAST_PLANNER +static void forget(planner *ego, amnesia a) +{ + switch (a) { + case FORGET_ACCURSED: + //Do not delete the unblessed hash table after setup done. + //But maintain its state in order to reuse the solvers next time.
+ //Once the unblessed table grows beyond AMD_HASH_UNBLESS_MAX_SIZE bytes, discard it + //and start again from an empty table. The blessed table is left untouched so that + //the solutions blessed so far are not thrown away together with the unblessed ones. + //NOTE(review): this assumes lookups also consult the blessed table and that wisdom export reads it - confirm against the rest of planner.c. + if ((sizeof(struct solution_s)*ego->htab_unblessed.hashsiz) > AMD_HASH_UNBLESS_MAX_SIZE) + { + htab_destroy(&ego->htab_unblessed); + mkhashtab(&ego->htab_unblessed); + } + break; + case FORGET_EVERYTHING: + //When wisdom is set bogus; delete both blessed and unblessed hash table + htab_destroy(&ego->htab_blessed); + mkhashtab(&ego->htab_blessed); + htab_destroy(&ego->htab_unblessed); + mkhashtab(&ego->htab_unblessed); + break; + default: + break; + } +} +#else static void forget(planner *ego, amnesia a) { switch (a) { @@ -781,6 +826,7 @@ static void forget(planner *ego, amnesia a) break; } } +#endif /* FIXME: what sort of version information should we write? */ #define WISDOM_PREAMBLE PACKAGE "-" VERSION " " STRINGIZE(X(wisdom)) diff --git a/libbench2/bench-user.h b/libbench2/bench-user.h index 89b2337d..951de1df 100644 --- a/libbench2/bench-user.h +++ b/libbench2/bench-user.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2001 Matteo Frigo * Copyright (c) 2001 Massachusetts Institute of Technology - * Copyright (C) 2019, Advanced Micro Devices, Inc.
All Rights Reserved. + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -58,7 +58,11 @@ typedef bench_real bench_complex[2]; #undef SINGLE_PRECISION #define SINGLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(float)) #undef LDOUBLE_PRECISION +#ifdef BENCHFFT_LDOUBLE #define LDOUBLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(long double)) +#else +#define LDOUBLE_PRECISION 0 +#endif #undef QUAD_PRECISION #ifdef BENCHFFT_QUAD diff --git a/mpi/mpi-bench.c b/mpi/mpi-bench.c index d4dd931c..af4b4fcc 100644 --- a/mpi/mpi-bench.c +++ b/mpi/mpi-bench.c @@ -1,4 +1,8 @@ /**************************************************************************/ +/* + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. + */ + /* NOTE to users: this is the FFTW-MPI self-test and benchmark program. 
It is probably NOT a good place to learn FFTW usage, since it has a lot of added complexity in order to exercise and test the full API, @@ -33,6 +37,7 @@ #endif static const char *mkversion(void) { return FFTW(version); } +static const char *mkaoclversion(void) { return FFTW(aoclversion); } static const char *mkcc(void) { return FFTW(cc); } static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); } static const char *mknproc(void) { @@ -50,6 +55,7 @@ static const char *mknproc(void) { BEGIN_BENCH_DOC BENCH_DOC("name", "fftw3_mpi") BENCH_DOCF("version", mkversion) +BENCH_DOCF("aocl-version", mkaoclversion) BENCH_DOCF("cc", mkcc) BENCH_DOCF("codelet-optim", mkcodelet_optim) BENCH_DOCF("nproc", mknproc) diff --git a/rdft/problem.c b/rdft/problem.c index a10db034..1de6b747 100644 --- a/rdft/problem.c +++ b/rdft/problem.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,8 +21,22 @@ #include "rdft/rdft.h" +#ifdef AMD_FAST_PLANNER +#include "simd-support/simd-common.h" +#endif #include +#ifdef AMD_FAST_PLANNER +#define MAXRRNK 32 /* FIXME: should malloc() */ +typedef struct { + plan_rdft super; + INT vl; /* vector length picked out by fill_iodim(); 1 if no unit-stride dimension was found */ + int rnk; /* number of entries of d[] in use */ + iodim d[MAXRRNK]; /* the vecsz dimensions not folded into vl */ + const char *nam; +} P; /* scratch record: filled by fill_iodim() and read back by hash() when building the fast-planner key */ +#endif + static void destroy(problem *ego_) { problem_rdft *ego = (problem_rdft *) ego_; @@ -39,6 +54,29 @@ static void kind_hash(md5 *m, const rdft_kind *kind, int rnk) X(md5int)(m, kind[i]); } +#ifdef AMD_FAST_PLANNER +static int fill_iodim(P *pln, const problem_rdft *p) /* splits p->vecsz into pln->vl plus pln->d[0..pln->rnk); returns 1 on success, 0 if more than MAXRRNK dimensions would have to be stored */ +{ + int i; + const tensor *vecsz = p->vecsz; + + pln->vl = 1; + pln->rnk = 0; + for (i = 0; i < vecsz->rnk; ++i) { + /* extract contiguous dimensions */ + if (pln->vl == 1 && + vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) /* a dimension with unit input and output stride, taken only while vl is still 1 */ + pln->vl = vecsz->dims[i].n; + else if (pln->rnk == MAXRRNK) /* d[] is full: give up */ + return 0; + else + pln->d[pln->rnk++] = vecsz->dims[i]; + } + + return 1; +} +#endif + static void hash(const problem *p_, md5 *m) { const problem_rdft *p = (const problem_rdft *) p_; @@ -47,8 +85,80 @@ static void hash(const problem *p_, md5 *m) kind_hash(m, p->kind, p->sz->rnk); X(md5int)(m, X(ialignment_of)(p->I)); X(md5int)(m, X(ialignment_of)(p->O)); +#ifdef AMD_FAST_PLANNER + X(md5int)(m, p->sz->rnk); + if (FINITE_RNK(p->sz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9, x10; + if (p->sz->rnk > 0) + { + x8 = 1; + x10 = X(is_prime)(p->sz->dims[0].n); + } + else + { + x8 = 0; + x10 = 0; + } + for (int i = 0; i < p->sz->rnk; ++i) { + X(md5INT)(m, p->sz->dims[i].n); + x1 = (p->sz->dims[i].is == p->sz->dims[i].os); + x2 = (p->sz->dims[i].is == 2); + x3 = (p->sz->dims[i].os == 2); + x4 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->sz->dims[i].os
* sizeof(R)) % ALIGNMENTA); + x8 = x8 & ((p->sz->dims[i].is <= 2) && (p->sz->dims[i].os > 2)); + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + x10 = (x10<<1) | x8; + X(md5int)(m, x10); + } + X(md5int)(m, p->vecsz->rnk); + if (FINITE_RNK(p->vecsz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + P pln; + for (int i = 0; i < p->vecsz->rnk; ++i) { + X(md5INT)(m, p->vecsz->dims[i].n); + x1 = (p->vecsz->dims[i].is == p->vecsz->dims[i].os); + x2 = (p->vecsz->dims[i].is == 2); + x3 = (p->vecsz->dims[i].os == 2); + x4 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENTA); + x9 = (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + fill_iodim(&pln, p); + x1 = (pln.vl > 2); + if (pln.rnk >= 2) + { + int rnk = pln.rnk; + x2 = (pln.d[rnk-2].n == pln.d[rnk-1].n && + pln.d[rnk-2].is == pln.d[rnk-1].os && + pln.d[rnk-2].os == pln.d[rnk-1].is); + x3 = (X(iabs)(pln.d[rnk-2].is) <= X(iabs)(pln.d[rnk-1].is) || + X(iabs)(pln.d[rnk-2].os) <= X(iabs)(pln.d[rnk-1].os)); + x4 = (X(compute_tilesz)(pln.vl, 1) > 4); + x5 = (X(compute_tilesz)(pln.vl, 2) > 4); + + } + else + { + x2 = 0; + x3 = 0; + x4 = 0; + x5 = 0; + } + x9 = (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5int)(m, x9); + } +#else X(tensor_md5)(m, p->sz); X(tensor_md5)(m, p->vecsz); +#endif } static void recur(const iodim *dims, int rnk, R *I) diff --git a/rdft/problem2.c b/rdft/problem2.c index a1445258..e0074fe6 100644 --- a/rdft/problem2.c +++ b/rdft/problem2.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,6 +21,9 @@ #include "dft/dft.h" +#ifdef AMD_FAST_PLANNER +#include "simd-support/simd-common.h" +#endif #include "rdft/rdft.h" #include @@ -42,8 +46,51 @@ static void hash(const problem *p_, md5 *m) X(md5int)(m, X(ialignment_of)(p->cr)); X(md5int)(m, X(ialignment_of)(p->ci)); X(md5int)(m, p->kind); +#ifdef AMD_FAST_PLANNER + X(md5int)(m, p->sz->rnk); + if (FINITE_RNK(p->sz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + for (int i = 0; i < p->sz->rnk; ++i) { + X(md5INT)(m, p->sz->dims[i].n); + x1 = (p->sz->dims[i].is == p->sz->dims[i].os); + x2 = (p->sz->dims[i].is == 2); + x3 = (p->sz->dims[i].os == 2); + x4 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENTA); + if (p->vecsz->rnk > i) + x8 = (p->sz->dims[i].is <= p->vecsz->dims[i].is); + else + x8 = 0; + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + } + X(md5int)(m, p->vecsz->rnk); + if (FINITE_RNK(p->vecsz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + for (int i = 0; i < p->vecsz->rnk; ++i) { + X(md5INT)(m, p->vecsz->dims[i].n); + x1 = (p->vecsz->dims[i].is == p->vecsz->dims[i].os); + x2 = (p->vecsz->dims[i].is == 2); + x3 = (p->vecsz->dims[i].os == 2); + x4 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENTA); + if (p->sz->rnk > i) + x8 = (p->vecsz->dims[i].n <= p->sz->dims[i].n); + else + x8 = 0; + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + } +#else X(tensor_md5)(m, p->sz); X(tensor_md5)(m, 
p->vecsz); +#endif } static void print(const problem *ego_, printer *p) diff --git a/tests/bench.c b/tests/bench.c index 9fd5dd35..9ce7cb30 100644 --- a/tests/bench.c +++ b/tests/bench.c @@ -1,4 +1,8 @@ /**************************************************************************/ +/* + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. + */ + /* NOTE to users: this is the FFTW self-test and benchmark program. It is probably NOT a good place to learn FFTW usage, since it has a lot of added complexity in order to exercise and test the full API, @@ -14,12 +18,14 @@ #include "tests/fftw-bench.h" static const char *mkversion(void) { return FFTW(version); } +static const char *mkaoclversion(void) { return FFTW(aoclversion); } static const char *mkcc(void) { return FFTW(cc); } static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); } BEGIN_BENCH_DOC BENCH_DOC("name", "fftw3") BENCH_DOCF("version", mkversion) +BENCH_DOCF("aocl-version", mkaoclversion) BENCH_DOCF("cc", mkcc) BENCH_DOCF("codelet-optim", mkcodelet_optim) END_BENCH_DOC