From 95123de14bd0de99fdc18f3bdbdbeb6f3d0cedd7 Mon Sep 17 00:00:00 2001 From: sraut Date: Tue, 14 Jul 2020 15:26:09 +0530 Subject: [PATCH 1/4] This change adds support for AOCL versioning of the library. 1) AOCL version number is set in configure.ac file. New API fftw_aoclversion() is added to the source files fftw3.h and version.c under api/ directory that returns the AOCL version number of the library. 2) Bench programs under tests/ and mpi/ directories are modified to call fftw_aoclversion() and print the current aocl-version. 3) Current AOCL version number is 2.2.1 Change-Id: I1db98c40950e53a44da93f8ea9399c7c437fc5b7 --- api/fftw3.h | 2 ++ api/version.c | 3 +++ config.h.in | 3 +++ configure | 3 +++ configure.ac | 2 ++ mpi/mpi-bench.c | 6 ++++++ tests/bench.c | 6 ++++++ 7 files changed, 25 insertions(+) diff --git a/api/fftw3.h b/api/fftw3.h index 7bd4c6e5..72db8d88 100644 --- a/api/fftw3.h +++ b/api/fftw3.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. * * The following statement of license applies *only* to this header file, * and *not* to the other files distributed with FFTW or derived therefrom: @@ -448,6 +449,7 @@ FFTW_EXTERN int \ FFTW_CDECL X(alignment_of)(R *p); \ \ FFTW_EXTERN const char X(version)[]; \ +FFTW_EXTERN const char X(aoclversion)[]; \ FFTW_EXTERN const char X(cc)[]; \ FFTW_EXTERN const char X(codelet_optim)[]; diff --git a/api/version.c b/api/version.c index 4f14de15..2dfe94a4 100644 --- a/api/version.c +++ b/api/version.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,6 +29,8 @@ const char X(cc)[] = FFTW_CC; of the ABI */ const char X(codelet_optim)[] = ""; +const char X(aoclversion)[] = AOCL_FFTW_VERSION; + const char X(version)[] = PACKAGE "-" PACKAGE_VERSION #if HAVE_FMA diff --git a/config.h.in b/config.h.in index 61791074..195db4fa 100644 --- a/config.h.in +++ b/config.h.in @@ -9,6 +9,9 @@ /* Define to enable AMD cpu optimized Transpose. */ #undef AMD_OPT_TRANS +/* AOCL Version of AMD-FFTW */ +#undef AOCL_FFTW_VERSION + /* Define if the machine architecture "naturally" prefers fused multiply-add instructions */ #undef ARCH_PREFERS_FMA diff --git a/configure b/configure index 075d5745..b1407431 100755 --- a/configure +++ b/configure @@ -17491,6 +17491,9 @@ $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h fi +$as_echo "#define AOCL_FFTW_VERSION \"AOCL-2.2.1\"" >>confdefs.h + + if test "$USE_MAINTAINER_MODE" = yes; then # Extract the first word of "indent", so it can be a program name with args. set dummy indent; ac_word=$2 diff --git a/configure.ac b/configure.ac index 70d8a707..19f5a8be 100644 --- a/configure.ac +++ b/configure.ac @@ -642,6 +642,8 @@ AC_ARG_ENABLE(amd-mpifft, [AC_HELP_STRING([--enable-amd-mpifft],[enable AMD cpu if test "$have_amd_mpifft" = yes; then AC_DEFINE(AMD_OPT_MPIFFT,1,[Define to enable AMD cpu optimized MPI FFT.]) fi +dnl aocl version number of amd-fftw +AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-2.2.1",[AOCL Version of AMD-FFTW]) dnl check for a proper indent in maintainer mode if test "$USE_MAINTAINER_MODE" = yes; then diff --git a/mpi/mpi-bench.c b/mpi/mpi-bench.c index d4dd931c..af4b4fcc 100644 --- a/mpi/mpi-bench.c +++ b/mpi/mpi-bench.c @@ -1,4 +1,8 @@ /**************************************************************************/ +/* + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. 
+ */ + /* NOTE to users: this is the FFTW-MPI self-test and benchmark program. It is probably NOT a good place to learn FFTW usage, since it has a lot of added complexity in order to exercise and test the full API, @@ -33,6 +37,7 @@ #endif static const char *mkversion(void) { return FFTW(version); } +static const char *mkaoclversion(void) { return FFTW(aoclversion); } static const char *mkcc(void) { return FFTW(cc); } static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); } static const char *mknproc(void) { @@ -50,6 +55,7 @@ static const char *mknproc(void) { BEGIN_BENCH_DOC BENCH_DOC("name", "fftw3_mpi") BENCH_DOCF("version", mkversion) +BENCH_DOCF("aocl-version", mkaoclversion) BENCH_DOCF("cc", mkcc) BENCH_DOCF("codelet-optim", mkcodelet_optim) BENCH_DOCF("nproc", mknproc) diff --git a/tests/bench.c b/tests/bench.c index 9fd5dd35..9ce7cb30 100644 --- a/tests/bench.c +++ b/tests/bench.c @@ -1,4 +1,8 @@ /**************************************************************************/ +/* + * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved. + */ + /* NOTE to users: this is the FFTW self-test and benchmark program. 
It is probably NOT a good place to learn FFTW usage, since it has a lot of added complexity in order to exercise and test the full API, @@ -14,12 +18,14 @@ #include "tests/fftw-bench.h" static const char *mkversion(void) { return FFTW(version); } +static const char *mkaoclversion(void) { return FFTW(aoclversion); } static const char *mkcc(void) { return FFTW(cc); } static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); } BEGIN_BENCH_DOC BENCH_DOC("name", "fftw3") BENCH_DOCF("version", mkversion) +BENCH_DOCF("aocl-version", mkaoclversion) BENCH_DOCF("cc", mkcc) BENCH_DOCF("codelet-optim", mkcodelet_optim) END_BENCH_DOC From db43ba8566ceb6428992791d0b7b7cd7883a222b Mon Sep 17 00:00:00 2001 From: sraut Date: Fri, 2 Oct 2020 16:46:45 +0530 Subject: [PATCH 2/4] This change adds a new AMD optimized Fast Planner to FFTW. 1) The new fast planner improves the planning time for various planning modes in general and OPATIENT mode in particular. A very big planning time reduction is achieved with a small tradeoff in MFLOPS performance. A generalized Hash method is implemented to achieve a good reuse of plans and solvers across problems and sub-problems. The Unblessed hash table is kept alive until program termination with a check on its table size. The order of registering of solvers is also modified. 2) The configure user option --enable-amd-fast-planner enables the new fast planner. The macro AMD_FAST_PLANNER enables all the code implementation for this new planner. The maximum allowed size of Unblessed hash table is controlled by macro AMD_HASH_UNBLESS_MAX_SIZE. 3) Code changes are contained in files :- api/configure.c, dft/dftw-direct.c, dft/problem.c, kernel/ifftw.h, kernel/planner.c, rdft/problem.c and rdft/problem2.c. Files configure.ac, configure and config.h.in are modified to enable option --enable-amd-fast-planner. 4) The new planner supports openMP multi-threaded and MPI FFT planning as well. 
This code change relates to Jira task AMD-Internal: [CPUPL-1077] Change-Id: Ida934d77d8bebfe49f1107b0e23ddb17550d17bd --- api/configure.c | 7 +++ config.h.in | 3 ++ configure | 16 +++++++ configure.ac | 5 +++ dft/dftw-direct.c | 5 +++ dft/problem.c | 59 +++++++++++++++++++++++++ kernel/ifftw.h | 13 ++++++ kernel/planner.c | 70 ++++++++++++++++++++++++++--- rdft/problem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++ rdft/problem2.c | 47 ++++++++++++++++++++ 10 files changed, 329 insertions(+), 6 deletions(-) diff --git a/api/configure.c b/api/configure.c index 08b074cd..c7ad44e2 100644 --- a/api/configure.c +++ b/api/configure.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,7 +26,13 @@ void X(configure_planner)(planner *plnr) { +#ifdef AMD_FAST_PLANNER + X(reodft_conf_standard)(plnr); + X(rdft_conf_standard)(plnr); + X(dft_conf_standard)(plnr); +#else X(dft_conf_standard)(plnr); X(rdft_conf_standard)(plnr); X(reodft_conf_standard)(plnr); +#endif } diff --git a/config.h.in b/config.h.in index 195db4fa..a7d84765 100644 --- a/config.h.in +++ b/config.h.in @@ -3,6 +3,9 @@ /* Define to enable AMD cpu specific optimizations. */ #undef AMD_OPT_ALL +/* Define to enable AMD Fast Planner for AMD cpus. */ +#undef AMD_OPT_FAST_PLANNER + /* Define to enable AMD cpu optimized MPI FFT. 
*/ #undef AMD_OPT_MPIFFT diff --git a/configure b/configure index b1407431..e8427f53 100755 --- a/configure +++ b/configure @@ -877,6 +877,7 @@ enable_mpi enable_amd_opt enable_amd_trans enable_amd_mpifft +enable_amd_fast_planner enable_fortran with_g77_wrappers enable_openmp @@ -1586,6 +1587,9 @@ Optional Features: --enable-amd-opt enable AMD cpu specific optimizations --enable-amd-trans enable AMD cpu optimized Transpose --enable-amd-mpifft enable AMD cpu optimized MPI FFT + --enable-amd-fast-planner + enable AMD Fast Planner for a faster planning time + on AMD cpus --disable-fortran don't include Fortran-callable wrappers --enable-openmp use OpenMP directives for parallelism --enable-threads compile FFTW SMP threads library @@ -17493,6 +17497,18 @@ fi $as_echo "#define AOCL_FFTW_VERSION \"AOCL-2.2.1\"" >>confdefs.h +# Check whether --enable-amd-fast-planner was given. +if test "${enable_amd_fast_planner+set}" = set; then : + enableval=$enable_amd_fast_planner; have_amd_fast_planner=$enableval +else + have_amd_fast_planner=no +fi + +if test "$have_amd_fast_planner" = yes; then + +$as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h + +fi if test "$USE_MAINTAINER_MODE" = yes; then # Extract the first word of "indent", so it can be a program name with args. 
diff --git a/configure.ac b/configure.ac index 19f5a8be..e98083cb 100644 --- a/configure.ac +++ b/configure.ac @@ -644,6 +644,11 @@ if test "$have_amd_mpifft" = yes; then fi dnl aocl version number of amd-fftw AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-2.2.1",[AOCL Version of AMD-FFTW]) +dnl amd optimization switch to enable AMD Fast Planner for AMD cpus --enable-amd-fast-planner +AC_ARG_ENABLE(amd-fast-planner, [AC_HELP_STRING([--enable-amd-fast-planner],[enable AMD Fast Planner for a faster planning time on AMD cpus])], have_amd_fast_planner=$enableval, have_amd_fast_planner=no) +if test "$have_amd_fast_planner" = yes; then + AC_DEFINE(AMD_OPT_FAST_PLANNER,1,[Define to enable AMD Fast Planner for AMD cpus.]) +fi dnl check for a proper indent in maintainer mode if test "$USE_MAINTAINER_MODE" = yes; then diff --git a/dft/dftw-direct.c b/dft/dftw-direct.c index 952cf1ee..53178938 100644 --- a/dft/dftw-direct.c +++ b/dft/dftw-direct.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -244,7 +245,11 @@ static int applicable(const S *ego, v, m * r, r)) return 0; +#ifdef AMD_FAST_PLANNER + if ((m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr)) || v > 4096) +#else if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr)) +#endif return 0; return 1; diff --git a/dft/problem.c b/dft/problem.c index 4e9640aa..56610089 100644 --- a/dft/problem.c +++ b/dft/problem.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,6 +21,9 @@ #include "dft/dft.h" +#ifdef AMD_FAST_PLANNER +#include "simd-support/simd-common.h" +#endif #include static void destroy(problem *ego_) @@ -40,8 +44,63 @@ static void hash(const problem *p_, md5 *m) X(md5int)(m, X(ialignment_of)(p->ii)); X(md5int)(m, X(ialignment_of)(p->ro)); X(md5int)(m, X(ialignment_of)(p->io)); + +#ifdef AMD_FAST_PLANNER + X(md5int)(m, p->sz->rnk); + if (FINITE_RNK(p->sz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + for (int i = 0; i < p->sz->rnk; ++i) { + X(md5INT)(m, p->sz->dims[i].n); + x1 = (p->sz->dims[i].is == p->sz->dims[i].os); + x2 = (p->sz->dims[i].is == 2); + x3 = (p->sz->dims[i].os == 2); + x4 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENTA); +#ifdef AMD_FAST_PLANNING_HASH_V1 + if (i == 0) + { + if (p->vecsz->rnk > 0) + x8 = (p->sz->dims[i].is <= p->vecsz->dims[i].is); + else + x8 = (p->sz->dims[i].is <= 0); + } + else + x8 = 0; + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; +#else //AMD_FAST_PLANNING_HASH_V2 + x9 = (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; +#endif + X(md5INT)(m, x9); + } + } + int max_ind = X(tensor_max_index)(p->sz); + X(md5int)(m, p->vecsz->rnk); + if (FINITE_RNK(p->vecsz->rnk)) { + int x1=0, x2=0, x3=0, x4, x5, x6, x7, x8, x9, x10=0; + for (int i = 0; i < p->vecsz->rnk; ++i) { + X(md5INT)(m, p->vecsz->dims[i].n); + x1 = (p->vecsz->dims[i].is == p->vecsz->dims[i].os); + x2 = (p->vecsz->dims[i].is == 2); + x3 = (p->vecsz->dims[i].os == 2); + if (x1) + { + x10 = (X(iabs)(p->vecsz->dims[i].is) < max_ind); + } + x4 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = 
!((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENTA); + x9 = (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + X(md5int)(m, x10); + } +#else X(tensor_md5)(m, p->sz); X(tensor_md5)(m, p->vecsz); +#endif } static void print(const problem *ego_, printer *p) diff --git a/kernel/ifftw.h b/kernel/ifftw.h index dd7c3159..ed19571a 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -127,6 +127,19 @@ extern "C" //#define AMD_MPI_TRANSPOSE_LOGS #endif //-------------------------------- +//NEW FAST PLANNER for AMD CPUs can be enabled with the below switch AMD_FAST_PLANNER. +//A new generalized Hash key based Planner is implemented that achieves high reuse of solvers among similar problems. +//A minor variation is available in the generalized Hash key method controlled by :- +//AMD_FAST_PLANNING_HASH_V1 and AMD_FAST_PLANNING_HASH_V2. +//AMD_FAST_PLANNING_HASH_V1 is by default turned on. +//UNBLESSED HASH table is kept alive till the process/thread life like the BLESSED HASH table. +//Since UNBLESSED HASH table keeps growing, so it is cleared smartly beyond a MAX SIZE by swapping with BLESSED table. +#ifdef AMD_OPT_FAST_PLANNER +#define AMD_FAST_PLANNER +#define AMD_FAST_PLANNING_HASH_V1 +//#define AMD_FAST_PLANNING_HASH_V2 +#define AMD_HASH_UNBLESS_MAX_SIZE 10485760 +#endif #endif//#ifdef AMD_OPT_ALL //============================================================ diff --git a/kernel/planner.c b/kernel/planner.c index 9c712905..2d15b2ba 100644 --- a/kernel/planner.c +++ b/kernel/planner.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2000 Matteo Frigo * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,6 +20,9 @@ */ #include "kernel/ifftw.h" +#ifdef AMD_FAST_PLANNER +#include "dft/dft.h" +#endif #include /* GNU Coding Standards, Sec. 5.2: "Please write the comments in a GNU @@ -619,12 +623,11 @@ static plan *search(planner *ego, const problem *p, unsigned *slvndx, (ego->wisdom_state = ego->bogosity_hook(ego->wisdom_state, p)) \ : ego->wisdom_state) == WISDOM_IS_BOGUS) \ goto wisdom_is_bogus; - static plan *mkplan(planner *ego, const problem *p) { plan *pln; md5 m; - unsigned slvndx; + unsigned slvndx=0; flags_t flags_of_solution; solution *sol; solver *s; @@ -663,7 +666,6 @@ static plan *mkplan(planner *ego, const problem *p) goto do_search; /* ignore not-ok wisdom */ slvndx = SLVNDX(sol); - if (slvndx == INFEASIBLE_SLVNDX) { if (ego->wisdom_state == WISDOM_IGNORE_INFEASIBLE) goto do_search; @@ -676,8 +678,13 @@ static plan *mkplan(planner *ego, const problem *p) /* inherit blessing either from wisdom or from the planner */ flags_of_solution.hash_info |= BLISS(ego->flags); - + +#ifdef AMD_FAST_PLANNER + if (ego->wisdom_state != WISDOM_ONLY) + ego->wisdom_state = WISDOM_NORMAL; +#else ego->wisdom_state = WISDOM_ONLY; +#endif s = ego->slvdescs[slvndx].slv; if (p->adt->problem_kind != s->adt->problem_kind) @@ -692,7 +699,16 @@ static plan *mkplan(planner *ego, const problem *p) reuse it. 
*/ if (!pln) + { +#ifdef AMD_FAST_PLANNER + if (ego->wisdom_state == WISDOM_ONLY) goto wisdom_is_bogus; + else + goto do_search; +#else + goto wisdom_is_bogus; +#endif + } ego->wisdom_state = owisdom_state; @@ -709,8 +725,8 @@ static plan *mkplan(planner *ego, const problem *p) flags_of_solution = ego->flags; pln = search(ego, p, &slvndx, &flags_of_solution); - CHECK_FOR_BOGOSITY; /* catch error in child solvers */ + CHECK_FOR_BOGOSITY; /* catch error in child solvers */ if (ego->timed_out) { A(!pln); if (PLNR_TIMELIMIT_IMPATIENCE(ego) != 0) { @@ -737,7 +753,6 @@ static plan *mkplan(planner *ego, const problem *p) hinsert(ego, m.s, &flags_of_solution, INFEASIBLE_SLVNDX); } } - return pln; wisdom_is_bogus: @@ -766,6 +781,48 @@ static void mkhashtab(hashtab *ht) /* destroy hash table entries. If FORGET_EVERYTHING, destroy the whole table. If FORGET_ACCURSED, then destroy entries that are not blessed. */ +#ifdef AMD_FAST_PLANNER +static void forget(planner *ego, amnesia a) +{ + switch (a) { + case FORGET_ACCURSED: + //Do not delete the unblessed hash table after setup done. + //But maintain its state in order to reuse the solvers next time. 
+ //htab_destroy(&ego->htab_unblessed); + //mkhashtab(&ego->htab_unblessed); + + //Check the unblessed size and switch it with blessed when its size grows beyond the MAX size + //AMD_HASH_UNBLESS_MAX_SIZE + if ((sizeof(struct solution_s)*ego->htab_unblessed.hashsiz) > AMD_HASH_UNBLESS_MAX_SIZE) + { + solution *ht_unblessed_sols = ego->htab_unblessed.solutions; + ego->htab_unblessed.solutions = ego->htab_blessed.solutions; + ego->htab_unblessed.hashsiz = ego->htab_blessed.hashsiz; + ego->htab_unblessed.nelem = ego->htab_blessed.nelem; + ego->htab_unblessed.lookup = ego->htab_blessed.lookup; + ego->htab_unblessed.succ_lookup = ego->htab_blessed.succ_lookup; + ego->htab_unblessed.lookup_iter = ego->htab_blessed.lookup_iter; + ego->htab_unblessed.insert = ego->htab_blessed.insert; + ego->htab_unblessed.insert_iter = ego->htab_blessed.insert_iter; + ego->htab_unblessed.insert_unknown = ego->htab_blessed.insert_unknown; + ego->htab_unblessed.nrehash = ego->htab_blessed.nrehash; + X(ifree)(ht_unblessed_sols); + //Should blessed be cleared and deleted at this point?? + mkhashtab(&ego->htab_blessed); + } + break; + case FORGET_EVERYTHING: + //When wisdom is set bogus; delete both blessed and unblessed hash table + htab_destroy(&ego->htab_blessed); + mkhashtab(&ego->htab_blessed); + htab_destroy(&ego->htab_unblessed); + mkhashtab(&ego->htab_unblessed); + break; + default: + break; + } +} +#else static void forget(planner *ego, amnesia a) { switch (a) { @@ -781,6 +838,7 @@ static void forget(planner *ego, amnesia a) break; } } +#endif /* FIXME: what sort of version information should we write? */ #define WISDOM_PREAMBLE PACKAGE "-" VERSION " " STRINGIZE(X(wisdom)) diff --git a/rdft/problem.c b/rdft/problem.c index a10db034..1de6b747 100644 --- a/rdft/problem.c +++ b/rdft/problem.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. 
All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,8 +21,22 @@ #include "rdft/rdft.h" +#ifdef AMD_FAST_PLANNER +#include "simd-support/simd-common.h" +#endif #include +#ifdef AMD_FAST_PLANNER +#define MAXRRNK 32 /* FIXME: should malloc() */ +typedef struct { + plan_rdft super; + INT vl; + int rnk; + iodim d[MAXRRNK]; + const char *nam; +} P; +#endif + static void destroy(problem *ego_) { problem_rdft *ego = (problem_rdft *) ego_; @@ -39,6 +54,29 @@ static void kind_hash(md5 *m, const rdft_kind *kind, int rnk) X(md5int)(m, kind[i]); } +#ifdef AMD_FAST_PLANNER +static int fill_iodim(P *pln, const problem_rdft *p) +{ + int i; + const tensor *vecsz = p->vecsz; + + pln->vl = 1; + pln->rnk = 0; + for (i = 0; i < vecsz->rnk; ++i) { + /* extract contiguous dimensions */ + if (pln->vl == 1 && + vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) + pln->vl = vecsz->dims[i].n; + else if (pln->rnk == MAXRRNK) + return 0; + else + pln->d[pln->rnk++] = vecsz->dims[i]; + } + + return 1; +} +#endif + static void hash(const problem *p_, md5 *m) { const problem_rdft *p = (const problem_rdft *) p_; @@ -47,8 +85,80 @@ static void hash(const problem *p_, md5 *m) kind_hash(m, p->kind, p->sz->rnk); X(md5int)(m, X(ialignment_of)(p->I)); X(md5int)(m, X(ialignment_of)(p->O)); +#ifdef AMD_FAST_PLANNER + X(md5int)(m, p->sz->rnk); + if (FINITE_RNK(p->sz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9, x10; + if (p->sz->rnk > 0) + { + x8 = 1; + x10 = X(is_prime)(p->sz->dims[0].n); + } + else + { + x8 = 0; + x10 = 0; + } + for (int i = 0; i < p->sz->rnk; ++i) { + X(md5INT)(m, p->sz->dims[i].n); + x1 = (p->sz->dims[i].is == p->sz->dims[i].os); + x2 = (p->sz->dims[i].is == 2); + x3 = (p->sz->dims[i].os == 2); + x4 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = 
!((p->sz->dims[i].os * sizeof(R)) % ALIGNMENTA); + x8 = x8 & ((p->sz->dims[i].is <= 2) && (p->sz->dims[i].os > 2)); + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + x10 = (x10<<1) | x8; + X(md5int)(m, x10); + } + X(md5int)(m, p->vecsz->rnk); + if (FINITE_RNK(p->vecsz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + P pln; + for (int i = 0; i < p->vecsz->rnk; ++i) { + X(md5INT)(m, p->vecsz->dims[i].n); + x1 = (p->vecsz->dims[i].is == p->vecsz->dims[i].os); + x2 = (p->vecsz->dims[i].is == 2); + x3 = (p->vecsz->dims[i].os == 2); + x4 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENTA); + x9 = (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + fill_iodim(&pln, p); + x1 = (pln.vl > 2); + if (pln.rnk >= 2) + { + int rnk = pln.rnk; + x2 = (pln.d[rnk-2].n == pln.d[rnk-1].n && + pln.d[rnk-2].is == pln.d[rnk-1].os && + pln.d[rnk-2].os == pln.d[rnk-1].is); + x3 = (X(iabs)(pln.d[rnk-2].is) <= X(iabs)(pln.d[rnk-1].is) || + X(iabs)(pln.d[rnk-2].os) <= X(iabs)(pln.d[rnk-1].os)); + x4 = (X(compute_tilesz)(pln.vl, 1) > 4); + x5 = (X(compute_tilesz)(pln.vl, 2) > 4); + + } + else + { + x2 = 0; + x3 = 0; + x4 = 0; + x5 = 0; + } + x9 = (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5int)(m, x9); + } +#else X(tensor_md5)(m, p->sz); X(tensor_md5)(m, p->vecsz); +#endif } static void recur(const iodim *dims, int rnk, R *I) diff --git a/rdft/problem2.c b/rdft/problem2.c index a1445258..e0074fe6 100644 --- a/rdft/problem2.c +++ b/rdft/problem2.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,6 +21,9 @@ #include "dft/dft.h" +#ifdef AMD_FAST_PLANNER +#include "simd-support/simd-common.h" +#endif #include "rdft/rdft.h" #include @@ -42,8 +46,51 @@ static void hash(const problem *p_, md5 *m) X(md5int)(m, X(ialignment_of)(p->cr)); X(md5int)(m, X(ialignment_of)(p->ci)); X(md5int)(m, p->kind); +#ifdef AMD_FAST_PLANNER + X(md5int)(m, p->sz->rnk); + if (FINITE_RNK(p->sz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + for (int i = 0; i < p->sz->rnk; ++i) { + X(md5INT)(m, p->sz->dims[i].n); + x1 = (p->sz->dims[i].is == p->sz->dims[i].os); + x2 = (p->sz->dims[i].is == 2); + x3 = (p->sz->dims[i].os == 2); + x4 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->sz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->sz->dims[i].os * sizeof(R)) % ALIGNMENTA); + if (p->vecsz->rnk > i) + x8 = (p->sz->dims[i].is <= p->vecsz->dims[i].is); + else + x8 = 0; + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + } + X(md5int)(m, p->vecsz->rnk); + if (FINITE_RNK(p->vecsz->rnk)) { + int x1, x2, x3, x4, x5, x6, x7, x8, x9; + for (int i = 0; i < p->vecsz->rnk; ++i) { + X(md5INT)(m, p->vecsz->dims[i].n); + x1 = (p->vecsz->dims[i].is == p->vecsz->dims[i].os); + x2 = (p->vecsz->dims[i].is == 2); + x3 = (p->vecsz->dims[i].os == 2); + x4 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENT); + x5 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENT); + x6 = !((p->vecsz->dims[i].is * sizeof(R)) % ALIGNMENTA); + x7 = !((p->vecsz->dims[i].os * sizeof(R)) % ALIGNMENTA); + if (p->sz->rnk > i) + x8 = (p->vecsz->dims[i].n <= p->sz->dims[i].n); + else + x8 = 0; + x9 = (x8<<7) | (x7<<6) | (x6<<5) | (x5<<4) | (x4<<3) | (x3<<2) | (x2<<1) | x1; + X(md5INT)(m, x9); + } + } +#else X(tensor_md5)(m, p->sz); X(tensor_md5)(m, 
p->vecsz); +#endif } static void print(const problem *ego_, printer *p) From 8beebbe2e53a9af2017024a80948364581082113 Mon Sep 17 00:00:00 2001 From: sraut Date: Tue, 10 Nov 2020 17:10:24 +0000 Subject: [PATCH 3/4] This change includes AOCL version bump and support for various config/compile-time features. 1) AOCL version is changed to 3.0 2) Compilation support for AMD zen3 CPUs is added through the configure script. 3) Cross compilation supported through configure for AMD zen1, zen2 and zen3 CPUs. Additionally, a new configure option AMD_ARCH is provided to set a specific AMD CPU arch option or "auto". 4) Enabled quad precision for AOCC clang compiler with version 10 or above. 5) Environment variable "CC" can now take absolute path when configuring for "Clang" with --enable-amd-opt. 6) Correction in "benchmark-precision" displayed by Test bench binary with "--info-all" for quad-precision. Previously it used to get wrongly displayed as "long-double" for quad-precision. 7) Fixed the support for --enable-debug when used with --enable-amd-opt configure option. 8) README_AMD is updated. This code change relates to Jira task AMD-Internal: [CPUPL-1259] Change-Id: I099b4f5cf4264f5ca39b6dbed762458f5c87669a --- README_AMD.md | 10 +++- api/fftw3.h | 2 +- configure | 114 ++++++++++++++++++++++++++++++++++++----- configure.ac | 102 ++++++++++++++++++++++++++++++++---- libbench2/bench-user.h | 6 ++- 5 files changed, 208 insertions(+), 26 deletions(-) diff --git a/README_AMD.md b/README_AMD.md index 0362dcab..c5b52b05 100644 --- a/README_AMD.md +++ b/README_AMD.md @@ -12,7 +12,9 @@ functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan), improved 256-bit kernels selection by Planner and an optional in-place transpose for large problem sizes. AMD Optimized FFTW improves the performance of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI -transpose function. 
As of AMD FFTW 3.0, a new fast planner is added as an +extension to the original planner that improves planning time of various +planning modes in general and PATIENT mode in particular. FFTW is a free collection of fast C routines for computing the Discrete Fourier Transform and various special cases thereof in one or more @@ -51,6 +53,12 @@ configure option "--enable-generic-simd128" or "--enable-generic-simd256". The optional configure option "--enable-amd-mpifft" enables the MPI FFT related optimizations. +The new fast planner can be enabled using optional configure option +"--enable-amd-fast-planner". It is supported for single and double precisions. + +An optional configure option "AMD_ARCH" is supported that can be set to CPU +architecture values like "auto" or "znver1" or "znver2" for AMD EPYC processors. + An optional configure option "--enable-amd-trans" is provided that may benefit the performance of transpose operations in case of very large FFT problem sizes. This is by default not enabled and provided as an experimental optional switch. 
diff --git a/api/fftw3.h b/api/fftw3.h index 72db8d88..5c5ffa0d 100644 --- a/api/fftw3.h +++ b/api/fftw3.h @@ -462,7 +462,7 @@ FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex) /* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64 for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */ -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || (__clang__ && __clang_major__ >= 10)) \ && !(defined(__ICC) || defined(__INTEL_COMPILER) || defined(__CUDACC__) || defined(__PGI)) \ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__)) # if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) diff --git a/configure b/configure index e8427f53..26737ea2 100755 --- a/configure +++ b/configure @@ -3680,7 +3680,9 @@ else ok=no fi +long_double_supported=no; if test "$ok" = "yes"; then + long_double_supported=yes; if test "$PRECISION" = "s"; then as_fn_error $? "--enable-single/--enable-long-double conflict" "$LINENO" 5 fi @@ -3708,7 +3710,9 @@ else ok=no fi +quad_precision_supported=no; if test "$ok" = "yes"; then + quad_precision_supported=yes; if test "$PRECISION" != "d"; then as_fn_error $? "conflicting precisions specified" "$LINENO" 5 fi @@ -17043,11 +17047,40 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_4_6_0" >&5 $as_echo "$ax_cv_gcc_4_6_0" >&6; } if test "$ax_cv_gcc_4_6_0" = yes; then - : + gcc_supported=1 else - as_fn_error $? "gcc 4.6 or later required for quad precision support" "$LINENO" 5 + gcc_supported=0 fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using clang 10 or later" >&5 +$as_echo_n "checking whether we are using clang 10 or later... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + #ifdef __clang__ + #if __clang_major__ < 10 + #error Clang 10 or later is recommended for Quad precision + #endif + #else + #error Other compiler unsupported + #endif + +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + clang_supported=1 +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + clang_supported=0 +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if test "$gcc_supported" = "1" || test "$clang_supported" = "1"; then + : + else + as_fn_error $? "gcc 4.6 or later OR clang 10 or later required for quad precision support" "$LINENO" 5 + fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sinq in -lquadmath" >&5 $as_echo_n "checking for sinq in -lquadmath... " >&6; } if ${ac_cv_lib_quadmath_sinq+:} false; then : @@ -17452,17 +17485,74 @@ else have_amd_opt=no fi -if test "$have_amd_opt" = yes; then +if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then + AMDZENFAMILY=$(expr `cat /proc/cpuinfo | grep -m1 family|cut -f2 -d:`) + AMDZENMODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:`) if test "$ac_test_CFLAGS" != "set"; then - if test "$CC" = "clang"; then + if [ "$AMD_ARCH" = "auto" ]; then + AMD_ARCH="" + fi + SUBSTRCLANG='clang' + SUBSTRGCC='gcc' + if grep -q "$SUBSTRCLANG" <<<"$CC"; then CFLAGS="$CFLAGS -mavx2 -mfma" - else - GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) - AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) - if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif grep -q "$SUBSTRGCC" <<<"$CC"; then + GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`) + case "$AMDZENFAMILY" in + "23") + if [ -z "${AMD_ARCH}" ]; then + if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -ge "48" ]; then + 
CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + "25") + if [ -z "${AMD_ARCH}" ]; then + if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -le "15" ]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif [ "$GCCVERSION" -lt "9" ] && [ "$AMDZENMODEL" -le "15" ]; then + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + *) + if [ -z "${AMD_ARCH}" ]; then + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + esac + fi + else + if [ -n "${AMD_ARCH}" ]; then + if [ "$AMD_ARCH" = "auto" ]; then + case "$AMDZENFAMILY" in + "23") + if [ "$AMDZENMODEL" -ge "48" ]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -march=znver1" + fi + ;; + "25") + if [ "$AMDZENMODEL" -le "15" ]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -mavx2" + fi + ;; + esac else - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + CFLAGS="$CFLAGS 
-march=$AMD_ARCH" fi fi fi @@ -17495,7 +17585,7 @@ $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h fi -$as_echo "#define AOCL_FFTW_VERSION \"AOCL-2.2.1\"" >>confdefs.h +$as_echo "#define AOCL_FFTW_VERSION \"AOCL-3.0\"" >>confdefs.h # Check whether --enable-amd-fast-planner was given. if test "${enable_amd_fast_planner+set}" = set; then : @@ -17504,7 +17594,7 @@ else have_amd_fast_planner=no fi -if test "$have_amd_fast_planner" = yes; then +if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then $as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h diff --git a/configure.ac b/configure.ac index e98083cb..5f636da6 100644 --- a/configure.ac +++ b/configure.ac @@ -74,7 +74,9 @@ fi AM_CONDITIONAL(SINGLE, test "$ok" = "yes") AC_ARG_ENABLE(long-double, [AC_HELP_STRING([--enable-long-double],[compile fftw in long-double precision])], ok=$enableval, ok=no) +long_double_supported=no; if test "$ok" = "yes"; then + long_double_supported=yes; if test "$PRECISION" = "s"; then AC_MSG_ERROR([--enable-single/--enable-long-double conflict]) fi @@ -85,7 +87,9 @@ fi AM_CONDITIONAL(LDOUBLE, test "$ok" = "yes") AC_ARG_ENABLE(quad-precision, [AC_HELP_STRING([--enable-quad-precision],[compile fftw in quadruple precision if available])], ok=$enableval, ok=no) +quad_precision_supported=no; if test "$ok" = "yes"; then + quad_precision_supported=yes; if test "$PRECISION" != "d"; then AC_MSG_ERROR([conflicting precisions specified]) fi @@ -563,7 +567,26 @@ AC_FUNC_VPRINTF AC_CHECK_LIB(m, sin) if test $PRECISION = q; then - AX_GCC_VERSION(4,6,0,[],[AC_MSG_ERROR([gcc 4.6 or later required for quad precision support])]) + AX_GCC_VERSION(4,6,0,[gcc_supported=1],[gcc_supported=0]) + AC_MSG_CHECKING([whether we are using clang 10 or later]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ + #ifdef __clang__ + #if __clang_major__ < 10 + #error Clang 10 or later is recommended for Quad precision + #endif + #else + #error Other compiler 
unsupported + #endif + ]])], + [AC_MSG_RESULT([yes]) + clang_supported=1], + [AC_MSG_RESULT([no]) + clang_supported=0]) + if test "$gcc_supported" = "1" || test "$clang_supported" = "1"; then + : + else + AC_MSG_ERROR([gcc 4.6 or later OR clang 10 or later required for quad precision support]) + fi AC_CHECK_LIB(quadmath, sinq, [], [AC_MSG_ERROR([quad precision requires libquadmath for quad-precision trigonometric routines])]) LIBQUADMATH=-lquadmath fi @@ -616,17 +639,74 @@ fi dnl amd optimization switch and CFLAGS setting based on config arg option --enable-amd-opt AC_ARG_ENABLE(amd-opt, [AC_HELP_STRING([--enable-amd-opt],[enable AMD cpu specific optimizations])], have_amd_opt=$enableval, have_amd_opt=no) -if test "$have_amd_opt" = yes; then +if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then + AMDZENFAMILY=$(expr `cat /proc/cpuinfo | grep -m1 family|cut -f2 -d:`) + AMDZENMODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:`) if test "$ac_test_CFLAGS" != "set"; then - if test "$CC" = "clang"; then + if [[ "$AMD_ARCH" = "auto" ]]; then + AMD_ARCH="" + fi + SUBSTRCLANG='clang' + SUBSTRGCC='gcc' + if grep -q "$SUBSTRCLANG" <<<"$CC"; then CFLAGS="$CFLAGS -mavx2 -mfma" - else - GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) - AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) - if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif grep -q "$SUBSTRGCC" <<<"$CC"; then + GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`) + case "$AMDZENFAMILY" in + "23") + if [[ -z "${AMD_ARCH}" ]]; then + if [[ "$GCCVERSION" -ge "9" ]] && [[ "$AMDZENMODEL" -ge "48" ]]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 
-mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + "25") + if [[ -z "${AMD_ARCH}" ]]; then + if [[ "$GCCVERSION" -ge "9" ]] && [[ "$AMDZENMODEL" -le "15" ]]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif [[ "$GCCVERSION" -lt "9" ]] && [[ "$AMDZENMODEL" -le "15" ]]; then + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + *) + if [[ -z "${AMD_ARCH}" ]]; then + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + ;; + esac + fi + else + if [[ -n "${AMD_ARCH}" ]]; then + if [[ "$AMD_ARCH" = "auto" ]]; then + case "$AMDZENFAMILY" in + "23") + if [[ "$AMDZENMODEL" -ge "48" ]]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -march=znver1" + fi + ;; + "25") + if [[ "$AMDZENMODEL" -le "15" ]]; then + CFLAGS="$CFLAGS -march=znver2" + else + CFLAGS="$CFLAGS -mavx2" + fi + ;; + esac else - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + CFLAGS="$CFLAGS -march=$AMD_ARCH" fi fi fi @@ -643,10 +723,10 @@ if test "$have_amd_mpifft" = yes; then AC_DEFINE(AMD_OPT_MPIFFT,1,[Define to enable AMD cpu optimized MPI FFT.]) fi dnl 
aocl version number of amd-fftw -AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-2.2.1",[AOCL Version of AMD-FFTW]) +AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-3.0",[AOCL Version of AMD-FFTW]) dnl amd optimization switch to enable AMD Fast Planner for AMD cpus --enable-amd-fast-planner AC_ARG_ENABLE(amd-fast-planner, [AC_HELP_STRING([--enable-amd-fast-planner],[enable AMD Fast Planner for a faster planning time on AMD cpus])], have_amd_fast_planner=$enableval, have_amd_fast_planner=no) -if test "$have_amd_fast_planner" = yes; then +if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then AC_DEFINE(AMD_OPT_FAST_PLANNER,1,[Define to enable AMD Fast Planner for AMD cpus.]) fi diff --git a/libbench2/bench-user.h b/libbench2/bench-user.h index 89b2337d..951de1df 100644 --- a/libbench2/bench-user.h +++ b/libbench2/bench-user.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2001 Matteo Frigo * Copyright (c) 2001 Massachusetts Institute of Technology - * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -58,7 +58,11 @@ typedef bench_real bench_complex[2]; #undef SINGLE_PRECISION #define SINGLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(float)) #undef LDOUBLE_PRECISION +#ifdef BENCHFFT_LDOUBLE #define LDOUBLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(long double)) +#else +#define LDOUBLE_PRECISION 0 +#endif #undef QUAD_PRECISION #ifdef BENCHFFT_QUAD From 7033737baca17a5fd4fae6730ee43c67a46ffef0 Mon Sep 17 00:00:00 2001 From: sraut Date: Wed, 18 Nov 2020 04:41:48 +0000 Subject: [PATCH 4/4] Changed the behaviour of optimization switch AMD_OPT_PREFER_256BIT_FPU. Disabled by default. It should be turned on explicitly when WISDOM feature is not used. 
Change-Id: I92c2929fa329a44cfe83b5a8929b05fcb66ab420 --- kernel/ifftw.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/ifftw.h b/kernel/ifftw.h index ed19571a..7b8a2977 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -87,7 +87,8 @@ extern "C" //-------------------------------- //disables 128-bit AVX2 versions of kernels and prefers only 256-bit AVX2 kernels support -#define AMD_OPT_PREFER_256BIT_FPU +//This optimization switch is disabled by default. If it is enabled, the WISDOM feature must not be used. +//#define AMD_OPT_PREFER_256BIT_FPU #define AMD_OPT_128BIT_KERNELS_THRESHOLD 1024//Below this SIZE, 128-bit AVX2 kernels allowed //-------------------------------- //CPY2d related optimizations :- enable Either (i)C switch Or (ii)INTRIN switch