From b17026fee14eaa7c422cdd6006aa30fcde97ff02 Mon Sep 17 00:00:00 2001 From: sraut Date: Tue, 27 Jul 2021 15:11:48 +0530 Subject: [PATCH 1/5] This code change provides windows support for advanced AMD MPI optimizations and few bug fixes. 1) New fast AMD MPI transpose method is supported on windows. 2) Minor performance regression as observed in few cases is fixed. 3) Compile time error seen in Visual studio clang based build for windows is fixed - Unused linux system defined struct variables were causing this and they have been removed. This code change contains fix for the Jira issue AMD-Internal: [CPUPL-1715] Change-Id: Id4cbfc2c3e62fddc6cbbdfb793b6669651296b36 --- CMakeLists.txt | 8 +++++++- kernel/ifftw.h | 4 ++-- mpi/transpose-blk-scheme1.c | 2 -- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d2e623..320de960 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,8 @@ option (ENABLE_AVX "Compile with AVX instruction set support" OFF) option (ENABLE_AVX2 "Compile with AVX2 instruction set support" ON) option (ENABLE_AMD_OPT "Enable AMD specific optimization" OFF) -option (ENABLE_AMD_MPIFFT "Compile with AMD MPIFFT support" OFF) +option (ENABLE_AMD_MPIFFT "Enable AMD MPI FFT optimizations" OFF) +option (ENABLE_AMD_MPI_VADER_LIMIT "Enable advanced AMD MPI optimizations for in-place transpose based on VADER LIMIT" OFF) option (ENABLE_MPI "compile FFTW MPI library" OFF) option (ENABLE_AMD_TRANS "Enable amd optimized transpose" OFF) @@ -87,6 +88,10 @@ if(ENABLE_AMD_MPIFFT) add_definitions(-DAMD_OPT_MPIFFT) endif() +if(ENABLE_AMD_MPI_VADER_LIMIT) + add_definitions(-DAMD_MPI_VADER_LIMIT_SET) +endif() + if(ENABLE_MPI) if(ENABLE_QUAD_PRECISION) message (FATAL_ERROR "quad precision is not supported in MPI") @@ -127,6 +132,7 @@ message(ENABLE_AVX : ${ENABLE_AVX}) message(ENABLE_AVX2 : ${ENABLE_AVX2}) message(ENABLE_AMD_OPT : ${ENABLE_AMD_OPT}) message(ENABLE_AMD_MPIFFT : ${ENABLE_AMD_MPIFFT}) 
+message(ENABLE_AMD_MPI_VADER_LIMIT : ${ENABLE_AMD_MPI_VADER_LIMIT}) message(ENABLE_MPI : ${ENABLE_MPI}) message(ENABLE_AMD_TRANS : ${ENABLE_AMD_TRANS}) message(DISABLE_FORTRAN : ${DISABLE_FORTRAN}) diff --git a/kernel/ifftw.h b/kernel/ifftw.h index 098bb4a4..fc5a2f10 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -98,7 +98,7 @@ extern "C" #define AMD_OPT_USE_MEMCPY_TO_CPY //Below switch enables the unrolling of memory read and write SIMD operations in cpy2d routine. #if (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE)) -#define AMD_OPT_UNROLL_CPY2D +//#define AMD_OPT_UNROLL_CPY2D #endif //-------------------------------- //In-place Transpose related optimization switches :- @@ -125,7 +125,7 @@ extern "C" // (for this optimization switch, AMD_OPT_AUTO_TUNED_TRANS_BLK_SIZE should also be enabled) //#define AMD_OPT_AUTO_TUNED_RASTER_TILED_TRANS_METHOD //The below switch enables AMD optimizations for the in-place square transpose routine. -#define AMD_OPT_IN_PLACE_SQU_TRANS +//#define AMD_OPT_IN_PLACE_SQU_TRANS //The below switch enables AMD optimizations for the in-situ Toms513 algorithm. #define AMD_OPT_TOMS513_TRANS //-------------------------------- diff --git a/mpi/transpose-blk-scheme1.c b/mpi/transpose-blk-scheme1.c index 6712239c..d4754b2e 100644 --- a/mpi/transpose-blk-scheme1.c +++ b/mpi/transpose-blk-scheme1.c @@ -24,7 +24,6 @@ #include "mpi-transpose.h" #include -#include typedef struct { solver super; @@ -59,7 +58,6 @@ static void transpose_chunks(int *sched, int n_pes, int my_pe, int other, size, bufSize_s, blksize_s, totSize_s, bufSize_r, blksize_r, totSize_r; R *sendbuf, *recvbuf; MPI_Status status; - struct timeval t1,t2; //n_pes is the no. 
of ranks tp paralelly communicate -> blocksize //blksize = VADER_LIMIT;//(n_pes*VADER_LIMIT)/(bufSize*sizeof(R));; From 718d7d7e01cdf701db6e80b80a1ae4e33a98e787 Mon Sep 17 00:00:00 2001 From: sraut Date: Thu, 5 Aug 2021 20:42:54 +0000 Subject: [PATCH 2/5] This code change adds AMD application optimization layer to uplift performance of HPC and scientific applications. 1) This optimization layer configures the library to use FFTW_PATIENT setup mode in such a way that the best FFT performance is achieved with a negligibly small planning cost. 2) It also implements a separate memory region for the planner to avoid overwriting of input buffer when planning with FFTW_PATIENT setup mode for in-place problems. 3) This optimization layer reads the wisdom file when available otherwise creates it when its associated switch AMD_APP_OPT_GENERATE_WISDOM is enabled. New directory structure named wisdom_files is created to make available pre-optimized wisdom files for direct use. Pre-optimized wisdom file for Ta2O5 test case of QE application is added. 4) A new configure option --enable-amd-app-opt is added to enable this AMD application optimization layer. 5) This optimization layer is supported for complex DFT problem types in single-threaded mode for now. 
This code change relates to Jira task AMD-Internal : [CPUPL-1740] Change-Id: Ibdec2932a54a58d3482b0fb3b5cb8b56d716bd42 --- api/apiplan.c | 282 ++++++++++++++++++++++++++++- config.h.in | 3 + configure | 17 ++ configure.ac | 5 + kernel/ifftw.h | 13 ++ kernel/planner.c | 2 + wisdom_files/zen3/QE/Ta2O5/wis.dat | 90 +++++++++ 7 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 wisdom_files/zen3/QE/Ta2O5/wis.dat diff --git a/api/apiplan.c b/api/apiplan.c index eb8d477d..c6b754fc 100644 --- a/api/apiplan.c +++ b/api/apiplan.c @@ -21,6 +21,13 @@ #include "api/api.h" +#ifdef AMD_APP_OPT_LAYER +#include "kernel/ifftw.h" +#include "dft/dft.h" +#include "rdft/rdft.h" + +static int wisdom_one_time_read = 0; +#endif static planner_hook_t before_planner_hook = 0, after_planner_hook = 0; void X(set_planner_hooks)(planner_hook_t before, planner_hook_t after) @@ -145,7 +152,107 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) FFTW_PATIENT, FFTW_EXHAUSTIVE}; int pat, pat_max; double pcost = 0; +#ifdef AMD_APP_OPT_LAYER + R *ri, *ii, *ro, *io; + int isz, osz, inplace = 0; + int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0; + + flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE); + flags |= FFTW_PATIENT; + if (wisdom_one_time_read == 0) + { + if (!X(import_wisdom_from_filename)("wis.dat")) + { + fprintf(stderr, "apiplan: ERROR reading wisdom wis.dat\n"); + } +#ifndef AMD_APP_OPT_GENERATE_WISDOM + wisdom_one_time_read = 1; +#endif + } + if(prb->adt->problem_kind == PROBLEM_DFT) + { + problem_dft *pdft = (problem_dft *) prb; + isz = 1; + osz = 1; + if (FINITE_RNK(pdft->sz->rnk)) + { + for (int i = 0; i < pdft->sz->rnk; ++i) + { + const iodim *q = pdft->sz->dims + i; + isz *= (q->n); + osz *= (q->n); + } + } + if (FINITE_RNK(pdft->vecsz->rnk)) + { + for (int i = 0; i < pdft->vecsz->rnk; ++i) + { + const iodim *q = pdft->vecsz->dims + i; + isz *= (q->n); + osz *= (q->n); + } + } 
+#ifdef AMD_APP_LAYER_API_LOGS + printf("start-QE: %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n); + printf("start-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); +#endif + align_bytes = (2 * sizeof(R))-1; + if (((ptrdiff_t)pdft->ri) & align_bytes) + in_alignment = 0; + + ri = pdft->ri; + ii = pdft->ii; + inplace = (pdft->ri == pdft->ro); + pdft->ri = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R)); + + if (((ptrdiff_t)pdft->ri) & align_bytes) + cur_alloc_alignment = 0; + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + pdft->ri += 1; + iaddr_changed = 1; + } + + pdft->ii = pdft->ri + 1; + if (inplace) + { + pdft->ro = pdft->ri; + pdft->io = pdft->ii; + } + else + { +#ifdef AMD_APP_OPT_OUT_BUFFER_MEM + ro = pdft->ro; + io = pdft->io; + in_alignment = 1; + cur_alloc_alignment = 1; + if (((ptrdiff_t)pdft->ro) & align_bytes) + in_alignment = 0; + pdft->ro = (R *) malloc((osz * sizeof(R) * 2) + sizeof(R)); + if (((ptrdiff_t)pdft->ro) & align_bytes) + cur_alloc_alignment = 0; + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + pdft->ro += 1; + oaddr_changed = 1; + } + pdft->io = pdft->ro + 1; +#endif + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("start-FFTW: (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->ri, pdft->ii, pdft->ro, pdft->io); + printf("%x, %x; %x, %x\n", ri, ii, ro, io); +#endif + } + else + { + fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind\n"); + return NULL; + } +#endif if (before_planner_hook) before_planner_hook(); @@ -225,6 +332,43 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) if (after_planner_hook) after_planner_hook(); +#ifdef AMD_APP_OPT_LAYER + if (wisdom_one_time_read == 0) + { +#ifndef AMD_APP_OPT_GENERATE_WISDOM + wisdom_one_time_read = 1; +#endif + X(export_wisdom_to_filename)("wis.dat"); + } + + if(prb->adt->problem_kind == 
PROBLEM_DFT) + { + problem_dft *pdft = (problem_dft *) prb; + if (iaddr_changed) + pdft->ri -= 1; + free(pdft->ri); + pdft->ri = ri; + pdft->ii = ii; + if (inplace) + { + pdft->ro = ri; + pdft->io = ii; + } + else + { +#ifdef AMD_APP_OPT_OUT_BUFFER_MEM + if (oaddr_changed) + pdft->ro -= 1; + free(pdft->ro); + pdft->ro = ro; + pdft->io = io; +#endif + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("end-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); +#endif + } +#endif return p; } @@ -239,7 +383,107 @@ apiplan *X(mkapiplan_ex)(int sign, unsigned flags, int n, problem *prb) FFTW_PATIENT, FFTW_EXHAUSTIVE}; int pat, pat_max; double pcost = 0; +#ifdef AMD_APP_OPT_LAYER + R *ri, *ii, *ro, *io; + int isz, osz, inplace = 0; + int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0; + flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE); + flags |= FFTW_PATIENT; + if (wisdom_one_time_read == 0) + { + if (!X(import_wisdom_from_filename)("wis.dat")) + { + fprintf(stderr, "apiplan_ex: ERROR reading wisdom wis.dat\n"); + } +#ifndef AMD_APP_OPT_GENERATE_WISDOM + wisdom_one_time_read = 1; +#endif + } + + if(prb->adt->problem_kind == PROBLEM_DFT) + { + problem_dft *pdft = (problem_dft *) prb; + isz = 1; + osz = 1; + if (FINITE_RNK(pdft->sz->rnk)) + { + for (int i = 0; i < pdft->sz->rnk; ++i) + { + const iodim *q = pdft->sz->dims + i; + isz *= (q->n); + osz *= (q->n); + } + } + if (FINITE_RNK(pdft->vecsz->rnk)) + { + for (int i = 0; i < pdft->vecsz->rnk; ++i) + { + const iodim *q = pdft->vecsz->dims + i; + isz *= (q->n); + osz *= (q->n); + } + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("start_ex-QE: %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n); + printf("start_ex-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); +#endif + align_bytes = (2 * sizeof(R))-1; + if (((ptrdiff_t)pdft->ri) & align_bytes) + in_alignment = 0; + + ri = pdft->ri; + ii = 
pdft->ii; + inplace = (pdft->ri == pdft->ro); + pdft->ri = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R)); + + if (((ptrdiff_t)pdft->ri) & align_bytes) + cur_alloc_alignment = 0; + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + pdft->ri += 1; + iaddr_changed = 1; + } + + pdft->ii = pdft->ri + 1; + if (inplace) + { + pdft->ro = pdft->ri; + pdft->io = pdft->ii; + } + else + { +#ifdef AMD_APP_OPT_OUT_BUFFER_MEM + ro = pdft->ro; + io = pdft->io; + in_alignment = 1; + cur_alloc_alignment = 1; + if (((ptrdiff_t)pdft->ro) & align_bytes) + in_alignment = 0; + pdft->ro = (R *) malloc((osz * sizeof(R) * 2) + sizeof(R)); + if (((ptrdiff_t)pdft->ro) & align_bytes) + cur_alloc_alignment = 0; + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + pdft->ro += 1; + oaddr_changed = 1; + } + pdft->io = pdft->ro + 1; +#endif + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("start_ex-FFTW: (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->ri, pdft->ii, pdft->ro, pdft->io); + printf("%x, %x; %x, %x\n", ri, ii, ro, io); +#endif + } + else + { + fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind\n"); + return NULL; + } +#endif if (before_planner_hook) before_planner_hook(); @@ -318,7 +562,43 @@ apiplan *X(mkapiplan_ex)(int sign, unsigned flags, int n, problem *prb) if (after_planner_hook) after_planner_hook(); - +#ifdef AMD_APP_OPT_LAYER + if (wisdom_one_time_read == 0) + { +#ifndef AMD_APP_OPT_GENERATE_WISDOM + wisdom_one_time_read = 1; +#endif + X(export_wisdom_to_filename)("wis.dat"); + } + + if(prb->adt->problem_kind == PROBLEM_DFT) + { + problem_dft *pdft = (problem_dft *) prb; + if (iaddr_changed) + pdft->ri -= 1; + free(pdft->ri); + pdft->ri = ri; + pdft->ii = ii; + if (inplace) + { + pdft->ro = ri; + pdft->io = ii; + } + else + { +#ifdef AMD_APP_OPT_OUT_BUFFER_MEM + if (oaddr_changed) + pdft->ro -= 1; + free(pdft->ro); + pdft->ro = ro; + pdft->io = io; 
+#endif + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("end_ex-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); +#endif + } +#endif return p; } #endif diff --git a/config.h.in b/config.h.in index e600cd77..636089d5 100644 --- a/config.h.in +++ b/config.h.in @@ -1,5 +1,8 @@ /* config.h.in. Generated from configure.ac by autoheader. */ +/* Enable AMD application optimization layer. */ +#undef AMD_APP_OPT_LAYER + /* Set VADER LIMIT in order to enable new AMD MPI transpose algorithms. */ #undef AMD_MPI_VADER_LIMIT_SET diff --git a/configure b/configure index 64bbf28c..6df732bc 100755 --- a/configure +++ b/configure @@ -876,6 +876,7 @@ enable_libtool_lock enable_mpi enable_amd_opt enable_amd_mpi_vader_limit +enable_amd_app_opt enable_amd_trans enable_amd_mpifft enable_openmp @@ -1590,6 +1591,10 @@ Optional Features: --enable-amd-mpi-vader-limit enable setting of VADER LIMIT that controls enabling of new AMD MPI transpose algorithms + --enable-amd-app-opt enable AMD application optimization layer to achieve + best FFT performance with a negligibly small + planning time targeted for HPC and scientific + applications --enable-amd-trans enable AMD cpu optimized Transpose --enable-amd-mpifft enable AMD cpu optimized MPI FFT --enable-openmp use OpenMP directives for parallelism @@ -17581,6 +17586,18 @@ $as_echo "#define AMD_MPI_VADER_LIMIT_SET 1" >>confdefs.h fi + # Check whether --enable-amd-app-opt was given. +if test "${enable_amd_app_opt+set}" = set; then : + enableval=$enable_amd_app_opt; have_amd_app_opt=$enableval +else + have_amd_app_opt=no +fi + + if test "$have_amd_app_opt" = yes ; then + +$as_echo "#define AMD_APP_OPT_LAYER 1" >>confdefs.h + + fi fi # Check whether --enable-amd-trans was given. 
if test "${enable_amd_trans+set}" = set; then : diff --git a/configure.ac b/configure.ac index 7e18f610..300c8e7b 100644 --- a/configure.ac +++ b/configure.ac @@ -718,6 +718,11 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then AC_DEFINE(AMD_MPI_VADER_LIMIT_SET,1,[Set VADER LIMIT in order to enable new AMD MPI transpose algorithms.]) fi + dnl amd switch to enable AMD's application optimization layer to achieve best FFT performance with a negligibly small planning time targeted for HPC and scientific applications. + AC_ARG_ENABLE(amd-app-opt, [AC_HELP_STRING([--enable-amd-app-opt],[enable AMD application optimization layer to achieve best FFT performance with a negligibly small planning time targeted for HPC and scientific applications])], have_amd_app_opt=$enableval, have_amd_app_opt=no) + if test "$have_amd_app_opt" = yes ; then + AC_DEFINE(AMD_APP_OPT_LAYER,1,[Enable AMD application optimization layer.]) + fi fi dnl amd optimization switch to enable amd cpu optimized transpose --enable-amd-trans AC_ARG_ENABLE(amd-trans, [AC_HELP_STRING([--enable-amd-trans],[enable AMD cpu optimized Transpose])], have_amd_trans=$enableval, have_amd_trans=no) diff --git a/kernel/ifftw.h b/kernel/ifftw.h index fc5a2f10..e0fd1b01 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -176,11 +176,24 @@ extern "C" #define AMD_OPT_TOP_N 3 //The value of AMD_OPT_TOP_N is fixed as 3, enabling the search, store and re-use of Top 3 plans. This value should not be changed by the user. #endif //-------------------------------- +//Below switches and flags enable/disable and control AMD's separate optimization layer for Applications like QE, VASP, etc. +//With this optimization layer, the fastest FFT execution is achieved by using OPATIENT plan wherein the associated planning cost is kept minimal with the use of in-memory HASH table. +//This optimization layer also implements a separate memory region for planner to deal with OPATIENT planning of in-place problems. 
+#ifdef AMD_APP_OPT_LAYER //AMD's application optimization layer +//Enable/disable separate memory even for output buffer in case of out-of-place FFT +//#define AMD_APP_OPT_OUT_BUFFER_MEM +//Enable this switch to generate wisdom file for the first time for the application. +#define AMD_APP_OPT_GENERATE_WISDOM +//Debug print logs for the application optimization layer +//#define AMD_APP_LAYER_API_LOGS +#endif +//-------------------------------- #endif//#ifdef AMD_OPT_ALL //Below is a manual switch to control VADER LIMIT //This is upper limit that each process/rank can send in bytes to the receiver process/rank with buffers for receiving them //without any synchronization on completion status. #define VADER_LIMIT 8000//8000//4000//500 + //============================================================ //AMD OPTIMIZATIONS :- end diff --git a/kernel/planner.c b/kernel/planner.c index 06ba7fb7..9079a410 100644 --- a/kernel/planner.c +++ b/kernel/planner.c @@ -1061,8 +1061,10 @@ static void forget(planner *ego, amnesia a) mkhashtab(&ego->htab_blessed); /* fall through */ case FORGET_ACCURSED: +#ifndef AMD_APP_OPT_LAYER htab_destroy(&ego->htab_unblessed); mkhashtab(&ego->htab_unblessed); +#endif break; default: break; diff --git a/wisdom_files/zen3/QE/Ta2O5/wis.dat b/wisdom_files/zen3/QE/Ta2O5/wis.dat new file mode 100644 index 00000000..1c032515 --- /dev/null +++ b/wisdom_files/zen3/QE/Ta2O5/wis.dat @@ -0,0 +1,90 @@ +(fftw-3.3.8 fftw_wisdom #x11e9e424 #x58b2578a #xb40cbc96 #xc2f4b464 + (fftw_dft_vrank_geq1_register 1 #x11048 #x11048 #x0 #x558eaa83 #xe52682c5 #x35fe4d7b #x58f06a3b) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xae65e704 #x9c88fea3 #x5f5c411a #x2f2bb07f) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xe87708a8 #xc11b1afc #x2956c389 #x92725956) + (fftw_codelet_t1bv_6_avx2 0 #x11048 #x11048 #x0 #x4e58fde5 #x2fec426c #x794f0653 #xb0169970) + (fftw_codelet_t1fv_15_avx 0 #x11048 #x11048 #x0 #x8b2c2a79 #xb2dbbc30 #x855ae4c8 #x37140566) + 
(fftw_codelet_t1_6 0 #x11048 #x11048 #x0 #xacd01375 #xf5cc1f6b #xe5a2a175 #x08bcdefd) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x5483257b #xa8cad0e0 #x8f51eb85 #x24a09c76) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xe241b8af #x0272adf5 #xc607fd1b #x0698dedf) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xcbb0650b #x32fc1eb2 #xa5d339f9 #x8c085c8a) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xc2a965e7 #x18d12abc #x23d11518 #x4857e5e3) + (fftw_codelet_t1bv_15_avx 0 #x11048 #x11048 #x0 #x04b4d980 #xfccb8ddc #x03e9df66 #xd67d74c4) + (fftw_codelet_n1_6 0 #x11048 #x11048 #x0 #x446c3f32 #xb9071d46 #xdc009af1 #x502015bb) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x82c54af5 #x52fb0277 #xed93e189 #xfab91897) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x295dd36d #x06460ba5 #xa400eb69 #x52762a91) + (fftw_codelet_t1bv_6_avx2 0 #x11048 #x11048 #x0 #xf7d7f492 #xf824ace0 #xfbe36fa0 #x42f813b4) + (fftw_codelet_n1_12 0 #x11048 #x11048 #x0 #xe85a6d59 #x38a46a12 #xf607513d #x713192d8) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xa2f1fa2a #xd421618e #x18a57936 #x92f54214) + (fftw_codelet_n1_15 0 #x11048 #x11048 #x0 #x07b632c4 #xb959cb42 #xda372586 #x2a21ea6c) + (fftw_dft_vrank_geq1_register 1 #x11048 #x11048 #x0 #x912a9b3a #xc6f97a4c #xbb25865e #x4b7d289a) + (fftw_codelet_t1bv_15_avx2 0 #x11048 #x11048 #x0 #x12e1b9f5 #x9010414c #x4e5f847c #xd1dd282b) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xa98d3cc6 #x8f170111 #xc730d583 #x8cacfd95) + (fftw_codelet_n1_20 0 #x11048 #x11048 #x0 #x3873ed60 #xf56d5e36 #xe8271d90 #x60dc6aa6) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x43e6e5af #x57c3dba2 #x1425c725 #xf98a23a2) + (fftw_codelet_n1bv_12_avx 0 #x11048 #x11048 #x0 #x4d7bfca6 #x53a818c0 #xd6520e39 #xeb78b2f2) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x4961b829 #xbd7ca5a9 #x6604ca5e #xcf5139e5) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x96ef3db3 #x50c2fa88 #x5d9e538e 
#x3481802f) + (fftw_codelet_t1fv_15_avx2 0 #x11048 #x11048 #x0 #x01597929 #xdb4bff5b #xcb6b813e #x2e3f0981) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xd9cf98d0 #x2208cb28 #xc6eedd38 #x7680adea) + (fftw_codelet_n2bv_12_avx2 0 #x11048 #x11048 #x0 #x7e81ba20 #x090345ee #x12be22a0 #x068ddda8) + (fftw_codelet_t1_6 0 #x11048 #x11048 #x0 #xb8495cd5 #x25625cbd #xb732b01d #xb03a79bc) + (fftw_codelet_t2fv_10_avx 0 #x11048 #x11048 #x0 #xffd09777 #x6a22c421 #x26e23624 #x55b13d2a) + (fftw_codelet_t1_3 0 #x11048 #x11048 #x0 #x1f7034a2 #x928b1d60 #x8dd09ec5 #xb03655c3) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x64763000 #x7251ac71 #x0ea4176b #x876904e9) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x6665f450 #xd1b51a08 #x6b533846 #x0eb92a8e) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x022e8f14 #xad8efd86 #x1dfe6d63 #xfe589b0c) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x0ae5f3b6 #xa342356a #xbf6cc355 #x02464afe) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x9e6fbf16 #x15d800ad #x0a28b1f4 #x66efa28a) + (fftw_codelet_n1_2 0 #x11048 #x11048 #x0 #x91766a2c #xde138f40 #xca78a6ca #x42e1cd52) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xeb74a43e #xddfb0feb #xb67b50b6 #x131948b8) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x92b8ebf8 #x636bde66 #x2a6ee626 #x495f3f1c) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xd1af6240 #xd9b1dd41 #x4b5564fb #xdb127633) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x236688f1 #x6bdc4b2f #x4edff171 #xbe8d749a) + (fftw_codelet_t2bv_10_avx 0 #x11048 #x11048 #x0 #xd60059c7 #x1f6e2c25 #x82b4eb9f #x76b5c992) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x0da03563 #x5c3768d6 #xbbb02c41 #xb1ea8c64) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x487a73f3 #xb5b31975 #xaf5e918a #x2deb6f5e) + (fftw_codelet_t2bv_20_avx2 0 #x11048 #x11048 #x0 #xe0c5c73a #x1456fbcd #x259fb340 #x28f6aed1) + (fftw_codelet_t1fv_6_avx2 0 #x11048 #x11048 #x0 
#xe1baaed0 #x6d055fdd #x8dcb9e69 #x105ba8de) + (fftw_codelet_t1fv_3_avx2 0 #x11048 #x11048 #x0 #x100d9a1e #xa9d88456 #x81496e58 #xb7e6cf40) + (fftw_codelet_n2bv_10_avx2 0 #x11048 #x11048 #x0 #x839ac080 #x43811e16 #x4ba0e2a1 #x5c379b7f) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x6c0abbde #xbb45af3f #x40b0b8c3 #x474e2a6a) + (fftw_codelet_t1_6 0 #x11048 #x11048 #x0 #xea861d0c #xd15f42f5 #xe397e1bd #x329bf57d) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x1040e2d1 #x4c087e80 #xee99de64 #x5d25cdd5) + (fftw_codelet_n2fv_20_avx2 0 #x11048 #x11048 #x0 #x45fdf914 #x6227fa3d #xfb313927 #xe88c168a) + (fftw_codelet_t2bv_20_avx2 0 #x11048 #x11048 #x0 #x3e0ac293 #x4dd02770 #xcb04bab7 #xd79b3376) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x3e60bc6b #xdec14bd0 #x95a6d126 #xe3c92340) + (fftw_codelet_t1bv_3_avx 0 #x11048 #x11048 #x0 #xeec67df8 #xd0cb4ecd #xa6e28733 #x3b279783) + (fftw_codelet_n1_6 0 #x11048 #x11048 #x0 #xd65170e5 #x0185a2ff #x8697cc89 #xeffded40) + (fftw_codelet_t1_2 0 #x11048 #x11048 #x0 #x7cfc153d #x636c8e3e #x5a7f991b #xdb511d65) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xe5dcb7e7 #xe4e6643f #xbb8ea22b #x9cd81e89) + (fftw_dft_vrank_geq1_register 1 #x11048 #x11048 #x0 #xd2616a6d #x595d4b2f #x47522124 #xb8a466e0) + (fftw_codelet_t1bv_12_avx2 0 #x11048 #x11048 #x0 #xd4a9cd8b #x5fb115ae #x98e90cb8 #xd9a2038e) + (fftw_codelet_t1_10 0 #x11048 #x11048 #x0 #xd153ca6b #x679edd9c #x62db8d34 #xe602f386) + (fftw_dft_vrank_geq1_register 1 #x11048 #x11048 #x0 #xfb855c3c #x74972f45 #xca431733 #x9ff7ddd0) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xe368739e #x203e6a37 #x7292b492 #x56a5e8ce) + (fftw_codelet_n1_6 0 #x11048 #x11048 #x0 #x696962c2 #x85a436e1 #xb9017b8d #x07c564f1) + (fftw_codelet_n1fv_12_avx2 0 #x11048 #x11048 #x0 #x1df2071a #x9fdeb474 #xd9781e33 #x35353a36) + (fftw_codelet_t1bv_3_avx2 0 #x11048 #x11048 #x0 #x6a21fbcf #x0358db65 #x191bbc0f #xcd8284c4) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 
#x9e45626c #xf9972ba0 #x65273a93 #xe1d1712d) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xa17dac59 #x239a8b70 #x7afa35fa #x4860c069) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x3ae48d73 #xd26246c7 #xba10c2f0 #xbbd55e58) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xdc481c3d #x460ef90f #x9cf29348 #x703bb91d) + (fftw_codelet_t1_6 0 #x11048 #x11048 #x0 #xcede4977 #x53cc4ff3 #xeb235ce0 #xe31c705b) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x78cbc035 #x3d579dc3 #x5667f9e1 #xc9656978) + (fftw_codelet_t1bv_3_avx2 0 #x11048 #x11048 #x0 #x06f445ac #x50edc0cc #x2a329e11 #x3442c976) + (fftw_codelet_t1bv_3_avx2 0 #x11048 #x11048 #x0 #x7d3b240d #xa9bc26ec #xae609baa #x37cc758d) + (fftw_codelet_n2fv_12_avx2 0 #x11048 #x11048 #x0 #x03a6dc39 #xdb60e85e #xd98d7c90 #x7ee801f6) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x946e12c1 #x7ceff0d0 #xa23caf31 #xf5eb0134) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xdad0f611 #x49beb1ce #x1b6427c4 #x3c690856) + (fftw_dft_vrank_geq1_register 1 #x11048 #x11048 #x0 #x49a8622a #x8dc9c4f5 #x1bce5f84 #x4056f26e) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x90f84fb6 #x7469fec3 #x751ba876 #x90b5d209) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x31afe233 #x29a2a0a2 #x9ececd1e #x76c65af6) + (fftw_codelet_t1_10 0 #x11048 #x11048 #x0 #xb7b2f13f #x9882b0fd #x3068ae68 #xacb3e8b6) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #x37a761c3 #x7bcb7ab5 #x952b7b95 #x541f749b) + (fftw_codelet_t2bv_5_avx 0 #x11048 #x11048 #x0 #x7f61f588 #x241d83c6 #xe9f43d62 #x0c94c0a6) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xafa325c5 #x91301e40 #xac4ca427 #x1885c065) + (fftw_codelet_t1_6 0 #x11048 #x11048 #x0 #xe973980d #x3c84769e #x929d54eb #x82406ead) + (fftw_codelet_t1bv_3_avx2 0 #x11048 #x11048 #x0 #xf67553c5 #x04eeca90 #x56eac28d #x818be4a6) + (fftw_dft_vrank_geq1_register 0 #x11048 #x11048 #x0 #xd90bfe43 #x141dbcce #x799007a6 #x04076eb0) +) From 
ea9931974008fdf3c7b142cf2742d7860bdab63c Mon Sep 17 00:00:00 2001 From: sraut Date: Fri, 3 Sep 2021 17:40:13 +0000 Subject: [PATCH 3/5] This change includes code refactoring and modifications related to AMD versioning, license, copyright and Readme. 1) Code refactoring is done for AMD application optimization layer. Fixed the assignments of memory pointers (flipped in case of backward transforms) related to the separate memory region. Support for this feature is extended to Windows version of AMD FFTW. 2) Configure script is updated to support auto detection of znver3 CPU architecture. 3) AOCL version is updated to 3.1. 4) COPYRIGHT, license and README files are updated. This code change relates to Jira IDs AMD-Internal : [CPUPL-1740] [CPUPL-1824] [CPUPL-1807] [CPUPL-1806] [CPUPL-1805] [CPUPL-1851] Change-Id: I59a6df723a4f481d2c8c43a2b79a5922e2dda3e4 --- CMakeLists.txt | 17 ++- COPYRIGHT | 1 + README_AMD.md | 20 ++- api/apiplan.c | 384 +++++++++++++++++++---------------------------- configure | 28 ++-- configure.ac | 30 ++-- doc/license.texi | 1 + kernel/ifftw.h | 4 +- kernel/planner.c | 8 +- 9 files changed, 232 insertions(+), 261 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 320de960..dc9cbbdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,8 @@ option (ENABLE_AMD_TRANS "Enable amd optimized transpose" OFF) option (ENABLE_AMD_FAST_PLANNER "Enable for a faster planning time on AMD cpus" OFF) option (ENABLE_AMD_TOP_N_planner "Enable AMD Top N Planner for AMD cpus" OFF) +option (ENABLE_AMD_APP_OPT "Enable AMD application optimization layer for HPC and scientific applications" OFF) + if(ENABLE_VERBOSE_MODE) if(CMAKE_C_COMPILER_ID MATCHES MSVC) set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON" FORCE) @@ -104,6 +106,14 @@ if(ENABLE_AMD_TRANS) add_definitions(-DAMD_OPT_TRANS) endif() +if (ENABLE_AMD_APP_OPT) + if (NOT (ENABLE_MPI OR ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE)) + add_definitions(-DAMD_APP_OPT_LAYER) + else () + 
message(FATAL_ERROR "AMD application optimization layer is not supported for MPI execution and in Quad or Long double precisions.") + endif () +endif () + option (DISABLE_FORTRAN "Disable Fortran wrapper routines" OFF) if (CMAKE_C_COMPILER_ID MATCHES MSVC OR CMAKE_C_COMPILER_ID MATCHES Clang) @@ -142,6 +152,7 @@ message(CMAKE_C_FLAGS_RELEASE : ${CMAKE_C_FLAGS_RELEASE}) message(CMAKE_C_COMPILER_FLAGS : ${CMAKE_C_COMPILER_FLAGS}) message(ENABLE_AMD_FAST_PLANNER : ${ENABLE_AMD_FAST_PLANNER}) message(ENABLE_AMD_TOP_N_planner : ${ENABLE_AMD_TOP_N_planner}) +message(ENABLE_AMD_APP_OPT : ${ENABLE_AMD_APP_OPT}) include(GNUInstallDirs) @@ -224,7 +235,7 @@ if (MSVC) add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif(MSVC) -add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.0.1") +add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.1") find_library (LIBM_LIBRARY NAMES m) if (LIBM_LIBRARY) @@ -494,7 +505,7 @@ endif () if (ENABLE_AMD_FAST_PLANNER) if (NOT (ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE)) - set (AMD_OPT_FAST_PLANNER TRUE) + add_definitions(-DAMD_OPT_FAST_PLANNER) else () message(FATAL_ERROR "AMD_FAST_PLANNER cannot be set for Quad and Long Double precision") endif () @@ -507,7 +518,7 @@ if (ENABLE_AMD_TOP_N_planner) if (NOT (ENABLE_AMD_FAST_PLANNER)) # Check if amd-top-n-planner is enabled with mpi, openmp or threads if (NOT (ENABLE_MPI OR ENABLE_OPENMP OR ENABLE_THREADS)) - set (AMD_OPT_TOP_N_PLANNER TRUE) + add_definitions(-DAMD_OPT_TOP_N_PLANNER) else () message(FATAL_ERROR "AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode") endif () diff --git a/COPYRIGHT b/COPYRIGHT index 089500b6..2d9ce014 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,6 +1,7 @@ /* * Copyright (c) 2003, 2007-14 Matteo Frigo * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/README_AMD.md b/README_AMD.md index 80daa63f..a663cebd 100644 --- a/README_AMD.md +++ b/README_AMD.md @@ -6,17 +6,18 @@ AMD EPYC CPUs. It is developed on top of FFTW (version fftw-3.3.8). All known features and functionalities of FFTW are retained and supported as it is with this AMD optimized FFTW library. -AMD Optimized FFTW achieves higher performance than the FFTW 3.3.8 due to +AMD Optimized FFTW achieves higher performance than the FFTW 3.3.8 due to its various optimizations involving improved SIMD Kernel functions, improved copy functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan), improved 256-bit kernels selection by Planner and an optional in-place transpose for large problem sizes. AMD Optimized FFTW improves the performance -of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI +of in-place MPI FFTs over FFTW 3.3.8 by employing a faster in-place MPI transpose function. AMD Optimized FFTW provides a new fast planner as an extension to the original planner that improves planning time of various -planning modes in general and PATIENT mode in particular. As of AMD FFTW 3.0.1, -a new feature called Top N planner is introduced that minimizes single-threaded -run-to-run variations. +planning modes in general and PATIENT mode in particular. Another new planning +mode called Top N planner is made available that minimizes single-threaded +run-to-run variations. As of AMD FFTW 3.1, a feature called AMD's application +optimization layer is introduced to speedup HPC and scientific applications. FFTW is a free collection of fast C routines for computing the Discrete Fourier Transform and various special cases thereof in one or more @@ -64,7 +65,14 @@ The new fast planner can be enabled using optional configure option "--enable-amd-fast-planner". 
It is supported for single and double precisions. An optional configure option "AMD_ARCH" is supported that can be set to CPU -architecture values like "auto" or "znver1" or "znver2" for AMD EPYC processors. +architecture values like "auto" or "znver1" or "znver2" or "znver3" for AMD +EPYC processors. + +The optional configure option "--enable-amd-app-opt" turns on AMD's application +optimization layer to benefit performance of HPC and scientific applications. +Currently it is developed for complex DFT problem types in double and single +precisions. It is not supported for MPI FFTs, real DFT problem types, Quad or +Long double precisions, and split array format. An optional configure option "--enable-amd-trans" is provided that may benefit the performance of transpose operations in case of very large FFT problem sizes. diff --git a/api/apiplan.c b/api/apiplan.c index c6b754fc..43168950 100644 --- a/api/apiplan.c +++ b/api/apiplan.c @@ -142,37 +142,44 @@ static plan *mkplan(planner *plnr, unsigned flags, return pln; } -apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) -{ - apiplan *p = 0; - plan *pln; - unsigned flags_used_for_planning; - planner *plnr; - static const unsigned int pats[] = {FFTW_ESTIMATE, FFTW_MEASURE, - FFTW_PATIENT, FFTW_EXHAUSTIVE}; - int pat, pat_max; - double pcost = 0; #ifdef AMD_APP_OPT_LAYER - R *ri, *ii, *ro, *io; - int isz, osz, inplace = 0; - int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0; - - flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE); - flags |= FFTW_PATIENT; - if (wisdom_one_time_read == 0) - { +/** AMD's application optimization layer - Starts + * It uses a separate data structure "app_layer_data" to create separate planner memory region + * and save/restore the application input and output pointers. 
+ * It uses new functions to create and destroy the separate planner memory region, + * set planning mode to OPATIENT and OWISDOM, and save/restore application input and output pointers. + */ +typedef struct app_layer_data_s +{ + R *inPtr; + R *outPtr; + R *ri; + R *ii; + R *ro; + R *io; +} app_layer_data; + +static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_layer_data *app_layer) +{ + int isz, osz, inplace = 0; + int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0; + + *flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE); + *flags |= FFTW_PATIENT; + if (wisdom_one_time_read == 0) + { if (!X(import_wisdom_from_filename)("wis.dat")) { - fprintf(stderr, "apiplan: ERROR reading wisdom wis.dat\n"); + //fprintf(stderr, "apiplan: ERROR reading wisdom wis.dat\n"); } -#ifndef AMD_APP_OPT_GENERATE_WISDOM +#ifdef AMD_APP_OPT_GENERATE_WISDOM wisdom_one_time_read = 1; #endif - } - - if(prb->adt->problem_kind == PROBLEM_DFT) - { - problem_dft *pdft = (problem_dft *) prb; + } + + if(prb->adt->problem_kind == PROBLEM_DFT) + { + problem_dft *pdft = (problem_dft *) prb; isz = 1; osz = 1; if (FINITE_RNK(pdft->sz->rnk)) @@ -194,28 +201,38 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) } } #ifdef AMD_APP_LAYER_API_LOGS - printf("start-QE: %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n); - printf("start-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); + printf("start-App: %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n); + printf("start-App: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); #endif + app_layer->inPtr = ((sign == FFT_SIGN) ? 
pdft->ri : pdft->ii); align_bytes = (2 * sizeof(R))-1; - if (((ptrdiff_t)pdft->ri) & align_bytes) + if (((ptrdiff_t)(app_layer->inPtr)) & align_bytes) in_alignment = 0; - - ri = pdft->ri; - ii = pdft->ii; + + app_layer->ri = pdft->ri; + app_layer->ii = pdft->ii; inplace = (pdft->ri == pdft->ro); - pdft->ri = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R)); - - if (((ptrdiff_t)pdft->ri) & align_bytes) + app_layer->inPtr = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R)); + + if (((ptrdiff_t)(app_layer->inPtr)) & align_bytes) cur_alloc_alignment = 0; if ((in_alignment == 0 && cur_alloc_alignment == 1) || - (in_alignment == 1 && cur_alloc_alignment == 0)) + (in_alignment == 1 && cur_alloc_alignment == 0)) { - pdft->ri += 1; iaddr_changed = 1; } - - pdft->ii = pdft->ri + 1; + + if (sign == FFT_SIGN) + { + pdft->ri = app_layer->inPtr + iaddr_changed; + pdft->ii = pdft->ri + 1; + } + else + { + pdft->ii = app_layer->inPtr + iaddr_changed; + pdft->ri = pdft->ii + 1; + } + if (inplace) { pdft->ro = pdft->ri; @@ -224,34 +241,106 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) else { #ifdef AMD_APP_OPT_OUT_BUFFER_MEM - ro = pdft->ro; - io = pdft->io; - in_alignment = 1; - cur_alloc_alignment = 1; - if (((ptrdiff_t)pdft->ro) & align_bytes) - in_alignment = 0; - pdft->ro = (R *) malloc((osz * sizeof(R) * 2) + sizeof(R)); - if (((ptrdiff_t)pdft->ro) & align_bytes) - cur_alloc_alignment = 0; - if ((in_alignment == 0 && cur_alloc_alignment == 1) || - (in_alignment == 1 && cur_alloc_alignment == 0)) - { - pdft->ro += 1; - oaddr_changed = 1; - } - pdft->io = pdft->ro + 1; + app_layer->ro = pdft->ro; + app_layer->io = pdft->io; + in_alignment = 1; + cur_alloc_alignment = 1; + app_layer->outPtr = ((sign == FFT_SIGN) ? 
pdft->ro : pdft->io); + if (((ptrdiff_t)(app_layer->outPtr)) & align_bytes) + in_alignment = 0; + app_layer->outPtr = (R *) malloc((osz * sizeof(R) * 2) + sizeof(R)); + + if (((ptrdiff_t)(app_layer->outPtr)) & align_bytes) + cur_alloc_alignment = 0; + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + oaddr_changed = 1; + } + + if (sign == FFT_SIGN) + { + pdft->ro = app_layer->outPtr + oaddr_changed; + pdft->io = pdft->ro + 1; + } + else + { + pdft->io = app_layer->outPtr + oaddr_changed; + pdft->ro = pdft->io + 1; + } #endif } #ifdef AMD_APP_LAYER_API_LOGS printf("start-FFTW: (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->ri, pdft->ii, pdft->ro, pdft->io); - printf("%x, %x; %x, %x\n", ri, ii, ro, io); + printf("start-FFTW: %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io); + printf("FFTW app_layer: %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io); #endif - } - else - { + } + else + { fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind\n"); - return NULL; - } + return -1; + } + return 0; +} + +static void destroy_amd_app_layer(problem *prb, app_layer_data *app_layer) +{ + int inplace = 0; + problem_dft *pdft = (problem_dft *) prb; + + if (wisdom_one_time_read == 0) + { +#ifdef AMD_APP_OPT_GENERATE_WISDOM + wisdom_one_time_read = 1; +#endif + X(export_wisdom_to_filename)("wis.dat"); + } + + inplace = (pdft->ri == pdft->ro); + if(prb->adt->problem_kind == PROBLEM_DFT) + { + problem_dft *pdft = (problem_dft *) prb; + free(app_layer->inPtr); + pdft->ri = app_layer->ri; + pdft->ii = app_layer->ii; + if (inplace) + { + pdft->ro = app_layer->ri; + pdft->io = app_layer->ii; + } + else + { +#ifdef AMD_APP_OPT_OUT_BUFFER_MEM + free(app_layer->outPtr); + pdft->ro = app_layer->ro; + pdft->io = app_layer->io; +#endif + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("end-App: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); +#endif + } +} +/** AMD's 
application optimization layer - Ends + */ +#endif + +apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) +{ + apiplan *p = 0; + plan *pln; + unsigned flags_used_for_planning; + planner *plnr; + static const unsigned int pats[] = {FFTW_ESTIMATE, FFTW_MEASURE, + FFTW_PATIENT, FFTW_EXHAUSTIVE}; + int pat, pat_max; + double pcost = 0; + +#ifdef AMD_APP_OPT_LAYER + app_layer_data app_layer; + if (create_amd_app_layer(sign, &flags, prb, &app_layer)) + return NULL; #endif if (before_planner_hook) before_planner_hook(); @@ -333,41 +422,7 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb) after_planner_hook(); #ifdef AMD_APP_OPT_LAYER - if (wisdom_one_time_read == 0) - { -#ifndef AMD_APP_OPT_GENERATE_WISDOM - wisdom_one_time_read = 1; -#endif - X(export_wisdom_to_filename)("wis.dat"); - } - - if(prb->adt->problem_kind == PROBLEM_DFT) - { - problem_dft *pdft = (problem_dft *) prb; - if (iaddr_changed) - pdft->ri -= 1; - free(pdft->ri); - pdft->ri = ri; - pdft->ii = ii; - if (inplace) - { - pdft->ro = ri; - pdft->io = ii; - } - else - { -#ifdef AMD_APP_OPT_OUT_BUFFER_MEM - if (oaddr_changed) - pdft->ro -= 1; - free(pdft->ro); - pdft->ro = ro; - pdft->io = io; -#endif - } -#ifdef AMD_APP_LAYER_API_LOGS - printf("end-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); -#endif - } + destroy_amd_app_layer(prb, &app_layer); #endif return p; } @@ -383,106 +438,11 @@ apiplan *X(mkapiplan_ex)(int sign, unsigned flags, int n, problem *prb) FFTW_PATIENT, FFTW_EXHAUSTIVE}; int pat, pat_max; double pcost = 0; + #ifdef AMD_APP_OPT_LAYER - R *ri, *ii, *ro, *io; - int isz, osz, inplace = 0; - int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0; - - flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE); - flags |= FFTW_PATIENT; - if (wisdom_one_time_read == 0) - { - if (!X(import_wisdom_from_filename)("wis.dat")) - { - fprintf(stderr, "apiplan_ex: ERROR reading wisdom wis.dat\n"); - } 
-#ifndef AMD_APP_OPT_GENERATE_WISDOM - wisdom_one_time_read = 1; -#endif - } - - if(prb->adt->problem_kind == PROBLEM_DFT) - { - problem_dft *pdft = (problem_dft *) prb; - isz = 1; - osz = 1; - if (FINITE_RNK(pdft->sz->rnk)) - { - for (int i = 0; i < pdft->sz->rnk; ++i) - { - const iodim *q = pdft->sz->dims + i; - isz *= (q->n); - osz *= (q->n); - } - } - if (FINITE_RNK(pdft->vecsz->rnk)) - { - for (int i = 0; i < pdft->vecsz->rnk; ++i) - { - const iodim *q = pdft->vecsz->dims + i; - isz *= (q->n); - osz *= (q->n); - } - } -#ifdef AMD_APP_LAYER_API_LOGS - printf("start_ex-QE: %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n); - printf("start_ex-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); -#endif - align_bytes = (2 * sizeof(R))-1; - if (((ptrdiff_t)pdft->ri) & align_bytes) - in_alignment = 0; - - ri = pdft->ri; - ii = pdft->ii; - inplace = (pdft->ri == pdft->ro); - pdft->ri = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R)); - - if (((ptrdiff_t)pdft->ri) & align_bytes) - cur_alloc_alignment = 0; - if ((in_alignment == 0 && cur_alloc_alignment == 1) || - (in_alignment == 1 && cur_alloc_alignment == 0)) - { - pdft->ri += 1; - iaddr_changed = 1; - } - - pdft->ii = pdft->ri + 1; - if (inplace) - { - pdft->ro = pdft->ri; - pdft->io = pdft->ii; - } - else - { -#ifdef AMD_APP_OPT_OUT_BUFFER_MEM - ro = pdft->ro; - io = pdft->io; - in_alignment = 1; - cur_alloc_alignment = 1; - if (((ptrdiff_t)pdft->ro) & align_bytes) - in_alignment = 0; - pdft->ro = (R *) malloc((osz * sizeof(R) * 2) + sizeof(R)); - if (((ptrdiff_t)pdft->ro) & align_bytes) - cur_alloc_alignment = 0; - if ((in_alignment == 0 && cur_alloc_alignment == 1) || - (in_alignment == 1 && cur_alloc_alignment == 0)) - { - pdft->ro += 1; - oaddr_changed = 1; - } - pdft->io = pdft->ro + 1; -#endif - } -#ifdef AMD_APP_LAYER_API_LOGS - printf("start_ex-FFTW: (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->ri, pdft->ii, pdft->ro, pdft->io); - printf("%x, %x; %x, 
%x\n", ri, ii, ro, io); -#endif - } - else - { - fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind\n"); - return NULL; - } + app_layer_data app_layer; + if (create_amd_app_layer(sign, &flags, prb, &app_layer)) + return NULL; #endif if (before_planner_hook) before_planner_hook(); @@ -563,41 +523,7 @@ apiplan *X(mkapiplan_ex)(int sign, unsigned flags, int n, problem *prb) if (after_planner_hook) after_planner_hook(); #ifdef AMD_APP_OPT_LAYER - if (wisdom_one_time_read == 0) - { -#ifndef AMD_APP_OPT_GENERATE_WISDOM - wisdom_one_time_read = 1; -#endif - X(export_wisdom_to_filename)("wis.dat"); - } - - if(prb->adt->problem_kind == PROBLEM_DFT) - { - problem_dft *pdft = (problem_dft *) prb; - if (iaddr_changed) - pdft->ri -= 1; - free(pdft->ri); - pdft->ri = ri; - pdft->ii = ii; - if (inplace) - { - pdft->ro = ri; - pdft->io = ii; - } - else - { -#ifdef AMD_APP_OPT_OUT_BUFFER_MEM - if (oaddr_changed) - pdft->ro -= 1; - free(pdft->ro); - pdft->ro = ro; - pdft->io = io; -#endif - } -#ifdef AMD_APP_LAYER_API_LOGS - printf("end_ex-QE: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); -#endif - } + destroy_amd_app_layer(prb, &app_layer); #endif return p; } diff --git a/configure b/configure index 6df732bc..7680c942 100755 --- a/configure +++ b/configure @@ -17525,13 +17525,17 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then ;; "25") if [ -z "${AMD_ARCH}" ]; then - if [ "$GCCVERSION" -ge "9" ] && [ "$AMDZENMODEL" -le "15" ]; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" - elif [ "$GCCVERSION" -lt "9" ] && [ "$AMDZENMODEL" -le "15" ]; then - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" - else - CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" - fi + if [ "$AMDZENMODEL" -le "15" ] || ( [ 
"$AMDZENMODEL" -ge "48" ] && [ "$AMDZENMODEL" -le "63" ] ); then + if [ "$GCCVERSION" -ge "11" ]; then + CFLAGS="$CFLAGS -march=znver3 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif [ "$GCCVERSION" -ge "9" ]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi else CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" fi @@ -17557,8 +17561,8 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then fi ;; "25") - if [ "$AMDZENMODEL" -le "15" ]; then - CFLAGS="$CFLAGS -march=znver2" + if [ "$AMDZENMODEL" -le "15" ] || ( [ "$AMDZENMODEL" -ge "48" ] && [ "$AMDZENMODEL" -le "63" ] ); then + CFLAGS="$CFLAGS -march=znver3" else CFLAGS="$CFLAGS -mavx2" fi @@ -17594,9 +17598,13 @@ else fi if test "$have_amd_app_opt" = yes ; then + if (test "$enable_mpi" = yes || test "$quad_precision_supported" = yes || test "$long_double_supported" = yes); then + as_fn_error $? "AMD application optimization layer is not supported for MPI execution and in Quad or Long double precisions." "$LINENO" 5 + else $as_echo "#define AMD_APP_OPT_LAYER 1" >>confdefs.h + fi fi fi # Check whether --enable-amd-trans was given. @@ -17638,7 +17646,7 @@ else fi -$as_echo "#define AOCL_FFTW_VERSION \"AOCL FFTW 3.0.1\"" >>confdefs.h +$as_echo "#define AOCL_FFTW_VERSION \"AOCL FFTW 3.1\"" >>confdefs.h # Check whether --enable-amd-fast-planner was given. 
if test "${enable_amd_fast_planner+set}" = set; then : diff --git a/configure.ac b/configure.ac index 300c8e7b..c99fccf9 100644 --- a/configure.ac +++ b/configure.ac @@ -666,13 +666,17 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then ;; "25") if [[ -z "${AMD_ARCH}" ]]; then - if [[ "$GCCVERSION" -ge "9" ]] && [[ "$AMDZENMODEL" -le "15" ]]; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" - elif [[ "$GCCVERSION" -lt "9" ]] && [[ "$AMDZENMODEL" -le "15" ]]; then - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" - else - CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" - fi + if [[ "$AMDZENMODEL" -le "15" ]] || ( [[ "$AMDZENMODEL" -ge "48" ]] && [[ "$AMDZENMODEL" -le "63" ]] ); then + if [[ "$GCCVERSION" -ge "11" ]]; then + CFLAGS="$CFLAGS -march=znver3 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + elif [[ "$GCCVERSION" -ge "9" ]]; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi + else + CFLAGS="$CFLAGS -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi else CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" fi @@ -698,8 +702,8 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then fi ;; "25") - if [[ "$AMDZENMODEL" -le "15" ]]; then - CFLAGS="$CFLAGS -march=znver2" + if [[ "$AMDZENMODEL" -le "15" ]] || ( [[ "$AMDZENMODEL" -ge "48" ]] && [[ "$AMDZENMODEL" -le "63" ]] 
); then + CFLAGS="$CFLAGS -march=znver3" else CFLAGS="$CFLAGS -mavx2" fi @@ -721,7 +725,11 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then dnl amd switch to enable AMD's application optimization layer to achieve best FFT performance with a negligibly small planning time targeted for HPC and scientific applications. AC_ARG_ENABLE(amd-app-opt, [AC_HELP_STRING([--enable-amd-app-opt],[enable AMD application optimization layer to achieve best FFT performance with a negligibly small planning time targeted for HPC and scientific applications])], have_amd_app_opt=$enableval, have_amd_app_opt=no) if test "$have_amd_app_opt" = yes ; then - AC_DEFINE(AMD_APP_OPT_LAYER,1,[Enable AMD application optimization layer.]) + if (test "$enable_mpi" = yes || test "$quad_precision_supported" = yes || test "$long_double_supported" = yes); then + AC_MSG_ERROR([AMD application optimization layer is not supported for MPI execution and in Quad or Long double precisions.]) + else + AC_DEFINE(AMD_APP_OPT_LAYER,1,[Enable AMD application optimization layer.]) + fi fi fi dnl amd optimization switch to enable amd cpu optimized transpose --enable-amd-trans @@ -737,7 +745,7 @@ fi AC_ARG_ENABLE(openmp, [AC_HELP_STRING([--enable-openmp],[use OpenMP directives for parallelism])], enable_openmp=$enableval, enable_openmp=no) AC_ARG_ENABLE(threads, [AC_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no) dnl aocl version number of amd-fftw -AC_DEFINE(AOCL_FFTW_VERSION,"AOCL FFTW 3.0.1",[AOCL Version of AMD-FFTW]) +AC_DEFINE(AOCL_FFTW_VERSION,"AOCL FFTW 3.1",[AOCL Version of AMD-FFTW]) dnl amd optimization switch to enable AMD Fast Planner for AMD cpus --enable-amd-fast-planner AC_ARG_ENABLE(amd-fast-planner, [AC_HELP_STRING([--enable-amd-fast-planner],[enable AMD Fast Planner for a faster planning time on AMD cpus])], have_amd_fast_planner=$enableval, have_amd_fast_planner=no) dnl amd optimization switch to enable 
AMD Top N Planner for AMD cpus --enable-amd-top-n-planner diff --git a/doc/license.texi b/doc/license.texi index e317085e..58e21622 100644 --- a/doc/license.texi +++ b/doc/license.texi @@ -3,6 +3,7 @@ FFTW is Copyright @copyright{} 2003, 2007-11 Matteo Frigo, Copyright @copyright{} 2003, 2007-11 Massachusetts Institute of Technology. +@copyright{} 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved. FFTW is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/kernel/ifftw.h b/kernel/ifftw.h index e0fd1b01..cfd9c604 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -183,9 +183,11 @@ extern "C" //Enable/disable separate memory even for output buffer in case of out-of-place FFT //#define AMD_APP_OPT_OUT_BUFFER_MEM //Enable this switch to generate wisdom file for the first time for the application. -#define AMD_APP_OPT_GENERATE_WISDOM +//#define AMD_APP_OPT_GENERATE_WISDOM //Debug print logs for the application optimization layer //#define AMD_APP_LAYER_API_LOGS +//Maximum size of Unblessed Hash table kept alive to reuse the saved plans directly from it. 
+#define AMD_APP_OPT_HASH_UNBLESS_MAX_SIZE 16777216 #endif //-------------------------------- #endif//#ifdef AMD_OPT_ALL diff --git a/kernel/planner.c b/kernel/planner.c index 9079a410..f80e2c5f 100644 --- a/kernel/planner.c +++ b/kernel/planner.c @@ -1061,7 +1061,13 @@ static void forget(planner *ego, amnesia a) mkhashtab(&ego->htab_blessed); /* fall through */ case FORGET_ACCURSED: -#ifndef AMD_APP_OPT_LAYER +#ifdef AMD_APP_OPT_LAYER + if ((sizeof(struct solution_s)*ego->htab_unblessed.hashsiz) > AMD_APP_OPT_HASH_UNBLESS_MAX_SIZE) + { + htab_destroy(&ego->htab_unblessed); + mkhashtab(&ego->htab_unblessed); + } +#else htab_destroy(&ego->htab_unblessed); mkhashtab(&ego->htab_unblessed); #endif From 5647dde3732b66c0ef5b98937d15057cbea08c7e Mon Sep 17 00:00:00 2001 From: sraut Date: Fri, 24 Sep 2021 15:17:59 +0000 Subject: [PATCH 4/5] This change provides a few fixes for configure and build related issues. 1) Fixed the configure script to make use of AMD_ARCH in case of clang compiler. 2) Added checks in configure and CMake to avoid build errors when optional AMD configure options are used without enabling --enable-amd-opt. 3) Fixed the display string for --info-all to print -march value when building on Windows. This code change relates to Jira IDs AMD-Internal : [CPUPL-1877] [CPUPL-1869] [CPUPL-1878] [CPUPL-1870] Change-Id: I988d35dad8475eee644c3380ec6467bb8a628c93 --- CMakeLists.txt | 18 +++++++++++++----- README_AMD.md | 5 ++++- configure | 21 ++++++++++++++------- configure.ac | 29 ++++++++++++++++++----------- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc9cbbdd..400f443a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,11 +16,11 @@ if (CMAKE_C_COMPILER_ID MATCHES Clang) if ("${AMD_ARCH}" STREQUAL "") message(FATAL_ERROR "Machine arch missing!
Select one of znver1, znver2 or znver3") elseif (${AMD_ARCH} STREQUAL "znver1") - add_definitions("-march=znver1") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver1") elseif (${AMD_ARCH} STREQUAL "znver2") - add_definitions("-march=znver2") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver2") elseif (${AMD_ARCH} STREQUAL "znver3") - add_definitions("-march=znver3") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver3") else () message(FATAL_ERROR "Unsupported Machine arch! Select one of znver1, znver2 or znver3") endif () @@ -103,12 +103,20 @@ if(ENABLE_MPI) endif() if(ENABLE_AMD_TRANS) - add_definitions(-DAMD_OPT_TRANS) + if(ENABLE_AMD_OPT) + add_definitions(-DAMD_OPT_TRANS) + else() + message(FATAL_ERROR "Main optimization switch ENABLE_AMD_OPT must be enabled to enable this option.") + endif() endif() if (ENABLE_AMD_APP_OPT) if (NOT (ENABLE_MPI OR ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE)) - add_definitions(-DAMD_APP_OPT_LAYER) + if(ENABLE_AMD_OPT) + add_definitions(-DAMD_APP_OPT_LAYER) + else () + message(FATAL_ERROR "Main optimization switch ENABLE_AMD_OPT must be enabled to enable this option.") + endif () else () message(FATAL_ERROR "AMD application optimization layer is not supported for MPI execution and in Quad or Long double precisions.") endif () diff --git a/README_AMD.md b/README_AMD.md index a663cebd..1b15d145 100644 --- a/README_AMD.md +++ b/README_AMD.md @@ -49,7 +49,10 @@ generation architectures. make install The configure option "--enable-amd-opt" enables all the improvements and -optimizations targeted for AMD EPYC CPUs. +optimizations targeted for AMD EPYC CPUs. For enabling various optional +configure options provided for AMD EPYC CPUs, the master optimization switch +"--enable-amd-opt" must be kept enabled. + When enabling configure option "--enable-amd-opt", do not use the configure option "--enable-generic-simd128" or "--enable-generic-simd256". 
diff --git a/configure b/configure index 7680c942..3724e071 100755 --- a/configure +++ b/configure @@ -17508,7 +17508,11 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then SUBSTRCLANG='clang' SUBSTRGCC='gcc' if grep -q "$SUBSTRCLANG" <<<"$CC"; then - CFLAGS="$CFLAGS -mavx2 -mfma" + if [ -z "${AMD_ARCH}" ]; then + CFLAGS="$CFLAGS -mavx2 -mfma" + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mfma" + fi elif grep -q "$SUBSTRGCC" <<<"$CC"; then GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`) case "$AMDZENFAMILY" in @@ -17606,31 +17610,34 @@ $as_echo "#define AMD_APP_OPT_LAYER 1" >>confdefs.h fi fi -fi -# Check whether --enable-amd-trans was given. + + # Check whether --enable-amd-trans was given. if test "${enable_amd_trans+set}" = set; then : enableval=$enable_amd_trans; have_amd_trans=$enableval else have_amd_trans=no fi -if test "$have_amd_trans" = yes; then + if test "$have_amd_trans" = yes; then $as_echo "#define AMD_OPT_TRANS 1" >>confdefs.h -fi -# Check whether --enable-amd-mpifft was given. + fi + # Check whether --enable-amd-mpifft was given. if test "${enable_amd_mpifft+set}" = set; then : enableval=$enable_amd_mpifft; have_amd_mpifft=$enableval else have_amd_mpifft=no fi -if test "$have_amd_mpifft" = yes; then + if test "$have_amd_mpifft" = yes; then $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h + fi + fi + # Check whether --enable-openmp was given. 
if test "${enable_openmp+set}" = set; then : enableval=$enable_openmp; enable_openmp=$enableval diff --git a/configure.ac b/configure.ac index c99fccf9..a8c84f45 100644 --- a/configure.ac +++ b/configure.ac @@ -649,7 +649,11 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then SUBSTRCLANG='clang' SUBSTRGCC='gcc' if grep -q "$SUBSTRCLANG" <<<"$CC"; then - CFLAGS="$CFLAGS -mavx2 -mfma" + if [[ -z "${AMD_ARCH}" ]]; then + CFLAGS="$CFLAGS -mavx2 -mfma" + else + CFLAGS="$CFLAGS -march=$AMD_ARCH -mavx2 -mfma" + fi elif grep -q "$SUBSTRGCC" <<<"$CC"; then GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.`) case "$AMDZENFAMILY" in @@ -731,17 +735,20 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then AC_DEFINE(AMD_APP_OPT_LAYER,1,[Enable AMD application optimization layer.]) fi fi + + dnl amd optimization switch to enable amd cpu optimized transpose --enable-amd-trans + AC_ARG_ENABLE(amd-trans, [AC_HELP_STRING([--enable-amd-trans],[enable AMD cpu optimized Transpose])], have_amd_trans=$enableval, have_amd_trans=no) + if test "$have_amd_trans" = yes; then + AC_DEFINE(AMD_OPT_TRANS,1,[Define to enable AMD cpu optimized Transpose.]) + fi + dnl amd optimization switch to enable MPI FFT improvements for AMD cpus --enable-amd-mpifft + AC_ARG_ENABLE(amd-mpifft, [AC_HELP_STRING([--enable-amd-mpifft],[enable AMD cpu optimized MPI FFT])], have_amd_mpifft=$enableval, have_amd_mpifft=no) + if test "$have_amd_mpifft" = yes; then + AC_DEFINE(AMD_OPT_MPIFFT,1,[Define to enable AMD cpu optimized MPI FFT.]) + fi + fi -dnl amd optimization switch to enable amd cpu optimized transpose --enable-amd-trans -AC_ARG_ENABLE(amd-trans, [AC_HELP_STRING([--enable-amd-trans],[enable AMD cpu optimized Transpose])], have_amd_trans=$enableval, have_amd_trans=no) -if test "$have_amd_trans" = yes; then - AC_DEFINE(AMD_OPT_TRANS,1,[Define to enable AMD cpu optimized Transpose.]) -fi -dnl amd optimization switch to enable MPI FFT improvements for AMD cpus 
--enable-amd-mpifft -AC_ARG_ENABLE(amd-mpifft, [AC_HELP_STRING([--enable-amd-mpifft],[enable AMD cpu optimized MPI FFT])], have_amd_mpifft=$enableval, have_amd_mpifft=no) -if test "$have_amd_mpifft" = yes; then - AC_DEFINE(AMD_OPT_MPIFFT,1,[Define to enable AMD cpu optimized MPI FFT.]) -fi + AC_ARG_ENABLE(openmp, [AC_HELP_STRING([--enable-openmp],[use OpenMP directives for parallelism])], enable_openmp=$enableval, enable_openmp=no) AC_ARG_ENABLE(threads, [AC_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no) dnl aocl version number of amd-fftw From f2c126bf12186a1ce31b9b3232e7c2cc823f3cab Mon Sep 17 00:00:00 2001 From: sraut Date: Sun, 3 Oct 2021 15:19:47 +0000 Subject: [PATCH 5/5] This change extends the applicability and functionality of AMD application optimization layer. 1) The AMD application optimization layer extends its implementation support (of a separate memory region for the planner) for the real DFT problem types that are supported through r2c and c2r interfaces. 2) The default use of wisdom feature in combination with the application optimization layer is disabled. This avoids the corruption of wisdom file from concurrent reads/writes by multiple processes of the application. It is due to known concurrency issues in the planner and needs some kind of atomic operations for the wisdom feature. 3) README is updated with more information. 
This code change relates to Jira IDs AMD-Internal : [CPUPL-1890] [CPUPL-1889] Change-Id: Icc78a53708a1855b41f451257ef1310a9791b22e --- README_AMD.md | 14 ++-- api/apiplan.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++-- kernel/ifftw.h | 2 + 3 files changed, 181 insertions(+), 9 deletions(-) diff --git a/README_AMD.md b/README_AMD.md index 1b15d145..99b977f0 100644 --- a/README_AMD.md +++ b/README_AMD.md @@ -38,7 +38,7 @@ INSTALLATION FROM AMD Optimized FFTW GIT REPOSITORY: After downloading the latest stable release from the git repository, https://github.com/amd/amd-fftw, follow the below steps to configure and -build it for AMD EPYC processor based on Naples, Rome and future +build it for AMD EPYC processor based on Naples, Rome, Milan and future generation architectures. ./configure --enable-sse2 --enable-avx --enable-avx2 @@ -65,7 +65,11 @@ configure option, the user needs to set --mca btl_vader_eager_limit appropriately (current preference is 65536) in the MPIRUN command. The new fast planner can be enabled using optional configure option -"--enable-amd-fast-planner". It is supported for single and double precisions. +"--enable-amd-fast-planner". It is supported in single and double precisions. + +Top N planner mode can be enabled using optional configure option +"--enable-amd-top-n-planner" to minimize run-to-run variations in performance. +It is supported in single-threaded execution in single and double precisions. An optional configure option "AMD_ARCH" is supported that can be set to CPU architecture values like "auto" or "znver1" or "znver2" or "znver3" for AMD @@ -73,9 +77,9 @@ EPYC processors. The optional configure option "--enable-amd-app-opt" turns on AMD's application optimization layer to benefit performance of HPC and scientific applications. -Currently it is developed for complex DFT problem types in double and single -precisions. 
It is not supported for MPI FFTs, real DFT problem types, Quad or -Long double precisions, and split array format. +Currently it is developed for complex and real (r2c and c2r) DFT problem types +in double and single precisions. It is not supported for MPI FFTs, r2r real DFT +problem types, Quad or Long double precisions, and split array format. An optional configure option "--enable-amd-trans" is provided that may benefit the performance of transpose operations in case of very large FFT problem sizes. diff --git a/api/apiplan.c b/api/apiplan.c index 43168950..acc73c88 100644 --- a/api/apiplan.c +++ b/api/apiplan.c @@ -159,13 +159,24 @@ typedef struct app_layer_data_s R *io; } app_layer_data; +static inline INT imax(INT a, INT b) +{ + return (a > b) ? a : b; +} + +static inline INT imin(INT a, INT b) +{ + return (a < b) ? a : b; +} + static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_layer_data *app_layer) { - int isz, osz, inplace = 0; + INT isz, osz, inplace = 0; int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0; *flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE); *flags |= FFTW_PATIENT; +#ifdef AMD_APP_OPT_USE_WISDOM if (wisdom_one_time_read == 0) { if (!X(import_wisdom_from_filename)("wis.dat")) @@ -176,6 +187,7 @@ static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_lay wisdom_one_time_read = 1; #endif } +#endif if(prb->adt->problem_kind == PROBLEM_DFT) { @@ -274,11 +286,135 @@ static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_lay printf("start-FFTW: (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->ri, pdft->ii, pdft->ro, pdft->io); printf("start-FFTW: %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io); printf("FFTW app_layer: %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io); +#endif + } + else if(prb->adt->problem_kind == PROBLEM_RDFT2) //R2C(forward) 
and C2R(backward) cases only + { + problem_rdft2 *pdft = (problem_rdft2 *) prb; + INT lb = 0, ub = 1, lb2 = 0, ub2 = 1, stridedLen, stridedLen2, is = 0, os = 0; + int rType = (R2HC_KINDP(pdft->kind));//1: r2c, 0: c2r + isz = 1; + osz = 1; + if (FINITE_RNK(pdft->sz->rnk)) + { + int i, rnkGtr1 = 0; + for (i = 0; i < pdft->sz->rnk-1; ++i) + { + const iodim *q = pdft->sz->dims + i; + is = q->is; + os = q->os; + stridedLen = (is>>!rType) * (q->n - 1); + stridedLen2 = (os>>rType) * (q->n - 1); + lb = imin(lb, lb + stridedLen); + ub = imax(ub, ub + stridedLen); + lb2 = imin(lb2, lb2 + stridedLen2); + ub2 = imax(ub2, ub2 + stridedLen2); + rnkGtr1 = 1; + } + if (i < pdft->sz->rnk) + { + const iodim *q = pdft->sz->dims + i; + is = q->is>>1; + os = q->os>>1; + stridedLen = is * (q->n - 1); + stridedLen2 = os * (q->n - 1); + lb = imin(lb, lb + stridedLen); + ub = imax(ub, ub + stridedLen); + lb2 = imin(lb2, lb2 + stridedLen2); + ub2 = imax(ub2, ub2 + stridedLen2); + } + + } + if (FINITE_RNK(pdft->vecsz->rnk)) + { + for (int i = 0; i < pdft->vecsz->rnk; ++i) + { + const iodim *q = pdft->vecsz->dims + i; + stridedLen = (q->is>>!rType) * (q->n - 1); + stridedLen2 = (q->os>>rType) * (q->n - 1); + lb = imin(lb, lb + stridedLen); + ub = imax(ub, ub + stridedLen); + lb2 = imin(lb2, lb2 + stridedLen2); + ub2 = imax(ub2, ub2 + stridedLen2); + } + } + isz = ub - lb; + osz = ub2 - lb2; +#ifdef AMD_APP_LAYER_API_LOGS + printf("start-App (real): %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n); + printf("start-App (real): %x, %x; %x, %x; %d, %d\n", pdft->r0, pdft->r1, pdft->cr, pdft->ci, isz, osz); +#endif + if (R2HC_KINDP(pdft->kind)) + { + isz = isz > (osz<<1) ? 
isz : (osz<<1); + app_layer->ri = pdft->r0; + app_layer->ii = pdft->r1; + app_layer->ro = pdft->cr; + app_layer->io = pdft->ci; + inplace = (pdft->r0 == pdft->cr); + app_layer->inPtr = (R *) malloc((isz * sizeof(R)) + sizeof(R)); + if (((ptrdiff_t)(pdft->r0)) & align_bytes) + in_alignment = 0; + if (((ptrdiff_t)(app_layer->inPtr)) & align_bytes) + cur_alloc_alignment = 0; + + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + iaddr_changed = 1; + } + pdft->r0 = app_layer->inPtr + iaddr_changed; + pdft->r1 = pdft->r0 + is; + if (inplace) + { + pdft->cr = pdft->r0; + pdft->ci = pdft->cr + 1; + } + } + else + { + osz = osz > (isz<<1) ? osz : (isz<<1); + app_layer->ri = pdft->cr; + app_layer->ii = pdft->ci; + app_layer->ro = pdft->r0; + app_layer->io = pdft->r1; + inplace = (pdft->r0 == pdft->cr); + if (inplace) + { + app_layer->inPtr = (R *) malloc((osz * sizeof(R)) + sizeof(R)); + } + else + { + app_layer->inPtr = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R)); + } + if (((ptrdiff_t)(pdft->cr)) & align_bytes) + in_alignment = 0; + if (((ptrdiff_t)(app_layer->inPtr)) & align_bytes) + cur_alloc_alignment = 0; + + if ((in_alignment == 0 && cur_alloc_alignment == 1) || + (in_alignment == 1 && cur_alloc_alignment == 0)) + { + iaddr_changed = 1; + } + pdft->cr = app_layer->inPtr + iaddr_changed; + pdft->ci = pdft->cr + 1; + if (inplace) + { + pdft->r0 = pdft->cr; + pdft->r1 = pdft->r0 + os; + } + } + +#ifdef AMD_APP_LAYER_API_LOGS + printf("start-FFTW (real): (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->r0, pdft->r1, pdft->cr, pdft->ci); + printf("start-FFTW (real): %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io); + printf("FFTW app_layer (real): %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io); #endif } else { - fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind\n"); + fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind [%d]\n", 
prb->adt->problem_kind); return -1; } return 0; @@ -287,8 +423,8 @@ static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_lay static void destroy_amd_app_layer(problem *prb, app_layer_data *app_layer) { int inplace = 0; - problem_dft *pdft = (problem_dft *) prb; +#ifdef AMD_APP_OPT_USE_WISDOM if (wisdom_one_time_read == 0) { #ifdef AMD_APP_OPT_GENERATE_WISDOM @@ -296,11 +432,12 @@ static void destroy_amd_app_layer(problem *prb, app_layer_data *app_layer) #endif X(export_wisdom_to_filename)("wis.dat"); } +#endif - inplace = (pdft->ri == pdft->ro); if(prb->adt->problem_kind == PROBLEM_DFT) { problem_dft *pdft = (problem_dft *) prb; + inplace = (pdft->ri == pdft->ro); free(app_layer->inPtr); pdft->ri = app_layer->ri; pdft->ii = app_layer->ii; @@ -319,6 +456,35 @@ static void destroy_amd_app_layer(problem *prb, app_layer_data *app_layer) } #ifdef AMD_APP_LAYER_API_LOGS printf("end-App: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io); +#endif + } + else if(prb->adt->problem_kind == PROBLEM_RDFT2) + { + problem_rdft2 *pdft = (problem_rdft2 *) prb; + inplace = (pdft->r0 == pdft->cr); + free(app_layer->inPtr); + if (R2HC_KINDP(pdft->kind)) + { + pdft->r0 = app_layer->ri; + pdft->r1 = app_layer->ii; + if (inplace) + { + pdft->cr = app_layer->ro; + pdft->ci = app_layer->io; + } + } + else + { + pdft->cr = app_layer->ri; + pdft->ci = app_layer->ii; + if (inplace) + { + pdft->r0 = app_layer->ro; + pdft->r1 = app_layer->io; + } + } +#ifdef AMD_APP_LAYER_API_LOGS + printf("end-App: %x, %x; %x, %x\n", pdft->r0, pdft->r1, pdft->cr, pdft->ci); #endif } } diff --git a/kernel/ifftw.h b/kernel/ifftw.h index cfd9c604..9e100ba7 100644 --- a/kernel/ifftw.h +++ b/kernel/ifftw.h @@ -182,6 +182,8 @@ extern "C" #ifdef AMD_APP_OPT_LAYER //AMD's application optimization layer //Enable/disable separate memory even for output buffer in case of out-of-place FFT //#define AMD_APP_OPT_OUT_BUFFER_MEM +//Enable this switch to use wisdom feature in combination with 
application optimization layer. +//#define AMD_APP_OPT_USE_WISDOM //Enable this switch to generate wisdom file for the first time for the application. //#define AMD_APP_OPT_GENERATE_WISDOM //Debug print logs for the application optimization layer