Merge 3.1 release branch into amd-fftw

amd · Dec 13, 2021 · 2b0bbb5 · 2b0bbb5
2 parents 764197a + f2c126b
commit 2b0bbb5
Show file tree

Hide file tree

Showing 12 changed files with 643 additions and 61 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,11 +16,11 @@ if (CMAKE_C_COMPILER_ID MATCHES Clang)
   if ("${AMD_ARCH}" STREQUAL "")
     message(FATAL_ERROR "Machine arch missing! Select one of znver1, znver2 or znver3")
   elseif (${AMD_ARCH} STREQUAL "znver1")
-    add_definitions("-march=znver1")
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver1")
   elseif (${AMD_ARCH} STREQUAL "znver2")
-    add_definitions("-march=znver2")
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver2")
   elseif (${AMD_ARCH} STREQUAL "znver3")
-    add_definitions("-march=znver3")
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver3")
   else ()
     message(FATAL_ERROR "Unsupported Machine arch! Select one of znver1, znver2 or znver3")
   endif ()
@@ -47,14 +47,17 @@ option (ENABLE_AVX "Compile with AVX instruction set support" OFF)
 option (ENABLE_AVX2 "Compile with AVX2 instruction set support" ON)
 
 option (ENABLE_AMD_OPT "Enable AMD specific optimization" OFF)
-option (ENABLE_AMD_MPIFFT "Compile with AMD MPIFFT support" OFF)
+option (ENABLE_AMD_MPIFFT "Enable AMD MPI FFT optimizations" OFF)
+option (ENABLE_AMD_MPI_VADER_LIMIT "Enable advanced AMD MPI optimizations for in-place transpose based on VADER LIMIT" OFF)
 option (ENABLE_MPI "compile FFTW MPI library" OFF)
 
 option (ENABLE_AMD_TRANS "Enable amd optimized transpose" OFF)
 
 option (ENABLE_AMD_FAST_PLANNER "Enable for a faster planning time on AMD cpus" OFF)
 option (ENABLE_AMD_TOP_N_planner "Enable AMD Top N Planner for AMD cpus" OFF)
 
+option (ENABLE_AMD_APP_OPT "Enable AMD application optimization layer for HPC and scientific applications" OFF)
+
 if(ENABLE_VERBOSE_MODE)
   if(CMAKE_C_COMPILER_ID MATCHES MSVC)
     set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON" FORCE)
@@ -87,6 +90,10 @@ if(ENABLE_AMD_MPIFFT)
   add_definitions(-DAMD_OPT_MPIFFT)
 endif()
 
+if(ENABLE_AMD_MPI_VADER_LIMIT)
+  add_definitions(-DAMD_MPI_VADER_LIMIT_SET)
+endif()
+
 if(ENABLE_MPI)
   if(ENABLE_QUAD_PRECISION)
     message (FATAL_ERROR "quad precision is not supported in MPI")
@@ -96,9 +103,25 @@ if(ENABLE_MPI)
 endif()
 
 if(ENABLE_AMD_TRANS)
-  add_definitions(-DAMD_OPT_TRANS)
+    if(ENABLE_AMD_OPT)
+  	add_definitions(-DAMD_OPT_TRANS)
+    else()
+	message(FATAL_ERROR "Main optimization switch ENABLE_AMD_OPT must be enabled to enable this option.")
+    endif()
 endif()
 
+if (ENABLE_AMD_APP_OPT)
+  if (NOT (ENABLE_MPI OR  ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE))
+    if(ENABLE_AMD_OPT)
+      add_definitions(-DAMD_APP_OPT_LAYER)
+    else ()
+      message(FATAL_ERROR "Main optimization switch ENABLE_AMD_OPT must be enabled to enable this option.")
+    endif ()
+  else ()
+    message(FATAL_ERROR "AMD application optimization layer is not supported for MPI execution and in Quad or Long double precisions.")
+  endif ()
+endif ()
+
 option (DISABLE_FORTRAN "Disable Fortran wrapper routines" OFF)
 
 if (CMAKE_C_COMPILER_ID MATCHES MSVC OR CMAKE_C_COMPILER_ID MATCHES Clang)
@@ -127,6 +150,7 @@ message(ENABLE_AVX : ${ENABLE_AVX})
 message(ENABLE_AVX2 : ${ENABLE_AVX2})
 message(ENABLE_AMD_OPT : ${ENABLE_AMD_OPT})
 message(ENABLE_AMD_MPIFFT : ${ENABLE_AMD_MPIFFT})
+message(ENABLE_AMD_MPI_VADER_LIMIT : ${ENABLE_AMD_MPI_VADER_LIMIT})
 message(ENABLE_MPI : ${ENABLE_MPI})
 message(ENABLE_AMD_TRANS : ${ENABLE_AMD_TRANS})
 message(DISABLE_FORTRAN : ${DISABLE_FORTRAN})
@@ -136,6 +160,7 @@ message(CMAKE_C_FLAGS_RELEASE : ${CMAKE_C_FLAGS_RELEASE})
 message(CMAKE_C_COMPILER_FLAGS : ${CMAKE_C_COMPILER_FLAGS})
 message(ENABLE_AMD_FAST_PLANNER : ${ENABLE_AMD_FAST_PLANNER})
 message(ENABLE_AMD_TOP_N_planner : ${ENABLE_AMD_TOP_N_planner})
+message(ENABLE_AMD_APP_OPT : ${ENABLE_AMD_APP_OPT})
 
 include(GNUInstallDirs)
 
@@ -218,7 +243,7 @@ if (MSVC)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 endif(MSVC)
 
-add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.0.1")
+add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.1")
 
 find_library (LIBM_LIBRARY NAMES m)
 if (LIBM_LIBRARY)
@@ -488,7 +513,7 @@ endif ()
 
 if (ENABLE_AMD_FAST_PLANNER)
   if (NOT (ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE))
-    set (AMD_OPT_FAST_PLANNER TRUE)
+    add_definitions(-DAMD_OPT_FAST_PLANNER)
   else ()
     message(FATAL_ERROR "AMD_FAST_PLANNER cannot be set for Quad and Long Double precision")
   endif ()
@@ -501,7 +526,7 @@ if (ENABLE_AMD_TOP_N_planner)
 	if (NOT (ENABLE_AMD_FAST_PLANNER))
 		# Check if amd-top-n-planner is enabled with mpi, openmp or threads
 		if (NOT (ENABLE_MPI OR ENABLE_OPENMP OR ENABLE_THREADS))
-			set (AMD_OPT_TOP_N_PLANNER TRUE)
+			add_definitions(-DAMD_OPT_TOP_N_PLANNER)
 		else ()
 			message(FATAL_ERROR "AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode")
 		endif ()

diff --git a/COPYRIGHT b/COPYRIGHT
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by

diff --git a/README_AMD.md b/README_AMD.md
@@ -6,17 +6,18 @@ AMD EPYC CPUs. It is developed on top of FFTW (version fftw-3.3.8).
 All known features and functionalities of FFTW are retained and supported
 as it is with this AMD optimized FFTW library.
 
-AMD Optimized FFTW achieves higher performance than the FFTW 3.3.8 due to
+AMD Optimized FFTW achieves higher performance than the FFTW 3.3.8 due to its
 various optimizations involving improved SIMD Kernel functions, improved copy
 functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan),
 improved 256-bit kernels selection by Planner and an optional in-place 
 transpose for large problem sizes. AMD Optimized FFTW improves the performance
-of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI
+of in-place MPI FFTs over FFTW 3.3.8 by employing a faster in-place MPI
 transpose function. AMD Optimized FFTW provides a new fast planner as an
 extension to the original planner that improves planning time of various
-planning modes in general and PATIENT mode in particular. As of AMD FFTW 3.0.1,
-a new feature called Top N planner is introduced that minimizes single-threaded
-run-to-run variations.
+planning modes in general and PATIENT mode in particular. Another new planning
+mode called Top N planner is made available that minimizes single-threaded
+run-to-run variations. As of AMD FFTW 3.1, a feature called AMD's application
+optimization layer is introduced to speed up HPC and scientific applications.
 
 FFTW is a free collection of fast C routines for computing the
 Discrete Fourier Transform and various special cases thereof in one or more
@@ -37,7 +38,7 @@ INSTALLATION FROM AMD Optimized FFTW GIT REPOSITORY:
 
 After downloading the latest stable release from the git repository,
 https://github.com/amd/amd-fftw, follow the below steps to configure and
-build it for AMD EPYC processor based on Naples, Rome and future 
+build it for AMD EPYC processors based on Naples, Rome, Milan and future 
 generation architectures.
 
      ./configure --enable-sse2 --enable-avx --enable-avx2 
@@ -48,7 +49,10 @@ generation architectures.
      make install
 
 The configure option "--enable-amd-opt" enables all the improvements and 
-optimizations targeted for AMD EPYC CPUs.
+optimizations targeted for AMD EPYC CPUs. To use the various optional
+configure options provided for AMD EPYC CPUs, the master optimization switch
+"--enable-amd-opt" must also be enabled.
+
 When enabling configure option "--enable-amd-opt", do not use the 
 configure option "--enable-generic-simd128" or "--enable-generic-simd256".
 
@@ -61,10 +65,21 @@ configure option, the user needs to set --mca btl_vader_eager_limit
 appropriately (current preference is 65536) in the MPIRUN command.
 
 The new fast planner can be enabled using optional configure option 
-"--enable-amd-fast-planner". It is supported for single and double precisions.
+"--enable-amd-fast-planner". It is supported in single and double precisions.
+
+Top N planner mode can be enabled using optional configure option
+"--enable-amd-top-n-planner" to minimize run-to-run variations in performance.
+It is supported in single-threaded execution in single and double precisions.
 
 An optional configure option "AMD_ARCH" is supported that can be set to CPU 
-architecture values like "auto" or "znver1" or "znver2" for AMD EPYC processors.
+architecture values like "auto" or "znver1" or "znver2" or "znver3" for AMD 
+EPYC processors.
+
+The optional configure option "--enable-amd-app-opt" turns on AMD's application
+optimization layer to benefit performance of HPC and scientific applications.
+Currently it is developed for complex and real (r2c and c2r) DFT problem types
+in double and single precisions. It is not supported for MPI FFTs, r2r real DFT
+problem types, Quad or Long double precisions, and split array format.
 
 An optional configure option "--enable-amd-trans" is provided that may benefit
 the performance of transpose operations in case of very large FFT problem sizes.