Skip to content

Commit

Permalink
Merge 3.1 release branch into amd-fftw
Browse files Browse the repository at this point in the history
  • Loading branch information
BiplabRaut committed Dec 13, 2021
2 parents 764197a + f2c126b commit 2b0bbb5
Show file tree
Hide file tree
Showing 12 changed files with 643 additions and 61 deletions.
41 changes: 33 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ if (CMAKE_C_COMPILER_ID MATCHES Clang)
if ("${AMD_ARCH}" STREQUAL "")
message(FATAL_ERROR "Machine arch missing! Select one of znver1, znver2 or znver3")
elseif (${AMD_ARCH} STREQUAL "znver1")
add_definitions("-march=znver1")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver1")
elseif (${AMD_ARCH} STREQUAL "znver2")
add_definitions("-march=znver2")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver2")
elseif (${AMD_ARCH} STREQUAL "znver3")
add_definitions("-march=znver3")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=znver3")
else ()
message(FATAL_ERROR "Unsupported Machine arch! Select one of znver1, znver2 or znver3")
endif ()
Expand All @@ -47,14 +47,17 @@ option (ENABLE_AVX "Compile with AVX instruction set support" OFF)
option (ENABLE_AVX2 "Compile with AVX2 instruction set support" ON)

option (ENABLE_AMD_OPT "Enable AMD specific optimization" OFF)
option (ENABLE_AMD_MPIFFT "Compile with AMD MPIFFT support" OFF)
option (ENABLE_AMD_MPIFFT "Enable AMD MPI FFT optimizations" OFF)
option (ENABLE_AMD_MPI_VADER_LIMIT "Enable advanced AMD MPI optimizations for in-place transpose based on VADER LIMIT" OFF)
option (ENABLE_MPI "compile FFTW MPI library" OFF)

option (ENABLE_AMD_TRANS "Enable amd optimized transpose" OFF)

option (ENABLE_AMD_FAST_PLANNER "Enable for a faster planning time on AMD cpus" OFF)
option (ENABLE_AMD_TOP_N_planner "Enable AMD Top N Planner for AMD cpus" OFF)

option (ENABLE_AMD_APP_OPT "Enable AMD application optimization layer for HPC and scientific applications" OFF)

if(ENABLE_VERBOSE_MODE)
if(CMAKE_C_COMPILER_ID MATCHES MSVC)
set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON" FORCE)
Expand Down Expand Up @@ -87,6 +90,10 @@ if(ENABLE_AMD_MPIFFT)
add_definitions(-DAMD_OPT_MPIFFT)
endif()

# When the ENABLE_AMD_MPI_VADER_LIMIT option is switched on, pass the
# AMD_MPI_VADER_LIMIT_SET preprocessor definition to the compiler.
if (ENABLE_AMD_MPI_VADER_LIMIT)
  add_definitions (-DAMD_MPI_VADER_LIMIT_SET)
endif ()

if(ENABLE_MPI)
if(ENABLE_QUAD_PRECISION)
message (FATAL_ERROR "quad precision is not supported in MPI")
Expand All @@ -96,9 +103,25 @@ if(ENABLE_MPI)
endif()

if(ENABLE_AMD_TRANS)
add_definitions(-DAMD_OPT_TRANS)
if(ENABLE_AMD_OPT)
add_definitions(-DAMD_OPT_TRANS)
else()
message(FATAL_ERROR "Main optimization switch ENABLE_AMD_OPT must be enabled to enable this option.")
endif()
endif()

# AMD application optimization layer (ENABLE_AMD_APP_OPT).
# The request is rejected with a fatal error when MPI, quad precision or
# long double is enabled, and again when the main ENABLE_AMD_OPT switch is
# off. Only when both checks pass is AMD_APP_OPT_LAYER defined.
if (ENABLE_AMD_APP_OPT)
  if (ENABLE_MPI OR ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE)
    # Unsupported combination: checked first, before the main switch.
    message(FATAL_ERROR "AMD application optimization layer is not supported for MPI execution and in Quad or Long double precisions.")
  elseif (NOT ENABLE_AMD_OPT)
    message(FATAL_ERROR "Main optimization switch ENABLE_AMD_OPT must be enabled to enable this option.")
  else ()
    add_definitions(-DAMD_APP_OPT_LAYER)
  endif ()
endif ()

option (DISABLE_FORTRAN "Disable Fortran wrapper routines" OFF)

if (CMAKE_C_COMPILER_ID MATCHES MSVC OR CMAKE_C_COMPILER_ID MATCHES Clang)
Expand Down Expand Up @@ -127,6 +150,7 @@ message(ENABLE_AVX : ${ENABLE_AVX})
message(ENABLE_AVX2 : ${ENABLE_AVX2})
message(ENABLE_AMD_OPT : ${ENABLE_AMD_OPT})
message(ENABLE_AMD_MPIFFT : ${ENABLE_AMD_MPIFFT})
message(ENABLE_AMD_MPI_VADER_LIMIT : ${ENABLE_AMD_MPI_VADER_LIMIT})
message(ENABLE_MPI : ${ENABLE_MPI})
message(ENABLE_AMD_TRANS : ${ENABLE_AMD_TRANS})
message(DISABLE_FORTRAN : ${DISABLE_FORTRAN})
Expand All @@ -136,6 +160,7 @@ message(CMAKE_C_FLAGS_RELEASE : ${CMAKE_C_FLAGS_RELEASE})
message(CMAKE_C_COMPILER_FLAGS : ${CMAKE_C_COMPILER_FLAGS})
message(ENABLE_AMD_FAST_PLANNER : ${ENABLE_AMD_FAST_PLANNER})
message(ENABLE_AMD_TOP_N_planner : ${ENABLE_AMD_TOP_N_planner})
message(ENABLE_AMD_APP_OPT : ${ENABLE_AMD_APP_OPT})

include(GNUInstallDirs)

Expand Down Expand Up @@ -218,7 +243,7 @@ if (MSVC)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
endif(MSVC)

add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.0.1")
add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.1")

find_library (LIBM_LIBRARY NAMES m)
if (LIBM_LIBRARY)
Expand Down Expand Up @@ -488,7 +513,7 @@ endif ()

if (ENABLE_AMD_FAST_PLANNER)
if (NOT (ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE))
set (AMD_OPT_FAST_PLANNER TRUE)
add_definitions(-DAMD_OPT_FAST_PLANNER)
else ()
message(FATAL_ERROR "AMD_FAST_PLANNER cannot be set for Quad and Long Double precision")
endif ()
Expand All @@ -501,7 +526,7 @@ if (ENABLE_AMD_TOP_N_planner)
if (NOT (ENABLE_AMD_FAST_PLANNER))
# Check if amd-top-n-planner is enabled with mpi, openmp or threads
if (NOT (ENABLE_MPI OR ENABLE_OPENMP OR ENABLE_THREADS))
set (AMD_OPT_TOP_N_PLANNER TRUE)
add_definitions(-DAMD_OPT_TOP_N_PLANNER)
else ()
message(FATAL_ERROR "AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode")
endif ()
Expand Down
1 change: 1 addition & 0 deletions COPYRIGHT
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
* Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
Expand Down
33 changes: 24 additions & 9 deletions README_AMD.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@ AMD EPYC CPUs. It is developed on top of FFTW (version fftw-3.3.8).
All known features and functionalities of FFTW are retained and supported
as is by this AMD optimized FFTW library.

AMD Optimized FFTW achieves higher performance than the FFTW 3.3.8 due to
AMD Optimized FFTW achieves higher performance than the FFTW 3.3.8 due to its
various optimizations involving improved SIMD Kernel functions, improved copy
functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan),
improved 256-bit kernels selection by Planner and an optional in-place
transpose for large problem sizes. AMD Optimized FFTW improves the performance
of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI
of in-place MPI FFTs over FFTW 3.3.8 by employing a faster in-place MPI
transpose function. AMD Optimized FFTW provides a new fast planner as an
extension to the original planner that improves planning time of various
planning modes in general and PATIENT mode in particular. As of AMD FFTW 3.0.1,
a new feature called Top N planner is introduced that minimizes single-threaded
run-to-run variations.
planning modes in general and PATIENT mode in particular. Another new planning
mode called Top N planner is made available that minimizes single-threaded
run-to-run variations. As of AMD FFTW 3.1, a feature called AMD's application
optimization layer is introduced to speed up HPC and scientific applications.

FFTW is a free collection of fast C routines for computing the
Discrete Fourier Transform and various special cases thereof in one or more
Expand All @@ -37,7 +38,7 @@ INSTALLATION FROM AMD Optimized FFTW GIT REPOSITORY:

After downloading the latest stable release from the git repository,
https://github.com/amd/amd-fftw, follow the below steps to configure and
build it for AMD EPYC processors based on Naples, Rome and future
build it for AMD EPYC processors based on Naples, Rome, Milan and future
generation architectures.

./configure --enable-sse2 --enable-avx --enable-avx2
Expand All @@ -48,7 +49,10 @@ generation architectures.
make install

The configure option "--enable-amd-opt" enables all the improvements and
optimizations targeted for AMD EPYC CPUs.
optimizations targeted for AMD EPYC CPUs. To use any of the other optional
configure options provided for AMD EPYC CPUs, the master optimization switch
"--enable-amd-opt" must also be enabled.

When enabling configure option "--enable-amd-opt", do not use the
configure option "--enable-generic-simd128" or "--enable-generic-simd256".

Expand All @@ -61,10 +65,21 @@ configure option, the user needs to set --mca btl_vader_eager_limit
appropriately (current preference is 65536) in the MPIRUN command.

The new fast planner can be enabled using optional configure option
"--enable-amd-fast-planner". It is supported for single and double precisions.
"--enable-amd-fast-planner". It is supported in single and double precisions.

Top N planner mode can be enabled using optional configure option
"--enable-amd-top-n-planner" to minimize run-to-run variations in performance.
It is supported in single-threaded execution in single and double precisions.

An optional configure option "AMD_ARCH" is supported that can be set to CPU
architecture values like "auto" or "znver1" or "znver2" for AMD EPYC processors.
architecture values like "auto" or "znver1" or "znver2" or "znver3" for AMD
EPYC processors.

The optional configure option "--enable-amd-app-opt" turns on AMD's application
optimization layer to improve the performance of HPC and scientific applications.
Currently it is developed for complex and real (r2c and c2r) DFT problem types
in double and single precisions. It is not supported for MPI FFTs, r2r real DFT
problem types, Quad or Long double precisions, and split array format.

An optional configure option "--enable-amd-trans" is provided that may benefit
the performance of transpose operations in case of very large FFT problem sizes.
Expand Down
Loading

0 comments on commit 2b0bbb5

Please sign in to comment.