diff --git a/README.md b/README.md new file mode 120000 index 00000000..0742ec59 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +README_AMD.md \ No newline at end of file diff --git a/README_AMD.md b/README_AMD.md new file mode 100644 index 00000000..b1f438e3 --- /dev/null +++ b/README_AMD.md @@ -0,0 +1,69 @@ +AMD OPTIMIZED FFTW +------------------ + +AMD Optimized FFTW is an optimized FFTW implementation targeted at +AMD EPYC CPUs. It is developed on top of FFTW (version fftw-3.3.8). +All known features and functionality of FFTW are retained and supported +unchanged in this AMD Optimized FFTW library. + +AMD Optimized FFTW achieves higher performance than FFTW 3.3.8 due to +various optimizations involving improved SIMD kernel functions, improved copy +functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan), +improved 256-bit kernel selection by the planner and an optional in-place +transpose for large problem sizes. + +FFTW is a free collection of fast C routines for computing the +Discrete Fourier Transform and various special cases thereof in one or more +dimensions. It includes complex, real, symmetric, and parallel transforms, +and can handle arbitrary array sizes efficiently. + +The doc/ directory contains the manual in texinfo, PDF, info, and HTML +formats. Frequently asked questions and answers can be found in the +doc/FAQ/ directory in ASCII and HTML. + +For a quick introduction to calling FFTW, see the "Tutorial" section +of the manual. + +INSTALLATION +------------ + +INSTALLATION FROM AMD Optimized FFTW GIT REPOSITORY: + +After downloading the latest stable release from the git repository, +https://github.com/amd/amd-fftw, follow the steps below to configure and +build it for AMD EPYC processors based on the Naples, Rome and future +generation architectures. 
+ + ./configure --enable-sse2 --enable-avx --enable-avx2 \ + --enable-mpi --enable-openmp --enable-shared \ + --enable-amd-opt \ + --prefix=<your-install-dir> + make + make install + +The configure option "--enable-amd-opt" enables all the improvements and +optimizations targeted for AMD EPYC CPUs. +When enabling the configure option "--enable-amd-opt", do not use the +configure options "--enable-generic-simd128" or "--enable-generic-simd256". + +An optional configure option "--enable-amd-trans" is provided that may benefit +the performance of transpose operations in case of very large FFT problem sizes. +It is not enabled by default and is provided as an experimental, optional switch. + +By default, the configure script enables double-precision mode. Users should pass +the appropriate configure options to enable single-precision, quad-precision +or long-double mode. + +CONTACTS +-------- + +AMD Optimized FFTW is developed and maintained by AMD. +You can contact us by email at aoclsupport@amd.com. +You can also raise issues or suggestions on the GitHub repository at +https://github.com/amd/amd-fftw/issues + +ACKNOWLEDGEMENTS +---------------- + +FFTW was developed by Matteo Frigo and Steven G. Johnson. We thank Matteo Frigo +for the support he provided to us. 
diff --git a/README b/README_MIT similarity index 100% rename from README rename to README_MIT diff --git a/configure b/configure index a9e8aaf7..c1d01b88 100755 --- a/configure +++ b/configure @@ -17447,15 +17447,17 @@ else fi if test "$have_amd_opt" = yes; then - if test "$CC" = "clang"; then - CFLAGS="$CFLAGS -mavx2 -mfma" - else - GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) - AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) - if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + if test "$ac_test_CFLAGS" != "set"; then + if test "$CC" = "clang"; then + CFLAGS="$CFLAGS -mavx2 -mfma" else - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) + AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) + if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi fi fi diff --git a/configure.ac b/configure.ac index ee7189af..6668c7d5 100644 --- a/configure.ac +++ b/configure.ac @@ -617,15 +617,17 @@ fi dnl amd optimization switch and CFLAGS setting based on config arg option --enable-amd-opt AC_ARG_ENABLE(amd-opt, [AC_HELP_STRING([--enable-amd-opt],[enable AMD cpu specific optimizations])], have_amd_opt=$enableval, have_amd_opt=no) if test "$have_amd_opt" = yes; then - if test "$CC" = "clang"; then - CFLAGS="$CFLAGS -mavx2 -mfma" - else - GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) - AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 
model|cut -f2 -d:` \>= 48) - if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then - CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + if test "$ac_test_CFLAGS" != "set"; then + if test "$CC" = "clang"; then + CFLAGS="$CFLAGS -mavx2 -mfma" else - CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + GCCVERSION=$(expr `gcc -dumpversion | cut -f1 -d.` \>= 9) + AMDZEN2MODEL=$(expr `cat /proc/cpuinfo | grep -m1 model|cut -f2 -d:` \>= 48) + if test "$GCCVERSION" = "1" && test "$AMDZEN2MODEL" = "1"; then + CFLAGS="$CFLAGS -march=znver2 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + else + CFLAGS="$CFLAGS -march=znver1 -mavx2 -mno-avx256-split-unaligned-store -mno-avx256-split-unaligned-load -mno-prefer-avx128 -mfma" + fi fi fi AC_DEFINE(AMD_OPT_ALL,1,[Define to enable AMD cpu specific optimizations.])