From b3f32d2992291cabf0106097d19765632ece0a54 Mon Sep 17 00:00:00 2001
From: Eashan Dash <Eashan.Dash@amd.com>
Date: Mon, 18 Jan 2021 22:40:51 +0530
Subject: [PATCH 1/6] This change adds a new AMD optimized Top N Planner
 feature to FFTW.

1) The new Top N planner improves the run-to-run variations by using a dynamic wisdom (preset) plan functionality added under FFTW's OWISDOM planning mode.
   This feature implements the mechanism to search and store top N plans into the wisdom file and then use these plans to find the best plan for execution in the consecutive runs.
2) The configuration user option --enable-amd-top-n-planner enables the Top N planner feature.
   The macro AMD_TOP_N_PLANNER enables all the code implementation for this new feature.
   The value of N (Top N planner) is fixed as 3 by the macro AMD_OPT_TOP_N.
3) Code changes are contained in files : - api/apiplan.c, kernel/planner.c, kernel/ifftw.h, libbench2/speed.c
   Files configure.ac, configure and config.h.in are modified to enable option --enable-amd-top-n-planner.
4) Top N planner feature is only supported for single threaded execution for now.

This code change relates to Jira task AMD-Internal : [CPUPL-800]

Change-Id: I0a7b68fb61b79b31e64f9dd3c966287d53a58a49
---
 api/apiplan.c     |  50 ++++++++
 config.h.in       |   3 +
 configure         |  65 ++++++----
 configure.ac      |  24 +++-
 kernel/ifftw.h    |  24 +++-
 kernel/planner.c  | 293 ++++++++++++++++++++++++++++++++++++++++++++--
 libbench2/speed.c |   3 +-
 7 files changed, 424 insertions(+), 38 deletions(-)

diff --git a/api/apiplan.c b/api/apiplan.c
index a9b31e24..99921b76 100644
--- a/api/apiplan.c
+++ b/api/apiplan.c
@@ -29,10 +29,59 @@ void X(set_planner_hooks)(planner_hook_t before, planner_hook_t after)
      after_planner_hook = after;
 }
 
+#ifdef AMD_TOP_N_PLANNER
+plan *plans[AMD_OPT_TOP_N];
+static int find_lowcost_plan()
+{
+    int i, lowcost, lowcost_idx;
+    lowcost = plans[0]->pcost;
+    lowcost_idx = 0;
+
+    for (i = 1; i < AMD_OPT_TOP_N; i++) {
+         if (plans[i]->pcost < lowcost) {
+              lowcost = plans[i]->pcost;
+              lowcost_idx = i;
+         }
+    }
+    return lowcost_idx;
+}
+#endif
+
 static plan *mkplan0(planner *plnr, unsigned flags,
 		     const problem *prb, unsigned hash_info,
 		     wisdom_state_t wisdom_state)
 {
+#ifdef AMD_TOP_N_PLANNER
+/* map API flags into FFTW flags */
+     X(mapflags)(plnr, flags);
+
+     plnr->flags.hash_info = hash_info;
+     plnr->wisdom_state = wisdom_state;
+     
+     /* create plan */
+
+     if (AMD_OPT_TOP_N > 1) {
+          if (wisp_set == 1) {
+               for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+                    plnr->index = pln_idx;
+	            plans[pln_idx] = plnr->adt->mkplan(plnr, prb);
+               }
+               lowcost_idx = find_lowcost_plan(plans);
+               return plans[lowcost_idx];
+          }
+          else {        
+               for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+                    plnr->index = pln_idx;
+	            plans[pln_idx] = plnr->adt->mkplan(plnr, prb);
+               }	   
+	       return plans[0];
+          }
+     }   
+     else {
+          plnr->index = 0;
+          return plnr->adt->mkplan(plnr, prb);
+     }	
+#else	
      /* map API flags into FFTW flags */
      X(mapflags)(plnr, flags);
 
@@ -41,6 +90,7 @@ static plan *mkplan0(planner *plnr, unsigned flags,
 
      /* create plan */
      return plnr->adt->mkplan(plnr, prb);
+#endif     
 }
 
 static unsigned force_estimator(unsigned flags)
diff --git a/config.h.in b/config.h.in
index a7d84765..feb022df 100644
--- a/config.h.in
+++ b/config.h.in
@@ -9,6 +9,9 @@
 /* Define to enable AMD cpu optimized MPI FFT. */
 #undef AMD_OPT_MPIFFT
 
+/* Define to enable AMD Top N Planner for AMD cpus. */
+#undef AMD_OPT_TOP_N_PLANNER
+
 /* Define to enable AMD cpu optimized Transpose. */
 #undef AMD_OPT_TRANS
 
diff --git a/configure b/configure
index 26737ea2..5a6d02b0 100755
--- a/configure
+++ b/configure
@@ -877,11 +877,12 @@ enable_mpi
 enable_amd_opt
 enable_amd_trans
 enable_amd_mpifft
+enable_openmp
+enable_threads
 enable_amd_fast_planner
+enable_amd_top_n_planner
 enable_fortran
 with_g77_wrappers
-enable_openmp
-enable_threads
 with_combined_threads
 '
       ac_precious_vars='build_alias
@@ -1587,12 +1588,15 @@ Optional Features:
   --enable-amd-opt        enable AMD cpu specific optimizations
   --enable-amd-trans      enable AMD cpu optimized Transpose
   --enable-amd-mpifft     enable AMD cpu optimized MPI FFT
+  --enable-openmp         use OpenMP directives for parallelism
+  --enable-threads        compile FFTW SMP threads library
   --enable-amd-fast-planner
                           enable AMD Fast Planner for a faster planning time
                           on AMD cpus
+  --enable-amd-top-n-planner
+                          enable AMD Top N Planner feature to search, save and
+                          reuse top N plans using wisdom file on AMD cpus
   --disable-fortran       don't include Fortran-callable wrappers
-  --enable-openmp         use OpenMP directives for parallelism
-  --enable-threads        compile FFTW SMP threads library
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -17584,6 +17588,20 @@ if test "$have_amd_mpifft" = yes; then
 $as_echo "#define AMD_OPT_MPIFFT 1" >>confdefs.h
 
 fi
+# Check whether --enable-openmp was given.
+if test "${enable_openmp+set}" = set; then :
+  enableval=$enable_openmp; enable_openmp=$enableval
+else
+  enable_openmp=no
+fi
+
+# Check whether --enable-threads was given.
+if test "${enable_threads+set}" = set; then :
+  enableval=$enable_threads; enable_threads=$enableval
+else
+  enable_threads=no
+fi
+
 
 $as_echo "#define AOCL_FFTW_VERSION \"AOCL-3.0\"" >>confdefs.h
 
@@ -17594,10 +17612,32 @@ else
   have_amd_fast_planner=no
 fi
 
-if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+# Check whether --enable-amd-top-n-planner was given.
+if test "${enable_amd_top_n_planner+set}" = set; then :
+  enableval=$enable_amd_top_n_planner; have_amd_top_n_planner=$enableval
+else
+  have_amd_top_n_planner=no
+fi
+
+# Check if both amd-fast-planner and amd-top-n-planner are enabled together
+if test "$have_amd_fast_planner" = yes && test "$have_amd_top_n_planner" = yes; then
+	as_fn_error $? "AMD_FAST_PLANNER and AMD_TOP_N_PLANNER can not be enabled together" "$LINENO" 5
+else
+        if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
 
 $as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h
 
+        fi
+	# Check if amd-top-n-planner is enabled with mpi, openmp or threads
+	if (test "$enable_mpi" = "yes" && test "$have_amd_top_n_planner" = yes ) || (test "$enable_openmp" = "yes" && test "$have_amd_top_n_planner" = yes) || (test "$enable_threads" = "yes" && test "$have_amd_top_n_planner" = yes); then
+		as_fn_error $? "AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode" "$LINENO" 5
+	else
+                if test "$have_amd_top_n_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+
+$as_echo "#define AMD_OPT_TOP_N_PLANNER 1" >>confdefs.h
+
+                fi
+        fi
 fi
 
 if test "$USE_MAINTAINER_MODE" = yes; then
@@ -21604,13 +21644,6 @@ $as_echo "#define WITH_G77_WRAPPERS 1" >>confdefs.h
 fi
 
 have_smp="no"
-# Check whether --enable-openmp was given.
-if test "${enable_openmp+set}" = set; then :
-  enableval=$enable_openmp; enable_openmp=$enableval
-else
-  enable_openmp=no
-fi
-
 
 if test "$enable_openmp" = "yes"; then
 
@@ -21687,14 +21720,6 @@ fi
 
 fi
 
-# Check whether --enable-threads was given.
-if test "${enable_threads+set}" = set; then :
-  enableval=$enable_threads; enable_threads=$enableval
-else
-  enable_threads=no
-fi
-
-
 if test "$enable_threads" = "yes"; then
 
 $as_echo "#define HAVE_THREADS 1" >>confdefs.h
diff --git a/configure.ac b/configure.ac
index 5f636da6..e9442832 100644
--- a/configure.ac
+++ b/configure.ac
@@ -722,12 +722,29 @@ AC_ARG_ENABLE(amd-mpifft, [AC_HELP_STRING([--enable-amd-mpifft],[enable AMD cpu
 if test "$have_amd_mpifft" = yes; then
 	AC_DEFINE(AMD_OPT_MPIFFT,1,[Define to enable AMD cpu optimized MPI FFT.])
 fi
+AC_ARG_ENABLE(openmp, [AC_HELP_STRING([--enable-openmp],[use OpenMP directives for parallelism])], enable_openmp=$enableval, enable_openmp=no)
+AC_ARG_ENABLE(threads, [AC_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no)
 dnl aocl version number of amd-fftw
 AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-3.0",[AOCL Version of AMD-FFTW])
 dnl amd optimization switch to enable AMD Fast Planner for AMD cpus --enable-amd-fast-planner
 AC_ARG_ENABLE(amd-fast-planner, [AC_HELP_STRING([--enable-amd-fast-planner],[enable AMD Fast Planner for a faster planning time on AMD cpus])], have_amd_fast_planner=$enableval, have_amd_fast_planner=no)
-if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
-	AC_DEFINE(AMD_OPT_FAST_PLANNER,1,[Define to enable AMD Fast Planner for AMD cpus.])
+dnl amd optimization switch to enable AMD Top N Planner for AMD cpus --enable-amd-top-n-planner
+AC_ARG_ENABLE(amd-top-n-planner, [AC_HELP_STRING([--enable-amd-top-n-planner],[enable AMD Top N Planner feature to search, save and reuse top N plans using wisdom file on AMD cpus])], have_amd_top_n_planner=$enableval, have_amd_top_n_planner=no)
+# Check if both amd-fast-planner and amd-top-n-planner are enabled together 
+if test "$have_amd_fast_planner" = yes && test "$have_amd_top_n_planner" = yes; then 
+	AC_MSG_ERROR([AMD_FAST_PLANNER and AMD_TOP_N_PLANNER can not be enabled together])
+else
+        if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+	        AC_DEFINE(AMD_OPT_FAST_PLANNER,1,[Define to enable AMD Fast Planner for AMD cpus.])
+        fi
+	# Check if amd-top-n-planner is enabled with mpi, openmp or threads
+	if (test "$enable_mpi" = "yes" && test "$have_amd_top_n_planner" = yes ) || (test "$enable_openmp" = "yes" && test "$have_amd_top_n_planner" = yes) || (test "$enable_threads" = "yes" && test "$have_amd_top_n_planner" = yes); then
+		AC_MSG_ERROR([AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode])
+	else	
+                if test "$have_amd_top_n_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+                        AC_DEFINE(AMD_OPT_TOP_N_PLANNER,1,[Define to enable AMD Top N Planner for AMD cpus.])
+                fi
+        fi		
 fi
 
 dnl check for a proper indent in maintainer mode
@@ -789,15 +806,12 @@ fi
 
 dnl -----------------------------------------------------------------------
 have_smp="no"
-AC_ARG_ENABLE(openmp, [AC_HELP_STRING([--enable-openmp],[use OpenMP directives for parallelism])], enable_openmp=$enableval, enable_openmp=no)
 
 if test "$enable_openmp" = "yes"; then
    AC_DEFINE(HAVE_OPENMP,1,[Define to enable OpenMP])
    AX_OPENMP([], [AC_MSG_ERROR([don't know how to enable OpenMP])])
 fi
 
-AC_ARG_ENABLE(threads, [AC_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no)
-
 if test "$enable_threads" = "yes"; then
    AC_DEFINE(HAVE_THREADS,1,[Define to enable SMP threads])
 fi
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index 7b8a2977..62a9d90b 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -142,8 +142,17 @@ extern "C"
 #define AMD_HASH_UNBLESS_MAX_SIZE 10485760
 #endif
 
-#endif//#ifdef AMD_OPT_ALL
-//============================================================
+//NEW TOP N PLANNER feature for AMD CPUs can be enabled with the below switch AMD_TOP_N_PLANNER.
+//The new Top N planner improves the run-to-run variations by using a dynamic wisdom (preset) plan functionality.
+//This feature implements the mechanism to search and store top N plans into the wisdom file and then use these plans to find the best plan for execution in the consecutive runs.
+//AMD_TOP_N_PLANNER can not be used with AMD_FAST_PLANNER. Only one of them can be enabled at a time. 
+#ifdef AMD_OPT_TOP_N_PLANNER 
+#define AMD_TOP_N_PLANNER
+#define AMD_OPT_TOP_N 3 //The value of AMD_OPT_TOP_N is fixed as 3, enabling the search, store and re-use of Top 3 plans. This value should not be changed by the user.
+#endif
+
+#endif//#ifdef AMD_OPT_ALL  
+ //============================================================
 //AMD OPTIMIZATIONS :- end
 
 /*
@@ -813,8 +822,14 @@ struct planner_s {
 
      wisdom_state_t wisdom_state;
 
+#ifdef AMD_TOP_N_PLANNER
+     hashtab htab_blessed[AMD_OPT_TOP_N];
+     hashtab htab_unblessed[AMD_OPT_TOP_N];
+     int index;
+#else
      hashtab htab_blessed;
      hashtab htab_unblessed;
+#endif
 
      int nthr;
      flags_t flags;
@@ -832,6 +847,11 @@ struct planner_s {
      int nprob;    /* number of problems evaluated */
 };
 
+#ifdef AMD_TOP_N_PLANNER
+     int wisp_set;     /* flag to identify if the plans for an input problem size is found in the wisdom file or not*/
+     int lowcost_idx;  /* to hold the index of the plan which has the least pcost among the top N plans*/
+#endif
+
 planner *X(mkplanner)(void);
 void X(planner_destroy)(planner *ego);
 
diff --git a/kernel/planner.c b/kernel/planner.c
index 2d15b2ba..cf90cbd6 100644
--- a/kernel/planner.c
+++ b/kernel/planner.c
@@ -243,9 +243,15 @@ static solution *htab_lookup(hashtab *ht, const md5sig s,
 static solution *hlookup(planner *ego, const md5sig s, 
 			 const flags_t *flagsp)
 {
+#ifdef AMD_TOP_N_PLANNER    
+     solution *sol = htab_lookup(&ego->htab_blessed[ego->index], s, flagsp);
+     if (!sol) sol = htab_lookup(&ego->htab_unblessed[ego->index], s, flagsp);
+     return sol;	
+#else	
      solution *sol = htab_lookup(&ego->htab_blessed, s, flagsp);
      if (!sol) sol = htab_lookup(&ego->htab_unblessed, s, flagsp);
      return sol;
+#endif     
 }
 
 static void fill_slot(hashtab *ht, const md5sig s, const flags_t *flagsp,
@@ -393,8 +399,13 @@ static void htab_insert(hashtab *ht, const md5sig s, const flags_t *flagsp,
 static void hinsert(planner *ego, const md5sig s, const flags_t *flagsp, 
 		    unsigned slvndx)
 {
+#ifdef AMD_TOP_N_PLANNER
+     htab_insert(BLISS(*flagsp) ? &ego->htab_blessed[ego->index] : &ego->htab_unblessed[ego->index],
+		 s, flagsp, slvndx );
+#else	
      htab_insert(BLISS(*flagsp) ? &ego->htab_blessed : &ego->htab_unblessed,
 		 s, flagsp, slvndx );
+#endif     
 }
 
 
@@ -522,6 +533,189 @@ static int timeout_p(planner *ego, const problem *p)
 static plan *search0(planner *ego, const problem *p, unsigned *slvndx, 
 		     const flags_t *flagsp)
 {
+#ifdef AMD_TOP_N_PLANNER
+     wisp_set = 0;
+     if (ego->index == 0) {
+          plan *best = 0;
+          int best_not_yet_timed = 1;
+
+          /* Do not start a search if the planner timed out. This check is
+	     necessary, lest the relaxation mechanism kick in */ 
+          if (timeout_p(ego, p))
+	       return 0;
+
+          FORALL_SOLVERS_OF_KIND(p->adt->problem_kind, ego, s, sp, {
+	       plan *pln;
+
+	       pln = invoke_solver(ego, p, s, flagsp);
+
+	       if (ego->need_timeout_check) 
+	            if (timeout_p(ego, p)) {
+		         X(plan_destroy_internal)(pln);
+		         X(plan_destroy_internal)(best);
+		         return 0;
+	            }
+
+	       if (pln) {
+	            /* read COULD_PRUNE_NOW_P because PLN may be destroyed
+		       before we use COULD_PRUNE_NOW_P */
+	            int could_prune_now_p = pln->could_prune_now_p;
+
+	            if (best) {
+		         if (best_not_yet_timed) {
+			      evaluate_plan(ego, best, p);
+			      best_not_yet_timed = 0;
+		         }
+		         evaluate_plan(ego, pln, p);
+		         if (pln->pcost < best->pcost) {
+			      X(plan_destroy_internal)(best);
+			      best = pln;
+                              *slvndx = (unsigned)(sp - ego->slvdescs);
+		         } else {
+			      X(plan_destroy_internal)(pln);
+		         }
+	            } else {
+		         best = pln;
+                         *slvndx = (unsigned)(sp - ego->slvdescs);                    
+	            }
+
+	            if (ALLOW_PRUNINGP(ego) && could_prune_now_p) 
+		         break;
+	       } 
+          });
+
+          return best;
+     }     
+     else {          
+          plan *best = 0;
+          int best_not_yet_timed = 1;
+    
+          plan *best2 = 0;
+          plan *best3 = 0;
+
+          int slvndx2 = 0;
+          int slvndx3 = 0;
+
+          int best2_not_yet_timed = 1;
+          int best3_not_yet_timed = 1;
+
+          if (timeout_p(ego, p))
+               return 0;
+
+          FORALL_SOLVERS_OF_KIND(p->adt->problem_kind, ego, s, sp, {
+               plan *pln;
+
+               pln = invoke_solver(ego, p, s, flagsp);
+
+               if (ego->need_timeout_check)
+                    if (timeout_p(ego, p)) {
+                         X(plan_destroy_internal)(pln);
+                         X(plan_destroy_internal)(best);
+                         X(plan_destroy_internal)(best2);
+                         X(plan_destroy_internal)(best3);
+
+                         return 0;
+                    }
+			   
+               if (pln) {
+                    int could_prune_now_p = pln->could_prune_now_p;
+
+                    if (best && best2 && best3) {
+                         if (best_not_yet_timed) {
+                              evaluate_plan(ego, best, p);
+                              best_not_yet_timed = 0;
+                         }
+                         if (best2_not_yet_timed) {
+                              evaluate_plan(ego, best2, p);
+                              best2_not_yet_timed = 0;
+                         }
+                         if (best3_not_yet_timed) {
+                              evaluate_plan(ego, best3, p);
+                              best3_not_yet_timed = 0;
+                         }
+                         evaluate_plan(ego, pln, p);
+
+                         if (pln->pcost < best->pcost) {                        
+                              best3 = best2;                      
+                              best2 = best;
+                              slvndx3 = slvndx2;
+                              slvndx2 = *slvndx;                       
+                              best = pln;
+                              *slvndx = (unsigned)(sp - ego->slvdescs);
+                         }
+                         else if (pln->pcost < best2->pcost) {                       
+                              best3 = best2;
+                              slvndx3 = slvndx2;                       
+                              best2 = pln;
+                              slvndx2 = (unsigned)(sp - ego->slvdescs);
+                         }
+                         else if (pln->pcost < best3->pcost) {                      
+                              best3 = pln;
+                              slvndx3 = (unsigned)(sp - ego->slvdescs);
+                         }
+		         else {
+                              X(plan_destroy_internal)(pln);
+                         }
+                    } 
+                    else if (!best) {
+                         best = pln;
+                         *slvndx = (unsigned)(sp - ego->slvdescs);
+                    }
+                    else if (!best2) {
+                         if (pln->pcost < best->pcost) {
+                              best2 = best;
+                              slvndx2 = *slvndx;						
+                              best = pln;
+                              *slvndx =  (unsigned)(sp - ego->slvdescs);
+                         }
+                         else {
+                              best2 = pln;
+                              slvndx2 = (unsigned)(sp - ego->slvdescs);
+                         }
+                    }
+                    else if (!best3) {
+                         if (pln->pcost < best->pcost) {
+                              best3 = best2;
+                              slvndx3 = slvndx2;
+                              best2 = best;
+                              slvndx2 = *slvndx;
+                              best = pln;
+                              *slvndx = (unsigned)(sp - ego->slvdescs);
+                         }
+		         else if (pln->pcost < best2->pcost) {
+                              best3 = best2;
+                              slvndx3 = slvndx2;
+                              best2 = pln;
+                              slvndx2 = (unsigned)(sp - ego->slvdescs);
+                         }
+                         else {
+                              best3 = pln;
+                              slvndx3 = (unsigned)(sp - ego->slvdescs);
+                         }
+                   }
+                   if (ALLOW_PRUNINGP(ego) && could_prune_now_p)
+                        break;
+              }
+          });
+
+          if (ego->index == 1) {
+               if (best2) {
+                    *slvndx = slvndx2;
+                    return best2;
+               }
+               else
+                    return best;
+          }
+          else if (ego->index == 2) {
+               if (best3) {
+                    *slvndx = slvndx3;
+                    return best3;
+               }
+               else
+                    return best;
+          }   
+     }
+#else	
      plan *best = 0;
      int best_not_yet_timed = 1;
 
@@ -571,6 +765,7 @@ static plan *search0(planner *ego, const problem *p, unsigned *slvndx,
      });
 
      return best;
+#endif     
 }
 
 static plan *search(planner *ego, const problem *p, unsigned *slvndx, 
@@ -640,8 +835,16 @@ static plan *mkplan(planner *ego, const problem *p)
 
 
 #ifdef FFTW_DEBUG
-     check(&ego->htab_blessed);
-     check(&ego->htab_unblessed);
+    #ifdef AMD_TOP_N_PLANNER
+
+    for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+         check(&ego->htab_blessed[pln_idx]);
+         check(&ego->htab_unblessed[pln_idx]);
+    }
+    #else
+         check(&ego->htab_blessed);
+         check(&ego->htab_unblessed);
+    #endif
 #endif
 
      pln = 0;
@@ -711,7 +914,11 @@ static plan *mkplan(planner *ego, const problem *p)
 	       }
 	       
 	       ego->wisdom_state = owisdom_state;
-	       
+
+#ifdef AMD_TOP_N_PLANNER
+	       if (wisp_set && AMD_OPT_TOP_N > 1)
+		    evaluate_plan(ego, pln, p);
+#endif	       	       
 	       goto skip_search;
 	  }
 	  else if (ego->nowisdom_hook) /* for MPI, make sure lack of wisdom */
@@ -822,6 +1029,26 @@ static void forget(planner *ego, amnesia a)
 	      break;
      }
 }
+#elif defined (AMD_TOP_N_PLANNER)
+static void forget(planner *ego, amnesia a)
+{
+     switch (a) {
+	 case FORGET_EVERYTHING:
+              for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+	           htab_destroy(&ego->htab_blessed[pln_idx]);
+	           mkhashtab(&ego->htab_blessed[pln_idx]);
+	      }     
+	      /* fall through */
+	 case FORGET_ACCURSED:
+              for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+	           htab_destroy(&ego->htab_unblessed[pln_idx]);
+	           mkhashtab(&ego->htab_unblessed[pln_idx]);
+              }
+	      break;
+	 default:
+	      break;
+     }
+}
 #else
 static void forget(planner *ego, amnesia a)
 {
@@ -848,14 +1075,20 @@ static const char stimeout[] = "TIMEOUT";
 static void exprt(planner *ego, printer *p)
 {
      unsigned h;
-     hashtab *ht = &ego->htab_blessed;
+     hashtab *ht;
      md5 m;
 
+#ifdef AMD_TOP_N_PLANNER
+     for (int plan_cnt = 0; plan_cnt < AMD_OPT_TOP_N; plan_cnt++) { 
+          ht = &ego->htab_blessed[plan_cnt];
+#else
+     ht = &ego->htab_blessed;
+#endif     
      signature_of_configuration(&m, ego);
 
      p->print(p, 
 	      "(" WISDOM_PREAMBLE " #x%M #x%M #x%M #x%M\n",
-	      m.s[0], m.s[1], m.s[2], m.s[3]);
+	       m.s[0], m.s[1], m.s[2], m.s[3]);
 
      for (h = 0; h < ht->hashsiz; ++h) {
 	  solution *l = ht->solutions + h;
@@ -873,7 +1106,7 @@ static void exprt(planner *ego, printer *p)
 	       }
 
 	       /* qui salvandos salvas gratis
-		  salva me fons pietatis */
+                  salva me fons pietatis */
 	       p->print(p, "  (%s %d #x%x #x%x #x%x #x%M #x%M #x%M #x%M)\n",
 			reg_nam, reg_id, 
 			l->flags.l, l->flags.u, l->flags.timelimit_impatience, 
@@ -881,6 +1114,9 @@ static void exprt(planner *ego, printer *p)
 	  }
      }
      p->print(p, ")\n");
+#ifdef AMD_TOP_N_PLANNER     
+     } 
+#endif     
 }
 
 /* mors stupebit et natura
@@ -893,10 +1129,17 @@ static int imprt(planner *ego, scanner *sc)
      flags_t flags;
      int reg_id;
      unsigned slvndx;
-     hashtab *ht = &ego->htab_blessed;
+     hashtab *ht;
      hashtab old;
      md5 m;
 
+#ifdef AMD_TOP_N_PLANNER
+     int wis_read = 0;
+     for (int plan_cnt = 0; plan_cnt < AMD_OPT_TOP_N; plan_cnt++) {
+          ht = &ego->htab_blessed[plan_cnt];
+#else
+     ht = &ego->htab_blessed;
+#endif     
      if (!sc->scan(sc, 
 		   "(" WISDOM_PREAMBLE " #x%M #x%M #x%M #x%M\n",
 		   sig + 0, sig + 1, sig + 2, sig + 3))
@@ -949,18 +1192,35 @@ static int imprt(planner *ego, scanner *sc)
 	  CK(flags.u == u);
 	  CK(flags.timelimit_impatience == timelimit_impatience);
 
+#ifdef AMD_TOP_N_PLANNER
+	  ego->index = plan_cnt;
+#endif	  
 	  if (!hlookup(ego, sig, &flags))
 	       hinsert(ego, sig, &flags, slvndx);
      }
 
-     X(ifree0)(old.solutions);
-     return 1;
+    X(ifree0)(old.solutions);
+#ifdef AMD_TOP_N_PLANNER
+    wis_read+=1;    
+    goto wis_read;
+#else    
+    return 1;
+#endif    
 
  bad:
      /* ``The wisdom of FFTW must be above suspicion.'' */
      X(ifree0)(ht->solutions);
      *ht = old;
      return 0;
+
+#ifdef AMD_TOP_N_PLANNER     
+ wis_read:
+          if (wis_read == AMD_OPT_TOP_N) {
+               wisp_set = 1;
+               return 1;          
+	  }											
+     }     
+#endif    
 }
 
 /*
@@ -998,8 +1258,15 @@ planner *X(mkplanner)(void)
      p->need_timeout_check = 1;
      p->timelimit = -1;
 
+#ifdef AMD_TOP_N_PLANNER
+     for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+          mkhashtab(&p->htab_blessed[pln_idx]);
+          mkhashtab(&p->htab_unblessed[pln_idx]);
+     }
+#else     
      mkhashtab(&p->htab_blessed);
      mkhashtab(&p->htab_unblessed);
+#endif
 
      for (i = 0; i < PROBLEM_LAST; ++i)
 	  p->slvdescs_for_problem_kind[i] = -1;
@@ -1010,9 +1277,15 @@ planner *X(mkplanner)(void)
 void X(planner_destroy)(planner *ego)
 {
      /* destroy hash table */
+#ifdef AMD_TOP_N_PLANNER
+     for (int pln_idx = 0; pln_idx < AMD_OPT_TOP_N ; pln_idx ++) {
+          htab_destroy(&ego->htab_blessed[pln_idx]);
+          htab_destroy(&ego->htab_unblessed[pln_idx]);
+     }
+#else	
      htab_destroy(&ego->htab_blessed);
      htab_destroy(&ego->htab_unblessed);
-
+#endif
      /* destroy solvdesc table */
      FORALL_SOLVERS(ego, s, sp, {
 	  UNUSED(sp);
diff --git a/libbench2/speed.c b/libbench2/speed.c
index 23cc7cc9..947f3476 100644
--- a/libbench2/speed.c
+++ b/libbench2/speed.c
@@ -41,11 +41,12 @@ void speed(const char *param, int setup_only)
 	  t[k] = 0;
 
      p = problem_parse(param);
-     BENCH_ASSERT(can_do(p));
      if (!no_speed_allocation) {
 	  problem_alloc(p);
 	  problem_zero(p);
      }
+     BENCH_ASSERT(can_do(p)); /* Calling can_do after problem allocation to enable buffer allocation for use in evaluate_plan. This is
+                                 required for the evaluation of Top N plans from the wisdom file during single threaded execution */
 
      timer_start(LIBBENCH_TIMER);
      setup(p);

From a2080b325fbc7843fda4e81943dcf965df52a88c Mon Sep 17 00:00:00 2001
From: Anand Kumar <Anand.KumarJaiswal@amd.com>
Date: Fri, 30 Apr 2021 14:10:19 +0530
Subject: [PATCH 2/6] This change adds support for building FFTW library on
 Windows OS.

1. Added win folder which is having in-built test programs and scripts for windows OS.
2. Added support for building with MSVC and clang compilers.
3. Updated cmake for supporting AMD optimizations related flags.
4. Removed amd copyright header from win folder scripts

AMD-Internal: [CPUPL-1553]
Change-Id: I156abdf5d555ed4c95cb386460f51a8f7605c0d9
---
 CMakeLists.txt                   | 444 +++++++++++++++++++++++--------
 api/fftw3.h                      |  10 +-
 cmake.config.h.in                |   4 +-
 libbench2/bench-user.h           |   4 +-
 libbench2/util.c                 |   9 +-
 threads/openmp.c                 |   4 +-
 win/tests/README.txt             |  37 +++
 win/tests/check.pl               | 308 +++++++++++++++++++++
 win/tests/commands.yaml          |  26 ++
 win/tests/fftw_check.py          | 166 ++++++++++++
 win/tests/fftw_mpi_benchmark.bat | 301 +++++++++++++++++++++
 win/tests/fftw_mt_benchmark.bat  | 313 ++++++++++++++++++++++
 win/tests/fftw_st_benchmark.bat  | 282 ++++++++++++++++++++
 13 files changed, 1791 insertions(+), 117 deletions(-)
 create mode 100644 win/tests/README.txt
 create mode 100644 win/tests/check.pl
 create mode 100644 win/tests/commands.yaml
 create mode 100644 win/tests/fftw_check.py
 create mode 100644 win/tests/fftw_mpi_benchmark.bat
 create mode 100644 win/tests/fftw_mt_benchmark.bat
 create mode 100644 win/tests/fftw_st_benchmark.bat

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 335808a3..30577056 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required (VERSION 3.0)
+cmake_minimum_required (VERSION 3.15)
 
 if (NOT DEFINED CMAKE_BUILD_TYPE)
   set (CMAKE_BUILD_TYPE Release CACHE STRING "Build type")
@@ -10,26 +10,134 @@ if (POLICY CMP0042)
   cmake_policy (SET CMP0042 NEW)
 endif ()
 
+SET(AMD_ARCH "znver1" CACHE STRING "select AMD zen version for Clang toolchain")
+
+if (CMAKE_C_COMPILER_ID MATCHES Clang)
+  if ("${AMD_ARCH}" STREQUAL "")
+    message(FATAL_ERROR "Machine arch missing! Select one of znver1, znver2 or znver3")
+  elseif (${AMD_ARCH} STREQUAL "znver1")
+    add_definitions("-march=znver1")
+  elseif (${AMD_ARCH} STREQUAL "znver2")
+    add_definitions("-march=znver2")
+  elseif (${AMD_ARCH} STREQUAL "znver3")
+    add_definitions("-march=znver3")
+  else ()
+    message(FATAL_ERROR "Unsupported Machine arch! Select one of znver1, znver2 or znver3")
+  endif ()
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mno-prfchw")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mno-prfchw")
+endif ()
+
 option (BUILD_SHARED_LIBS "Build shared libraries" ON)
 option (BUILD_TESTS "Build tests" ON)
+option (ENABLE_VERBOSE_MODE "Enable verbose mode" ON)
 
 option (ENABLE_OPENMP "Use OpenMP for multithreading" OFF)
-option (ENABLE_THREADS "Use pthread for multithreading" OFF)
+option (ENABLE_THREADS "Use threads for multithreading" OFF)
 option (WITH_COMBINED_THREADS "Merge thread library" OFF)
 
 option (ENABLE_FLOAT "single-precision" OFF)
 option (ENABLE_LONG_DOUBLE "long-double precision" OFF)
-option (ENABLE_QUAD_PRECISION "quadruple-precision" OFF)
+# Not available in Windows due to limitations on 128 bit datatype
+#option (ENABLE_QUAD_PRECISION "quadruple-precision" OFF)
 
 option (ENABLE_SSE "Compile with SSE instruction set support" OFF)
 option (ENABLE_SSE2 "Compile with SSE2 instruction set support" OFF)
 option (ENABLE_AVX "Compile with AVX instruction set support" OFF)
-option (ENABLE_AVX2 "Compile with AVX2 instruction set support" OFF)
+option (ENABLE_AVX2 "Compile with AVX2 instruction set support" ON)
+
+option (ENABLE_AMD_OPT "Enable AMD specific optimization" OFF)
+option (ENABLE_AMD_MPIFFT "Compile with AMD MPIFFT support" OFF)
+option (ENABLE_MPI "compile FFTW MPI library" OFF)
+
+option (ENABLE_AMD_TRANS "Enable amd optimized transpose" OFF)
+
+option (ENABLE_AMD_FAST_PLANNER "Enable for a faster planning time on AMD cpus" OFF)
+option (ENABLE_AMD_TOP_N_planner "Enable AMD Top N Planner for AMD cpus" OFF)
+
+if(ENABLE_VERBOSE_MODE)
+  if(CMAKE_C_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON" FORCE)
+  elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
+    add_compile_options(-v)
+  endif()
+endif()
+
+# Release mode setting for optimization flags in MSVC and Clang (in Debug mode optimization is disabled)
+if (CMAKE_C_COMPILER_ID MATCHES MSVC OR CMAKE_C_COMPILER_ID MATCHES Clang)
+  if(CMAKE_BUILD_TYPE STREQUAL "Release")
+    set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG" CACHE STRING "" FORCE)
+    set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG" CACHE STRING "" FORCE)
+  endif()
+endif()
+
+if (CMAKE_C_COMPILER_ID MATCHES MSVC OR CMAKE_C_COMPILER_ID MATCHES Clang)
+  if(CMAKE_BUILD_TYPE STREQUAL "Release")
+    set(CMAKE_C_COMPILER_FLAGS "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS_RELEASE} ${CMAKE_C_FLAGS}")
+  else()
+    set(CMAKE_C_COMPILER_FLAGS "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS_DEBUG} ${CMAKE_C_FLAGS}")
+  endif()
+endif()
+
+if(ENABLE_AMD_OPT)
+  add_definitions(-DAMD_OPT_ALL)
+endif()
+
+if(ENABLE_AMD_MPIFFT)
+  add_definitions(-DAMD_OPT_MPIFFT)
+endif()
+
+if(ENABLE_MPI)
+  if(ENABLE_QUAD_PRECISION)
+    message (FATAL_ERROR "quad precision is not supported in MPI")
+  endif()
+  find_package(MPI REQUIRED)
+  set(HAVE_MPI TRUE)
+endif()
+
+if(ENABLE_AMD_TRANS)
+  add_definitions(-DAMD_OPT_TRANS)
+endif()
 
 option (DISABLE_FORTRAN "Disable Fortran wrapper routines" OFF)
 
-include(GNUInstallDirs)
+if (CMAKE_C_COMPILER_ID MATCHES MSVC OR CMAKE_C_COMPILER_ID MATCHES Clang)
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")
+endif ()
+
+#print configurations
+message("---cmake configurations---")
+if (CMAKE_C_COMPILER_ID MATCHES Clang)
+  message("AMD_ARCH selected:${AMD_ARCH}")
+endif ()
+message(CMAKE_C_COMPILER_ID : ${CMAKE_C_COMPILER_ID})
+message(CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE})
+message(BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS})
+message(BUILD_TESTS : ${BUILD_TESTS})
+message(ENABLE_OPENMP : ${ENABLE_OPENMP})
+message(ENABLE_THREADS : ${ENABLE_THREADS})
+message(WITH_COMBINED_THREADS : ${WITH_COMBINED_THREADS})
+message(ENABLE_FLOAT : ${ENABLE_FLOAT})
+message(ENABLE_LONG_DOUBLE : ${ENABLE_LONG_DOUBLE})
+message(ENABLE_QUAD_PRECISION : ${ENABLE_QUAD_PRECISION})
+message(ENABLE_SSE : ${ENABLE_SSE})
+message(ENABLE_SSE2 : ${ENABLE_SSE2})
+message(ENABLE_AVX : ${ENABLE_AVX})
+message(ENABLE_AVX2 : ${ENABLE_AVX2})
+message(ENABLE_AMD_OPT : ${ENABLE_AMD_OPT})
+message(ENABLE_AMD_MPIFFT : ${ENABLE_AMD_MPIFFT})
+message(ENABLE_MPI : ${ENABLE_MPI})
+message(ENABLE_AMD_TRANS : ${ENABLE_AMD_TRANS})
+message(DISABLE_FORTRAN : ${DISABLE_FORTRAN})
+message(CMAKE_CXX_FLAGS : ${CMAKE_CXX_FLAGS})
+message(CMAKE_C_FLAGS : ${CMAKE_C_FLAGS})
+message(CMAKE_C_FLAGS_RELEASE : ${CMAKE_C_FLAGS_RELEASE})
+message(CMAKE_C_COMPILER_FLAGS : ${CMAKE_C_COMPILER_FLAGS})
+message(ENABLE_AMD_FAST_PLANNER : ${ENABLE_AMD_FAST_PLANNER})
+message(ENABLE_AMD_TOP_N_planner : ${ENABLE_AMD_TOP_N_planner})
 
+include(GNUInstallDirs)
 
 include (CheckIncludeFile)
 check_include_file (alloca.h         HAVE_ALLOCA_H)
@@ -59,7 +167,7 @@ if (HAVE_TIME_H AND HAVE_SYS_TIME_H)
   set (TIME_WITH_SYS_TIME TRUE)
 endif ()
 
-include (CheckPrototypeDefinition) 
+include (CheckPrototypeDefinition)
 check_prototype_definition (drand48 "double drand48 (void)" "0" stdlib.h HAVE_DECL_DRAND48)
 check_prototype_definition (srand48 "void srand48(long int seedval)" "0" stdlib.h HAVE_DECL_SRAND48)
 check_prototype_definition (cosl "long double cosl( long double arg )" "0" math.h HAVE_DECL_COSL)
@@ -106,6 +214,12 @@ check_type_size ("ptrdiff_t" SIZEOF_PTRDIFF_T)
 math (EXPR SIZEOF_INT_BITS "8 * ${SIZEOF_INT}")
 set (C_FFTW_R2R_KIND "C_INT${SIZEOF_INT_BITS}_T")
 
+if (MSVC)
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+endif(MSVC)
+
+add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.1")
+
 find_library (LIBM_LIBRARY NAMES m)
 if (LIBM_LIBRARY)
   set (HAVE_LIBM TRUE)
@@ -131,60 +245,127 @@ endif ()
 
 include (CheckCCompilerFlag)
 
-if (ENABLE_SSE)
-  foreach (FLAG "-msse" "/arch:SSE")
-    unset (HAVE_SSE CACHE)
-    check_c_compiler_flag (${FLAG} HAVE_SSE)
-    if (HAVE_SSE)
-      set (SSE_FLAG ${FLAG})
-      break()
-    endif ()
-  endforeach ()
-endif ()
-
-if (ENABLE_SSE2)
-  foreach (FLAG "-msse2" "/arch:SSE2")
-    unset (HAVE_SSE2 CACHE)
-    check_c_compiler_flag (${FLAG} HAVE_SSE2)
-    if (HAVE_SSE2)
-      set (SSE2_FLAG ${FLAG})
-      break()
-    endif ()
-  endforeach ()
-endif ()
-
-if (ENABLE_AVX)
-  foreach (FLAG "-mavx" "/arch:AVX")
-    unset (HAVE_AVX CACHE)
-    check_c_compiler_flag (${FLAG} HAVE_AVX)
-    if (HAVE_AVX)
-      set (AVX_FLAG ${FLAG})
-      break()
-    endif ()
-  endforeach ()
-endif ()
-
-if (ENABLE_AVX2)
-  foreach (FLAG "-mavx2" "/arch:AVX2")
-    unset (HAVE_AVX2 CACHE)
-    check_c_compiler_flag (${FLAG} HAVE_AVX2)
-    if (HAVE_AVX2)
-      set (AVX2_FLAG ${FLAG})
-      break()
-    endif ()
-  endforeach ()
-endif ()
-
-# AVX2 codelets require FMA support as well
-if (ENABLE_AVX2)
-  foreach (FLAG "-mfma" "/arch:FMA")
-    unset (HAVE_FMA CACHE)
-    check_c_compiler_flag (${FLAG} HAVE_FMA)
-    if (HAVE_FMA)
-      set (FMA_FLAG ${FLAG})
-      break()
-    endif ()
-  endforeach ()
+if(CMAKE_C_COMPILER_ID MATCHES MSVC)
+  if (ENABLE_AVX2)
+    set (AVX2_FLAG /arch:AVX2)
+    set(HAVE_AVX2 TRUE)
+    message(STATUS "Enabling AVX2 instructions")
+    #not necessary to enable FMA in MSVC as its enabled
+    #by default when AVX2 is enabled for x64
+    # set (FMA_FLAG /arch:FMA)
+    # add_definitions(/arch:FMA)
+    # set(HAVE_FMA TRUE)
+    # message(STATUS "Enabling FMA instructions")
+  endif()
+
+  if(ENABLE_AVX)
+    set (AVX_FLAG /arch:AVX)
+    set(HAVE_AVX TRUE)
+    message(STATUS "Enabling AVX instructions")
+  endif()
+
+  if(ENABLE_SSE2)
+    set (SSE2_FLAG /arch:SSE2)
+    set(HAVE_SSE2 TRUE)
+    message(STATUS "Enabling SSE2 instructions")
+  endif()
+
+  if(ENABLE_SSE)
+    set (SSE_FLAG /arch:SSE)
+    set(HAVE_SSE TRUE)
+    message(STATUS "Enabling SSE instructions")
+  endif()
+
+  add_compile_definitions(${AVX_FLAG} ${AVX2_FLAG} ${SSE_FLAG} ${SSE2_FLAG})
+
+elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
+
+  if (ENABLE_AVX2)
+    set (AVX2_FLAG -mavx2)
+    set(HAVE_AVX2 TRUE)
+    set (FMA_FLAG -mfma)
+    set(HAVE_FMA TRUE)
+    message(STATUS "Enabling AVX2 and FMA instructions")
+  endif()
+
+  if(ENABLE_AVX)
+    set (AVX_FLAG -mavx)
+    set(HAVE_AVX TRUE)
+    message(STATUS "Enabling AVX instructions")
+  endif()
+
+  if(ENABLE_SSE2)
+    set (SSE2_FLAG -msse2)
+    set(HAVE_SSE2 TRUE)
+    message(STATUS "Enabling SSE2 instructions")
+  endif()
+
+  if(ENABLE_SSE)
+    set (SSE_FLAG -msse)
+    set(HAVE_SSE TRUE)
+    message(STATUS "Enabling SSE instructions")
+  endif()
+
+  add_compile_options(${AVX_FLAG} ${AVX2_FLAG} ${SSE_FLAG} ${SSE2_FLAG} ${FMA_FLAG})
+
+else()
+
+  if (ENABLE_SSE)
+    foreach (FLAG "-msse" "/arch:SSE")
+      unset (HAVE_SSE CACHE)
+      check_c_compiler_flag (${FLAG} HAVE_SSE)
+      if (HAVE_SSE)
+        set (SSE_FLAG ${FLAG})
+        break()
+      endif ()
+    endforeach ()
+  endif ()
+
+  if (ENABLE_SSE2)
+    foreach (FLAG "-msse2" "/arch:SSE2")
+      unset (HAVE_SSE2 CACHE)
+      check_c_compiler_flag (${FLAG} HAVE_SSE2)
+      if (HAVE_SSE2)
+        set (SSE2_FLAG ${FLAG})
+        break()
+      endif ()
+    endforeach ()
+  endif ()
+
+  if (ENABLE_AVX)
+    foreach (FLAG "-mavx" "/arch:AVX")
+      unset (HAVE_AVX CACHE)
+      check_c_compiler_flag (${FLAG} HAVE_AVX)
+      if (HAVE_AVX)
+        set (AVX_FLAG ${FLAG})
+        break()
+      endif ()
+    endforeach ()
+  endif ()
+
+  if (ENABLE_AVX2)
+    foreach (FLAG "-mavx2" "/arch:AVX2")
+      unset (HAVE_AVX2 CACHE)
+      check_c_compiler_flag (${FLAG} HAVE_AVX2)
+      if (HAVE_AVX2)
+        set (AVX2_FLAG ${FLAG})
+        break()
+      endif ()
+    endforeach ()
+  endif ()
+
+  # AVX2 codelets require FMA support as well
+  if (ENABLE_AVX2)
+    foreach (FLAG "-mfma" "/arch:FMA")
+      unset (HAVE_FMA CACHE)
+      check_c_compiler_flag (${FLAG} HAVE_FMA)
+      if (HAVE_FMA)
+        set (FMA_FLAG ${FLAG})
+        break()
+      endif ()
+    endforeach ()
+  endif ()
+
 endif ()
 
 if (HAVE_SSE2 OR HAVE_AVX)
@@ -218,8 +399,14 @@ file(GLOB           fftw_rdft_simd_avx2_SOURCE      rdft/simd/avx2/*.c  rdft/sim
 file(GLOB           fftw_reodft_SOURCE              reodft/*.c          reodft/*.h)
 file(GLOB           fftw_simd_support_SOURCE        simd-support/*.c    simd-support/*.h)
 file(GLOB           fftw_libbench2_SOURCE           libbench2/*.c       libbench2/*.h)
+
+file(GLOB           fftw_mpi_SOURCE                 mpi/*.c             mpi/*.h)
+
 list (REMOVE_ITEM   fftw_libbench2_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/libbench2/useropt.c)
 
+list (REMOVE_ITEM   fftw_mpi_SOURCE ${CMAKE_SOURCE_DIR}/mpi/mpi-bench.c)
+list (REMOVE_ITEM   fftw_mpi_SOURCE ${CMAKE_SOURCE_DIR}/mpi/testsched.c)
+
 set(SOURCEFILES
     ${fftw_api_SOURCE}
     ${fftw_dft_SOURCE}
@@ -274,7 +461,11 @@ if (HAVE_AVX2)
   list (APPEND SOURCEFILES ${fftw_dft_simd_avx2_SOURCE} ${fftw_rdft_simd_avx2_SOURCE})
 endif ()
 
-set (FFTW_VERSION 3.3.7)
+if (HAVE_MPI)
+  list (APPEND SOURCEFILES ${fftw_mpi_SOURCE})
+endif ()
+
+set (FFTW_VERSION 3.3.8)
 
 set (PREC_SUFFIX)
 if (ENABLE_FLOAT)
@@ -294,6 +485,35 @@ if (ENABLE_QUAD_PRECISION)
   set (BENCHFFT_QUAD TRUE)
   set (PREC_SUFFIX q)
 endif ()
+
+if (ENABLE_AMD_FAST_PLANNER)
+  if (NOT (ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE))
+    set (AMD_OPT_FAST_PLANNER TRUE)
+  else ()
+    message(FATAL_ERROR "AMD_FAST_PLANNER cannot be set for Quad and Long Double precision")
+  endif ()
+endif ()
+
+if (ENABLE_AMD_TOP_N_planner)
+  # Check if amd-top-n-planner is enabled with Quad and Long Double precision
+  if (NOT (ENABLE_QUAD_PRECISION OR ENABLE_LONG_DOUBLE))
+  # Check if both amd-fast-planner and amd-top-n-planner are enabled together
+	if (NOT (ENABLE_AMD_FAST_PLANNER))
+		# Check if amd-top-n-planner is enabled with mpi, openmp or threads
+		if (NOT (ENABLE_MPI OR ENABLE_OPENMP OR ENABLE_THREADS))
+			set (AMD_OPT_TOP_N_PLANNER TRUE)
+		else ()
+			message(FATAL_ERROR "AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode")
+		endif ()
+	else ()
+    message(FATAL_ERROR "AMD_FAST_PLANNER and AMD_TOP_N_PLANNER can not be enabled together")
+	endif ()
+  else ()
+    message(FATAL_ERROR "AMD_TOP_N_PLANNER cannot be set for Quad and Long Double precision")
+  endif ()
+  
+  endif ()
+
 set (fftw3_lib fftw3${PREC_SUFFIX})
 
 configure_file (cmake.config.h.in config.h @ONLY)
@@ -301,63 +521,70 @@ include_directories (${CMAKE_CURRENT_BINARY_DIR})
 
 if (BUILD_SHARED_LIBS)
   add_definitions (-DFFTW_DLL)
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif ()
 
-add_library (${fftw3_lib} ${SOURCEFILES})
-target_include_directories (${fftw3_lib} INTERFACE $<INSTALL_INTERFACE:include>)
-if (MSVC)
-  target_compile_definitions (${fftw3_lib} PRIVATE /bigobj)
+if (Threads_FOUND)
+  if (WITH_COMBINED_THREADS)
+    set (lib_name ${fftw3_lib})
+    add_library (${fftw3_lib} ${SOURCEFILES})
+    target_include_directories (${lib_name} INTERFACE $<INSTALL_INTERFACE:include>)
+    target_link_libraries (${lib_name} ${CMAKE_THREAD_LIBS_INIT})
+  else ()
+    set (lib_name ${fftw3_lib}_threads)
+    add_library (${lib_name} ${fftw_threads_SOURCE} ${SOURCEFILES})
+    target_include_directories (${lib_name} INTERFACE $<INSTALL_INTERFACE:include>)
+    target_link_libraries (${lib_name} ${CMAKE_THREAD_LIBS_INIT})
+  endif ()
+elseif (OPENMP_FOUND)
+  set (lib_name ${fftw3_lib}_omp)
+  add_library (${lib_name} ${fftw_omp_SOURCE} ${SOURCEFILES})
+  target_include_directories (${lib_name} INTERFACE $<INSTALL_INTERFACE:include>)
+  target_link_libraries (${lib_name} ${CMAKE_THREAD_LIBS_INIT})
+  target_compile_options (${lib_name} PRIVATE ${OpenMP_C_FLAGS})
+else ()
+  set(lib_name ${fftw3_lib})
+  add_library (${lib_name} ${SOURCEFILES})
+  target_include_directories (${lib_name} INTERFACE $<INSTALL_INTERFACE:include>)
 endif ()
+
+target_include_directories(${lib_name} PRIVATE ${CMAKE_SOURCE_DIR}/api)
+
+if (CMAKE_C_COMPILER_ID MATCHES MSVC)
+  target_compile_definitions (${lib_name} PRIVATE /bigobj)
+endif ()
+
 if (HAVE_SSE)
-  target_compile_options (${fftw3_lib} PRIVATE ${SSE_FLAG})
+  target_compile_options (${lib_name} PRIVATE ${SSE_FLAG})
 endif ()
 if (HAVE_SSE2)
-  target_compile_options (${fftw3_lib} PRIVATE ${SSE2_FLAG})
+  target_compile_options (${lib_name} PRIVATE ${SSE2_FLAG})
 endif ()
 if (HAVE_AVX)
-  target_compile_options (${fftw3_lib} PRIVATE ${AVX_FLAG})
+  target_compile_options (${lib_name} PRIVATE ${AVX_FLAG})
 endif ()
 if (HAVE_AVX2)
-  target_compile_options (${fftw3_lib} PRIVATE ${AVX2_FLAG})
+  target_compile_options (${lib_name} PRIVATE ${AVX2_FLAG})
 endif ()
 if (HAVE_FMA)
-  target_compile_options (${fftw3_lib} PRIVATE ${FMA_FLAG})
+  target_compile_options (${lib_name} PRIVATE ${FMA_FLAG})
 endif ()
 if (HAVE_LIBM)
-  target_link_libraries (${fftw3_lib} m)
+  target_link_libraries (${lib_name} m)
 endif ()
-
-set (subtargets ${fftw3_lib})
-
-if (Threads_FOUND)
-  if (WITH_COMBINED_THREADS)
-    target_link_libraries (${fftw3_lib} ${CMAKE_THREAD_LIBS_INIT})
-  else ()
-    add_library (${fftw3_lib}_threads ${fftw_threads_SOURCE})
-    target_include_directories (${fftw3_lib}_threads INTERFACE $<INSTALL_INTERFACE:include>)
-    target_link_libraries (${fftw3_lib}_threads ${fftw3_lib})
-    target_link_libraries (${fftw3_lib}_threads ${CMAKE_THREAD_LIBS_INIT})
-    list (APPEND subtargets ${fftw3_lib}_threads)
-  endif ()
+if (HAVE_MPI)
+  target_include_directories (${lib_name} PRIVATE ${MPI_INCLUDE_PATH})
+  target_link_libraries (${lib_name} PRIVATE ${MPI_LIBRARIES})
 endif ()
 
-if (OPENMP_FOUND)
-  add_library (${fftw3_lib}_omp ${fftw_omp_SOURCE})
-  target_include_directories (${fftw3_lib}_omp INTERFACE $<INSTALL_INTERFACE:include>)
-  target_link_libraries (${fftw3_lib}_omp ${fftw3_lib})
-  target_link_libraries (${fftw3_lib}_omp ${CMAKE_THREAD_LIBS_INIT})
-  list (APPEND subtargets ${fftw3_lib}_omp)
-  target_compile_options (${fftw3_lib}_omp PRIVATE ${OpenMP_C_FLAGS})
-endif ()
+set_target_properties (${lib_name} PROPERTIES SOVERSION 3.5.7 VERSION 3)
 
-foreach(subtarget ${subtargets})
-  set_target_properties (${subtarget} PROPERTIES SOVERSION 3.5.7 VERSION 3)
-  install (TARGETS ${subtarget}
+install (TARGETS ${lib_name}
 	  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
 	  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
           ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
-endforeach ()
-install(TARGETS ${fftw3_lib}
+
+install(TARGETS ${lib_name}
           EXPORT FFTW3LibraryDepends
           RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
           LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -375,17 +602,20 @@ if (EXISTS ${CMAKE_SOURCE_DIR}/api/fftw3.f03.in)
   install (FILES ${CMAKE_CURRENT_BINARY_DIR}/fftw3.f03 DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif ()
 
-if (BUILD_TESTS)
-
-  add_executable (bench ${fftw_libbench2_SOURCE} tests/bench.c tests/hook.c tests/fftw-bench.c)
+add_library (libbench2 STATIC ${fftw_libbench2_SOURCE})
 
-  if (ENABLE_THREADS AND NOT WITH_COMBINED_THREADS)
-    target_link_libraries (bench ${fftw3_lib}_threads)
-  else ()
-    target_link_libraries (bench ${fftw3_lib})
+if (BUILD_TESTS)
+  add_executable (bench tests/bench.c tests/hook.c tests/fftw-bench.c)
+  target_link_libraries (bench libbench2)
+  target_link_libraries (bench ${lib_name})
+
+  if (HAVE_MPI)
+    add_executable (mpi-bench  mpi/mpi-bench.c tests/hook.c tests/fftw-bench.c)
+    target_include_directories (mpi-bench PRIVATE ${CMAKE_SOURCE_DIR}/api)
+    target_include_directories (mpi-bench PRIVATE ${MPI_INCLUDE_PATH})
+    target_link_libraries (mpi-bench libbench2 ${MPI_LIBRARIES} ${lib_name})
   endif ()
 
-
   enable_testing ()
 
   if (Threads_FOUND)
@@ -413,7 +643,7 @@ install (FILES
          COMPONENT Development)
 
 # cmake file
-set (FFTW3_LIBRARIES "FFTW3::${fftw3_lib}")
+set (FFTW3_LIBRARIES "FFTW3::${lib_name}")
 configure_file (FFTW3Config.cmake.in FFTW3${PREC_SUFFIX}Config.cmake @ONLY)
 configure_file (FFTW3ConfigVersion.cmake.in FFTW3${PREC_SUFFIX}ConfigVersion.cmake @ONLY)
 install (FILES
@@ -422,7 +652,7 @@ install (FILES
 	  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/fftw3${PREC_SUFFIX}
          COMPONENT Development)
 
-export (TARGETS ${fftw3_lib} NAMESPACE FFTW3:: FILE ${PROJECT_BINARY_DIR}/FFTW3LibraryDepends.cmake)
+export (TARGETS ${lib_name} NAMESPACE FFTW3:: FILE ${PROJECT_BINARY_DIR}/FFTW3LibraryDepends.cmake)
 install(EXPORT FFTW3LibraryDepends
         NAMESPACE FFTW3::
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/fftw3${PREC_SUFFIX}
diff --git a/api/fftw3.h b/api/fftw3.h
index 5c5ffa0d..7263cc61 100644
--- a/api/fftw3.h
+++ b/api/fftw3.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * The following statement of license applies *only* to this header file,
  * and *not* to the other files distributed with FFTW or derived therefrom:
@@ -94,6 +94,11 @@ extern "C"
 #  define FFTW_CDECL
 #endif
 
+/* to avoid symbol conflict with MSVS SDK for 'complex' (Windows only) */
+#if defined(_WIN32) || defined(_WIN64)
+#undef complex
+#endif
+
 enum fftw_r2r_kind_do_not_use_me {
      FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
      FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
@@ -464,7 +469,8 @@ FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
    for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
 #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) || (__clang__ && __clang_major__ >= 10)) \
  && !(defined(__ICC) || defined(__INTEL_COMPILER) || defined(__CUDACC__) || defined(__PGI)) \
- && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
+ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__)) \
+ && !(defined(_WIN32) || defined(_WIN64))
 #  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
 /* note: __float128 is a typedef, which is not supported with the _Complex
          keyword in gcc, so instead we use this ugly __attribute__ version.
diff --git a/cmake.config.h.in b/cmake.config.h.in
index 1f4c5055..52837f10 100644
--- a/cmake.config.h.in
+++ b/cmake.config.h.in
@@ -32,7 +32,7 @@
 /* #undef FC_DUMMY_MAIN_EQ_F77 */
 
 /* C compiler name and flags */
-#define FFTW_CC "@CMAKE_C_COMPILER@"
+#define FFTW_CC "@CMAKE_C_COMPILER_FLAGS@"
 
 /* Define to enable extra FFTW debugging code. */
 /* #undef FFTW_DEBUG */
@@ -197,7 +197,7 @@
 /* #undef HAVE_MIPS_ZBUS_TIMER */
 
 /* Define if you have the MPI library. */
-/* #undef HAVE_MPI */
+#cmakedefine HAVE_MPI
 
 /* Define to enable ARM NEON optimizations. */
 /* #undef HAVE_NEON */
diff --git a/libbench2/bench-user.h b/libbench2/bench-user.h
index 951de1df..17861356 100644
--- a/libbench2/bench-user.h
+++ b/libbench2/bench-user.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2001 Matteo Frigo
  * Copyright (c) 2001 Massachusetts Institute of Technology
- * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,7 +59,7 @@ typedef bench_real bench_complex[2];
 #define SINGLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(float))
 #undef LDOUBLE_PRECISION
 #ifdef BENCHFFT_LDOUBLE
-#define LDOUBLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(long double))
+#define LDOUBLE_PRECISION (sizeof(bench_real) == sizeof(long double))
 #else
 #define LDOUBLE_PRECISION 0
 #endif
diff --git a/libbench2/util.c b/libbench2/util.c
index aa15d282..0adadeff 100644
--- a/libbench2/util.c
+++ b/libbench2/util.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2000 Matteo Frigo
  * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -204,9 +205,11 @@ void *bench_malloc(size_t n)
         The bug seems to have been fixed as of glibc 2.3.1. */
      if (posix_memalign(&p, MIN_ALIGNMENT, n))
 	  p = (void*) 0;
-#elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
-     /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics */
-     p = (void *) _mm_malloc(n, MIN_ALIGNMENT);
+#elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC) || (defined(_WIN32) || defined(_WIN64))
+     /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics 
+	 Use "_mm_malloc" for aligned memory allocation on Windows which is supported with clang/VC++. */
+	 p = (void *) _mm_malloc(n, MIN_ALIGNMENT);
+	 
 #    undef real_free
 #    define real_free _mm_free
 #else
diff --git a/threads/openmp.c b/threads/openmp.c
index 1b384ece..ce93e594 100644
--- a/threads/openmp.c
+++ b/threads/openmp.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,7 +23,8 @@
 
 #include "threads/threads.h"
 
-#if !defined(_OPENMP)
+//_OPENMP is not defined for clang compiler
+#if !defined(_OPENMP) && !defined(__clang__)
 #error OpenMP enabled but not using an OpenMP compiler
 #endif
 
diff --git a/win/tests/README.txt b/win/tests/README.txt
new file mode 100644
index 00000000..243fbcb0
--- /dev/null
+++ b/win/tests/README.txt
@@ -0,0 +1,37 @@
+#FFTW check execution script
+
+check execution script covers check, smallcheck, bigcheck, paranoid-check and exhaustive-check
+
+##Requirements
+* Install latest version of python from python.org(preferably python 3.5 or greater)
+* Add python path and scripts path to the environment variable path
+* Install python PyYAML module using the following command
+  pip install PyYAML
+*Install the latest version of perl from the below giver URL
+  http://strawberryperl.com/
+*Add perl path to the environment variable path
+
+#Copy all the files present in win/tests directory to the directory where .exe is present
+#Open the command prompt and execute the python script and provide an argument(check_st or check_mt or smallcheck_mpi etc)
+#To know about the valid arguments , use help option(python fftw_check.py --help)
+  For example:
+    python fftw_check.py smallcheck
+    python fftw_check.py check
+    python fftw_check.py bigcheck_mt
+    python fftw_check.py smallcheck_st
+
+#Output can be seen on the command prompt
+
+
+#FFTW benchmark execution script
+For help on executing .bat file run below command from command prompt
+fftw_st_benchmark.bat --help
+fftw_mt_benchmark.bat --help
+fftw_mpi_benchmark.bat --help
+
+Sample commands:
+fftw_st_benchmark.bat --verify bench.exe
+fftw_st_benchmark.bat --verify-only bench.exe
+fftw_mt_benchmark.bat --verify bench.exe
+fftw_mt_benchmark.bat --verify-only bench.exe
+fftw_mpi_benchmark.bat --verify-only mpi-bench.exe
diff --git a/win/tests/check.pl b/win/tests/check.pl
new file mode 100644
index 00000000..7ff6a2ef
--- /dev/null
+++ b/win/tests/check.pl
@@ -0,0 +1,308 @@
+#! /usr/bin/perl -w
+
+$program = "bench.exe";
+$default_options = "";
+$verbose = 0;
+$paranoid = 0;
+$exhaustive = 0;
+$patient = 0;
+$estimate = 0;
+$wisdom = 0;
+$nthreads = 1;
+$rounds = 0;
+$maxsize = 60000;
+$maxcount = 100;
+$do_0d = 0;
+$do_1d = 0;
+$do_2d = 0;
+$do_random = 0;
+$keepgoing = 0;
+$flushcount = 42;
+
+$mpi = 0;
+$mpi_transposed_in = 0;
+$mpi_transposed_out = 0;
+
+sub make_options {
+    my $options = $default_options;
+    $options = "--verify-rounds=$rounds $options" if $rounds;
+    $options = "--verbose=$verbose $options" if $verbose;
+    $options = "-o paranoid $options" if $paranoid;
+    $options = "-o exhaustive $options" if $exhaustive;
+    $options = "-o patient $options" if $patient;
+    $options = "-o estimate $options" if $estimate;
+    $options = "-o wisdom $options" if $wisdom;
+    $options = "-o nthreads=$nthreads $options" if ($nthreads > 1);
+    $options = "-obflag=30 $options" if $mpi_transposed_in;
+    $options = "-obflag=31 $options" if $mpi_transposed_out;
+    return $options;
+}
+
+@list_of_problems = ();
+
+sub flush_problems {
+    my $options = shift;
+    my $problist = "";
+
+    if ($#list_of_problems >= 0) {
+    for (@list_of_problems) {
+        $problist = "$problist --verify $_";
+    }
+    print "Executing \"$program $options $problist\"\n"
+        if $verbose;
+
+    system("$program $options $problist");
+    $exit_value  = $? >> 8;
+    $signal_num  = $? & 127;
+    $dumped_core = $? & 128;
+
+    if ($signal_num == 1) {
+        print "hangup\n";
+        exit 0;
+    }
+    if ($signal_num == 2) {
+        print "interrupted\n";
+        exit 0;
+    }
+    if ($signal_num == 9) {
+        print "killed\n";
+        exit 0;
+    }
+
+    if ($exit_value != 0 || $dumped_core || $signal_num) {
+        print "FAILED $program: $problist\n";
+        if ($signal_num) { print "received signal $signal_num\n"; }
+        exit 1 unless $keepgoing;
+    }
+    @list_of_problems = ();
+    }
+}
+
+sub do_problem {
+    my $problem = shift;
+    my $doablep = shift;
+    my $options = &make_options;
+
+    if ($problem =~ /\// && $problem =~ /r/
+    && ($problem =~ /i.*x/
+        || $problem =~ /v/ || $problem =~ /\*/)) {
+    return; # cannot do real split inplace-multidimensional or vector
+    }
+
+    # in --mpi mode, restrict to problems supported by MPI code
+    if ($mpi) {
+    if ($problem =~ /\//) { return; } # no split
+    if ($problem =~ /\*/) { return; } # no non-contiguous vectors
+    if ($problem =~ /r/ && $problem !~ /x/) { return; } # no 1d r2c
+    if ($problem =~ /k/ && $problem !~ /x/) { return; } # no 1d r2r
+    if ($mpi_transposed_in || $problem =~ /\[/) {
+        if ($problem !~ /x/) { return; } # no 1d transposed_in
+        if ($problem =~ /r/ && $problem !~ /b/) { return; } # only c2r
+    }
+    if ($mpi_transposed_out || $problem =~ /\]/) {
+        if ($problem !~ /x/) { return; } # no 1d transposed_out
+        if ($problem =~ /r/ && $problem =~ /b/) { return; } # only r2c
+    }
+    }
+
+    # size-1 redft00 is not defined/doable
+    return if ($problem =~ /[^0-9]1e00/);
+
+    if ($doablep) {
+    @list_of_problems = ($problem, @list_of_problems);
+    &flush_problems($options) if ($#list_of_problems > $flushcount);
+    } else {
+    print "Executing \"$program $options --can-do $problem\"\n"
+        if $verbose;
+    $result=`$program $options --can-do $problem`;
+    if ($result ne "#f\n" && $result ne "#f\r\n") {
+        print "FAILED $program: $problem is not undoable\n";
+        exit 1 unless $keepgoing;
+    }
+    }
+}
+
+# given geometry, try both directions and in place/out of place
+sub do_geometry {
+    my $geom = shift;
+    my $doablep = shift;
+    do_problem("if$geom", $doablep);
+    do_problem("of$geom", $doablep);
+    do_problem("ib$geom", $doablep);
+    do_problem("ob$geom", $doablep);
+    do_problem("//if$geom", $doablep);
+    do_problem("//of$geom", $doablep);
+    do_problem("//ib$geom", $doablep);
+    do_problem("//ob$geom", $doablep);
+}
+
+# given size, try all transform kinds (complex, real, etc.)
+sub do_size {
+    my $size = shift;
+    my $doablep = shift;
+    do_geometry("c$size", $doablep);
+    do_geometry("r$size", $doablep);
+}
+
+sub small_0d {
+    for ($i = 0; $i <= 16; ++$i) {
+    for ($j = 0; $j <= 16; ++$j) {
+        for ($vl = 1; $vl <= 5; ++$vl) {
+        my $ivl = $i * $vl;
+        my $jvl = $j * $vl;
+        do_problem("o1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+        do_problem("i1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+        do_problem("ok1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+        do_problem("ik1v${i}:${vl}:${jvl}x${j}:${ivl}:${vl}x${vl}:1:1", 1);
+        }
+    }
+    }
+}
+
+sub small_1d {
+    do_size (0, 0);
+    for ($i = 1; $i <= 100; ++$i) {
+    do_size ($i, 1);
+    }
+    do_size (128, 1);
+    do_size (256, 1);
+    do_size (512, 1);
+    do_size (1024, 1);
+    do_size (2048, 1);
+    do_size (4096, 1);
+}
+
+sub small_2d {
+    do_size ("0x0", 0);
+    for ($i = 1; $i <= 100; ++$i) {
+    my $ub = 900/$i;
+    $ub = 100 if $ub > 100;
+    for ($j = 1; $j <= $ub; ++$j) {
+        do_size ("${i}x${j}", 1);
+    }
+    }
+}
+
+sub rand_small_factors {
+    my $l = shift;
+    my $n = 1;
+    my $maxfactor = 13;
+    my $f = int(rand($maxfactor) + 1);
+    while ($n * $f < $l) {
+    $n *= $f;
+    $f = int(rand($maxfactor) + 1);
+    };
+    return $n;
+}
+
+# way too complicated...
+sub one_random_test {
+    my $q = int(2 + rand($maxsize));
+    my $rnk = int(1 + rand(4));
+    my $vtype = int(rand(3));
+    my $g = int(2 + exp(log($q) / ($rnk + ($vtype > 0))));
+    my $first = 1;
+    my $sz = "";
+    my $is_r2r = shift;
+    my @r2r_kinds = ("f", "b", "h",
+             "e00", "e01", "e10", "e11", "o00", "o01", "o10", "o11");
+
+    while ($q > 1 && $rnk > 0) {
+    my $r = rand_small_factors(int(rand($g) + 10));
+    if ($r > 1) {
+        $sz = "${sz}x" if (!$first);
+        $first = 0;
+        $sz = "${sz}${r}";
+        if ($is_r2r) {
+        my $k = $r2r_kinds[int(1 + rand($#r2r_kinds))];
+        $sz = "${sz}${k}";
+        }
+        $q = int($q / $r);
+        if ($g > $q) { $g = $q; }
+        --$rnk;
+    }
+    }
+    if ($vtype > 0 && $g > 1) {
+    my $v = int(1 + rand($g));
+    $sz = "${sz}*${v}" if ($vtype == 1);
+    $sz = "${sz}v${v}" if ($vtype == 2);
+    }
+    if ($mpi) {
+    my $stype = int(rand(3));
+    $sz = "]${sz}" if ($stype == 1);
+    $sz = "[${sz}" if ($stype == 2);
+    }
+    $sz = "d$sz" if (int(rand(3)) == 0);
+    if ($is_r2r) {
+    do_problem("ik$sz", 1);
+    do_problem("ok$sz", 1);
+    }
+    else {
+    do_size($sz, 1);
+    }
+}
+
+sub random_tests {
+    my $i;
+    for ($i = 0; $i < $maxcount; ++$i) {
+    &one_random_test(0);
+    &one_random_test(1);
+    }
+}
+
+sub parse_arguments (@)
+{
+    local (@arglist) = @_;
+
+    while (@arglist)
+    {
+    if ($arglist[0] eq '-v') { ++$verbose; }
+    elsif ($arglist[0] eq '--verbose') { ++$verbose; }
+    elsif ($arglist[0] eq '-p') { ++$paranoid; }
+    elsif ($arglist[0] eq '--paranoid') { ++$paranoid; }
+    elsif ($arglist[0] eq '--exhaustive') { ++$exhaustive; }
+    elsif ($arglist[0] eq '--patient') { ++$patient; }
+    elsif ($arglist[0] eq '--estimate') { ++$estimate; }
+    elsif ($arglist[0] eq '--wisdom') { ++$wisdom; }
+    elsif ($arglist[0] =~ /^--nthreads=(.+)$/) { $nthreads = $1; }
+    elsif ($arglist[0] eq '-k') { ++$keepgoing; }
+    elsif ($arglist[0] eq '--keep-going') { ++$keepgoing; }
+    elsif ($arglist[0] =~ /^--verify-rounds=(.+)$/) { $rounds = $1; }
+    elsif ($arglist[0] =~ /^--count=(.+)$/) { $maxcount = $1; }
+    elsif ($arglist[0] =~ /^-c=(.+)$/) { $maxcount = $1; }
+    elsif ($arglist[0] =~ /^--flushcount=(.+)$/) { $flushcount = $1; }
+    elsif ($arglist[0] =~ /^--maxsize=(.+)$/) { $maxsize = $1; }
+
+    elsif ($arglist[0] eq '--mpi') { ++$mpi; }
+    elsif ($arglist[0] eq '--mpi-transposed-in') {
+        ++$mpi; ++$mpi_transposed_in; }
+    elsif ($arglist[0] eq '--mpi-transposed-out') {
+        ++$mpi; ++$mpi_transposed_out; }
+
+    elsif ($arglist[0] eq '-0d') { ++$do_0d; }
+    elsif ($arglist[0] eq '-1d') { ++$do_1d; }
+    elsif ($arglist[0] eq '-2d') { ++$do_2d; }
+    elsif ($arglist[0] eq '-r') { ++$do_random; }
+    elsif ($arglist[0] eq '--random') { ++$do_random; }
+    elsif ($arglist[0] eq '-a') {
+        ++$do_0d; ++$do_1d; ++$do_2d; ++$do_random;
+    }
+
+    else { $program=$arglist[0]; }
+    shift (@arglist);
+    }
+}
+
+# MAIN PROGRAM:
+
+&parse_arguments (@ARGV);
+
+&random_tests if $do_random;
+&small_0d if $do_0d;
+&small_1d if $do_1d;
+&small_2d if $do_2d;
+
+{
+    my $options = &make_options;
+    &flush_problems($options);
+}
diff --git a/win/tests/commands.yaml b/win/tests/commands.yaml
new file mode 100644
index 00000000..8a87d514
--- /dev/null
+++ b/win/tests/commands.yaml
@@ -0,0 +1,26 @@
+check_st: ['perl -w check.pl -r -c=30 -v bench.exe']
+
+check_mt: ['perl -w check.pl -r -c=30 -v --nthreads=2 bench.exe']
+
+check_mpi: ['perl -w check.pl --verbose --random --maxsize=10000 -c=10 CHECK_PL_OPTS --mpi "mpiexec -np 1 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=10 CHECK_PL_OPTS --mpi "mpiexec -np 2 mpi-bench.exe"',
+        'perl -w check.pl --verbose --random --maxsize=10000 -c=10 CHECK_PL_OPTS --mpi "mpiexec -np 3 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=10 CHECK_PL_OPTS --mpi "mpiexec -np 4 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=10 CHECK_PL_OPTS --mpi --nthreads=2 "mpiexec -np 3 mpi-bench.exe"']
+
+smallcheck_st: ['perl -w check.pl -r -c=1 -v bench.exe', 'perl -w check.pl -r --estimate -c=5 -v bench.exe']
+
+smallcheck_mt: ['perl -w check.pl -r --estimate -c=2 -v --nthreads=2 bench.exe']
+
+smallcheck_mpi: ['perl -w check.pl --verbose --random --maxsize=10000 -c=2 CHECK_PL_OPTS --mpi "mpiexec -np 1 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=2 CHECK_PL_OPTS --mpi "mpiexec -np 2 mpi-bench.exe"',
+             'perl -w check.pl --verbose --random --maxsize=10000 -c=2 CHECK_PL_OPTS --mpi "mpiexec -np 3 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=2 CHECK_PL_OPTS --mpi "mpiexec -np 4 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=2 CHECK_PL_OPTS --mpi --nthreads=2 "mpiexec -np 3 mpi-bench.exe"']
+
+bigcheck_st: ['perl -w check.pl -a -v bench.exe']
+
+bigcheck_mt: ['perl -w check.pl -a -v --nthreads=2 bench.exe', 'perl -w check.pl -a -v --nthreads=3 bench.exe', 'perl -w check.pl -a -v --nthreads=10 bench.exe']
+
+bigcheck_mpi: ['perl -w check.pl --verbose --random --maxsize=60000 -c=100 CHECK_PL_OPTS --mpi "mpiexec -np 1 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=60000 -c=100 CHECK_PL_OPTS --mpi "mpiexec -np 2 mpi-bench.exe"',
+           'perl -w check.pl --verbose --random --maxsize=60000 -c=100 CHECK_PL_OPTS --mpi "mpiexec -np 3 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=60000 -c=100 CHECK_PL_OPTS --mpi "mpiexec -np 4 mpi-bench.exe"', 'perl -w check.pl --verbose --random --maxsize=10000 -c=100 CHECK_PL_OPTS --mpi --nthreads=2 "mpiexec -np 3 mpi-bench.exe"']
+
+paranoid-check: ['perl -w check.pl -a --patient --paranoid bench.exe','perl -w check.pl -a --patient --nthreads=10 --paranoid bench.exe', 'perl -w check.pl -a --patient --nthreads=7 --paranoid bench.exe',
+'perl -w check.pl -a --patient --nthreads=3 --paranoid bench.exe', 'perl -w check.pl -a --patient --nthreads=2 --paranoid bench.exe']
+
+exhaustive-check: ['perl -w check.pl -a --exhaustive --paranoid bench.exe','perl -w check.pl -a --exhaustive --nthreads=10 --paranoid bench.exe','perl -w check.pl -a --exhaustive --nthreads=7 --paranoid bench.exe',
+'perl -w check.pl -a --exhaustive --nthreads=3 --paranoid bench.exe', 'perl -w check.pl -a --exhaustive --nthreads=2 --paranoid bench.exe']
diff --git a/win/tests/fftw_check.py b/win/tests/fftw_check.py
new file mode 100644
index 00000000..da1cc3d1
--- /dev/null
+++ b/win/tests/fftw_check.py
@@ -0,0 +1,166 @@
+import os
+import sys
+import subprocess
+import yaml
+
+
+class FftwCheck:
+
+    @staticmethod
+    def check_execution():
+        """
+        :Method Name: check_execution
+        :Description: reads the command from input file and executes if the respective executables are present
+        :parameter  : None
+        :return     : None
+        """
+        try:
+            with open(r'commands.yaml') as file:
+                input_file = yaml.safe_load(file) \
+
+                try:
+                    if (sys.argv[1] == '') or (sys.argv[1] == "--h") or (sys.argv[1] == "--help"):
+                        print("Below options are available \n\nUse '*_mt' options for Multithreaded"
+                              " build validation\nUse '*_mpi'"
+                              "options for MPI build validation\n")
+                        print("usage: python fftw_check.py ",
+                              end='[check | smallcheck | bigcheck | ')
+                        for var in input_file.keys():
+                            print(var, end=' | ')
+                        print(' --h | --help]')
+                        sys.exit()
+                except IndexError:
+                    print("Below options are available \n\n"
+                          "Use '*_mt' options for Multithreaded build validation\nUse "
+                          "'*_mpi' options for MPI build validation\n")
+                    print("usage: python fftw_check.py ", end='[check | smallcheck | bigcheck | ')
+                    for var in input_file.keys():
+                        print(var, end=' | ')
+                    print(' --h | --help]')
+                    sys.exit()
+
+                try:
+                    mpi_run = False
+                    if os.path.exists("bench.exe"):
+                        if os.path.exists("mpi-bench.exe"):
+                            mpi_run = True
+                            numcheck = 10 if sys.argv[1] == "check" \
+                                else 100 if sys.argv[1] == "bigcheck" else 2
+                            check_list = ['_st', '_mt', '_mpi'] \
+                                if mpi_run else ['_st', '_mt']
+                        else:
+                            check_list = ['_st', '_mt'] \
+                                if mpi_run else ['_st', '_mt']
+                    elif not 'mpi' in sys.argv[1]:
+                        print("bench.exe is not present ")
+                        sys.exit()
+                    elif os.path.exists("mpi-bench.exe"):
+                        mpi_run = True
+                        numcheck = 10 if sys.argv[1] == "check" \
+                            else 100 if sys.argv[1] == "bigcheck" else 2
+                    else:
+                        print("bench.exe is not present ")
+                        sys.exit()
+
+                    temp = 'basic' if sys.argv[1] == "check" \
+                        else 'big' if sys.argv[1] == "bigcheck" else 'a few'
+                    if (sys.argv[1] == "check") or (sys.argv[1] == "smallcheck") \
+                            or (sys.argv[1] == "bigcheck"):
+
+                        for command in range(len(check_list)):
+                            print("=" * 50, sys.argv[1] + check_list[command],
+                                  "execution", "=" * 50)
+                            for i in range(len(input_file[sys.argv[1] + check_list[command]])):
+                                process = subprocess.Popen(input_file[sys.argv[1]
+                                                                      + check_list[command]][i],
+                                                           bufsize=1,
+                                                           universal_newlines=True,
+                                                           stdout=subprocess.PIPE,
+                                                           stderr=subprocess.STDOUT)
+                                for line in iter(process.stdout.readline, ''):
+                                    print(line[:-1])
+
+                                    sys.stdout.flush()
+                                process.wait()
+                                # errcode = process.returncode
+
+                                if check_list[command] == '_mpi':
+                                    if '--nthreads' in input_file[sys.argv[1]
+                                                                  + check_list[command]][i]:
+                                        print("-" * 80, "\n", "\t" * 2,
+                                              "MPI FFTW threaded transforms passed "
+                                              "{} tests!".format(numcheck),
+                                              "\n", "-" * 80)
+                                    else:
+                                        print("-" * 80, "\n", "\t" * 2,
+                                              "MPI FFTW transforms passed "
+                                              "{} tests, {} CPU".format(numcheck, i + 1),
+                                              "\n", "-" * 80)
+
+                            if check_list[command] == '_st':
+                                print("*" * 80, "\n", "\t" * 2, "FFTW "
+                                                                "transforms passed %s "
+                                                                "tests!\n" % temp, "*" * 80)
+                            elif check_list[command] == '_mt':
+                                print("*" * 80, "\n", "\t" * 2, "FFTW "
+                                                                "threaded transforms passed %s "
+                                                                "tests!\n" % temp,
+                                      "*" * 80)
+                    else:
+                        if 'mpi' in sys.argv[1]:
+                            if mpi_run:
+                                pass
+                            else:
+                                print('mpi-bench.exe is not present '
+                                      ', please select other valid arguments')
+                                sys.exit()
+
+                        print("=" * 50, sys.argv[1], "execution", "=" * 50)
+                        for i in range(len(input_file[sys.argv[1]])):
+                            process = subprocess.Popen(input_file[sys.argv[1]][i], bufsize=1,
+                                                       universal_newlines=True,
+                                                       stdout=subprocess.PIPE,
+                                                       stderr=subprocess.STDOUT)
+
+                            for line in iter(process.stdout.readline, ''):
+                                print(line[:-1])
+                                sys.stdout.flush()
+                            process.wait()
+                            # errcode = process.returncode
+
+                            if '_mpi' in sys.argv[1]:
+                                if "--nthreads" in input_file[sys.argv[1]][i]:
+                                    print("-" * 80, "\n", "\t" * 2,
+                                          "MPI FFTW threaded transforms "
+                                          "passed {} tests!".format(numcheck), "\n",
+                                          "-" * 80)
+                                else:
+                                    print("-" * 80, "\n", "\t" * 2,
+                                          "MPI FFTW transforms passed "
+                                          "{} tests, {} CPU".format(numcheck, i + 1), "\n",
+                                          "-" * 80)
+
+                        temp = 'a few' if ("smallcheck" in sys.argv[1]) else 'big' if (
+                                "bigcheck" in sys.argv[1]) else 'basic'
+                        if '_st' in sys.argv[1]:
+                            print("*" * 80, "\n", "\t" * 2, "FFTW "
+                                                            "transforms passed %s "
+                                                            "tests!\n" % temp, "*" * 80)
+                        elif '_mt' in sys.argv[1]:
+                            print("*" * 80, "\n", "\t" * 2, "FFTW "
+                                                            "threaded transforms passed %s "
+                                                            "tests!\n" % temp,
+                                  "*" * 80)
+                except UnboundLocalError as error:
+                    print(error)
+
+        except KeyError:
+            print("\nPlease enter a valid argument")
+            print("usage: python fftw_check.py ", end='[check | smallcheck | bigcheck |')
+            for var in input_file.keys():
+                print(var, end=' | ')
+            print(' --h | --help]')
+
+
+if __name__ == "__main__":
+    FftwCheck.check_execution()
diff --git a/win/tests/fftw_mpi_benchmark.bat b/win/tests/fftw_mpi_benchmark.bat
new file mode 100644
index 00000000..fb942910
--- /dev/null
+++ b/win/tests/fftw_mpi_benchmark.bat
@@ -0,0 +1,301 @@
+@echo OFF
+setlocal EnableDelayedExpansion
+set NL=^
+
+
+REM two empty line required
+REM FFTW-MPI Lib Batch Test
+REM This shell script benchmarks a program on a series of test problems
+REM
+REM #set -ix
+set me=%0
+
+
+if "%1" == "" (
+   echo Try '!me! --help' for more information.
+   exit /b )
+
+set usage_options=--h --help --verify --verify-only --verify-tolerance --accuracy --accuracy-rounds --k --keep-going --time-min --max --maxnd -o --user-option
+
+set usage_txt=   Usage: !me! [OPTION] program !NL! ^
+  -h, --help         print this help, then exit !NL! ^
+  --verify           verify each transform before timing !NL! ^
+  --verify-only      verify each transform but do not time !NL! ^
+  --verify-tolerance set error tolerance for --verify !NL! ^
+  --accuracy         run accuracy test !NL! ^
+  --accuracy-rounds  set number of rounds for --accuracy !NL! ^
+  -k, --keep-going   continue after verification error !NL! ^
+  --time-min:X       set minimum measurement time !NL! ^
+  --maxn:N           set maximum allowed problem size !NL! ^
+  --maxnd:N          set maximum allowed multi-dimensional problem size !NL! ^
+  -o:X, --user-option:X    undocumented !NL!
+
+set verify=no
+set accuracy=no
+set useropt=""
+set speed=yes
+set maxnd=1073741824 #1 billion size
+set maxn=67777216
+set time_min=""
+set keep_going=no
+set tolerance=""
+set rounds=""
+set arounds=""
+
+:GetLastArg
+set /a argCount+=1
+set program=%~1
+shift
+if not "%~1"=="" goto GetLastArg
+
+set delim_char=":"
+set arg_cnt=0
+
+for %%i in (%*) do (
+   set /a arg_cnt = !arg_cnt!+1
+   set optargset=yes
+   set arg_val=%%~i
+
+   if !arg_val! == !program! goto NEXT
+   echo %%~i|find %delim_char% >nul
+   if errorlevel 1 set optargset=no
+   for /F "tokens=1,2 delims=: " %%a in ("%%i") do (
+      set vararg=%%~a
+      set optarg=%%~b
+      )
+   REM echo vararg=!vararg!
+   if !optargset! == yes (
+       echo !usage_options!|find "!vararg!" >nul
+       if errorlevel 1 (
+          echo Invalid option !vararg!
+          exit /b )
+
+       if !vararg! == --verify-rounds (
+       set rounds=!optarg!
+       )
+       if !vararg! == --verify-tolerance (
+       set tolerance=!optarg!
+       )
+       if !vararg! == --keep-going (
+       set keep_going=!optarg!
+       )
+       if !vararg! == -k (
+       set keep_going=!optarg!
+       )
+       if !vararg! == --accuracy-rounds (
+       set arounds=!optarg!
+       )
+       if !vararg! == --time-min (
+       set time_min=!optarg!
+       )
+       if !vararg! == --maxn (
+       set maxn=!optarg!
+       )
+       if !vararg! == --maxnd (
+       set maxnd=!optarg!
+       )
+       if !vararg! == --user-option (
+       set useropt="%useropt% --user-option=!optarg!"
+       )
+    ) else (
+       if !vararg! == --verify-rounds (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --verify-tolerance (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --accuracy-rounds (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --time-min (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --maxn (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --maxnd (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --keep-going (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == -k (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --user-option (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !arg_val! == --help (
+       echo !usage_txt!
+       exit /b
+       )
+       if !arg_val! == --h (
+       echo !usage_txt!
+       exit /b
+       )
+       if !arg_val! == --keep-going (
+       set keep-going=yes
+       )
+       if !arg_val! == --k (
+       set keep-going=yes
+       )
+       if !arg_val! == --accuracy (
+       set accuracy=yes
+       set speed=no
+       )
+       if !arg_val! == --verify (
+       set verify=yes
+       )
+       if !arg_val! == --verify-only (
+       set verify=yes
+       set speed=no
+       )
+       echo !usage_options!|find "!arg_val!" >nul
+       if errorlevel 1 (
+          echo Invalid option !vararg!
+          exit /b )
+   )
+)
+
+:NEXT
+if !arg_cnt! == 1 (
+   if !arg_val! == --help echo !usage_txt!
+   if !arg_val! == -h echo !usage_txt!
+   if !arg_val! == !program! goto :NEXT_1
+   echo Try '!me! --help' for more information.
+   exit /b )
+
+:NEXT_1
+REM MPIFFT 1D Sizes
+set SIZES_1D=524288 1048576 2097152 4194304 8388608 16777216 33554432 67108864 390625 823543 4782969 7962624 1000000 8000000 16000000 32000000 64000000
+
+REM MPIFFT 2D Sizes
+set SIZES_2D=2048x2048 2048x4096 4096x4096 8192x8192 8192x16384 729x2187 625x3125 6561x15625 27000x27000 1000x10000 10000x10000
+
+REM MPIFFT 3D Sizes
+set SIZES_3D=256x256x256 512x512x512 512x512x1024 1024x1024x512 1024x1024x1024 243x243x243 343x343x343 243x243x729 625x625x625 200x180x216 300x1000x100 1000x1000x1000
+
+set DIRECTIONS=f
+set ALL_SIZES=!SIZES_1D! !SIZES_2D! !SIZES_3D!
+REM set ALL_SIZES=2 4x4 1960x1960 2048x2048 4x4x4 16x1024x64
+
+set PLACE=i
+set REALITY=c
+set CORES=16 32 64 128
+
+if not exist "!program!" (
+   echo !program! file does not exist
+   exit /b
+)
+
+REM test "$speed" = "no" || test -n "$time_min" || time_min=`$program --print-time-min`
+if !speed! == no (
+for /f %%i in ('%program% --print-time-min') do set time_min=%%~i )
+
+if !time_min! == "" (
+   for /f %%i in ('!program! --print-time-min') do set time_min=%%~i
+)
+
+REM precision=`$program --print-precision`
+for /f %%i in ('!program! --print-precision') do set precision=%%~i
+
+REM shorten the name
+if !precision! == single set precision=s
+if !precision! == double set precision=d
+
+REM name=`$program --info name`
+for /f %%i in ('!program! --info name') do set name=%%~i
+
+if not %tolerance%=="" (
+set tolerance=--verify-tolerance %tolerance%
+set vflags=!vflags! !tolerance!
+)
+
+if not !rounds! == "" (
+set rounds=--verify-rounds !rounds!
+set vflags=!vflags! !rounds!
+)
+
+if not !arounds! == "" (
+   set arounds=--accuracy-rounds !arounds!
+   set vflags=!vflags! !arounds!
+   set aflags=!arounds!
+)
+
+for %%r in (%CORES%) do (
+set rank_var=%%~r
+for %%p in (%PLACE%) do (
+   set plc_var=%%~p
+   for %%r in (%REALITY%) do (
+      set r_val=%%~r
+      for %%s in (%ALL_SIZES%) do (
+         REM :NEXT_LOOP
+         set size_var=%%~s
+         set size_val=%%~s
+         set oned=no
+         REM #for core in $CORES; do
+         set curmaxn=%maxnd%
+         echo !size_var!|find "x">nul
+         if errorlevel 1 set curmaxn=%maxn%
+
+         echo !size_var!|find "x">nul
+         if errorlevel 1 set oned=yes
+
+         set size_val=!size_val:x=*!
+         set /a size_val=!size_val!
+
+         if !size_val! LSS !curmaxn! (
+             for %%d in (%DIRECTIONS%) do (
+                set d_val=%%~d
+                set problem=!plc_var!!r_val!!d_val!!size_var!
+                REM #doable=`$program $useropt --can-do $problem`
+                REM set doable="#t"
+                set print=yes
+                set /A ITERS=3
+                for /l %%v in (1,1,!ITERS!) do (
+                   set acc=""
+                   if !verify! == yes (
+                      for /f "tokens=*" %%i in ('!program! !vflags! --verbose --verify !problem!') do set acc=%%i
+                      REM echo !acc!
+                      if !acc! == 'FAILED FAILED FAILED' if not !keep_going! == yes exit /b
+                    ) else ( set acc="" )
+                   if !speed! == yes (
+                      if !useropt! == "" set useropt=
+                      if !rank_var! == 16 (
+                         for /f "tokens=*" %%j in ('mpiexec --map-by L3cache --bind-to core -np !rank_var! !program! !useropt! --report-benchmark --time-min !time_min! -opatient -r500 --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      )
+                      if !rank_var! == 32 (
+                         for /f "tokens=*" %%j in ('mpiexec --map-by L3cache --bind-to core -np !rank_var!  !program! !useropt! --report-benchmark --time-min !time_min! -opatient -r500 --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      ) else (
+                      if !rank_var! == 64 (
+                         for /f "tokens=*" %%j in ('mpiexec --map-by core --bind-to core -np !rank_var! !program! !useropt! --report-benchmark --time-min !time_min! -opatient -r500 --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      )
+                      )
+                    ) else ( set time_val="" )
+                   if !accuracy! == yes (
+                      if !oned! == yes (
+                        for /f "tokens=*" %%k in ('!program! !useropt! !aflags! --accuracy !problem!') do set acc=%%k
+                    ) else ( set print="no" )
+                   )
+                   if !print! == yes (
+                     echo !rank_var! !name! !precision!!r_val!!plc_var!!d_val! !size_var! !time_val! !acc! )
+                )
+                )
+           )
+        )
+    )
+)
+)
+endlocal
\ No newline at end of file
diff --git a/win/tests/fftw_mt_benchmark.bat b/win/tests/fftw_mt_benchmark.bat
new file mode 100644
index 00000000..581bfdd7
--- /dev/null
+++ b/win/tests/fftw_mt_benchmark.bat
@@ -0,0 +1,313 @@
+@echo OFF
+setlocal EnableDelayedExpansion
+set NL=^
+
+
+REM two empty line required
+REM FFTW-MT Lib Batch Test
+REM This shell script benchmarks a program on a series of test problems
+REM
+REM #set -ix
+set me=%0
+
+
+if "%1" == "" (
+   echo Try '!me! --help' for more information.
+   exit /b )
+
+set usage_options=--h --help --verify --verify-only --verify-tolerance --accuracy --accuracy-rounds --k --keep-going --time-min --max --maxnd -o --user-option
+
+set usage_txt=   Usage: !me! [OPTION] program !NL! ^
+  -h, --help         print this help, then exit !NL! ^
+  --verify           verify each transform before timing !NL! ^
+  --verify-only      verify each transform but do not time !NL! ^
+  --verify-tolerance set error tolerance for --verify !NL! ^
+  --accuracy         run accuracy test !NL! ^
+  --accuracy-rounds  set number of rounds for --accuracy !NL! ^
+  -k, --keep-going   continue after verification error !NL! ^
+  --time-min:X       set minimum measurement time !NL! ^
+  --maxn:N           set maximum allowed problem size !NL! ^
+  --maxnd:N          set maximum allowed multi-dimensional problem size !NL! ^
+  -o:X, --user-option:X    undocumented !NL!
+
+set verify=no
+set accuracy=no
+set useropt=""
+set speed=yes
+set maxn=16777216
+set maxnd=1048576
+set time_min=""
+set keep_going=no
+set tolerance=""
+set rounds=""
+set arounds=""
+
+:GetLastArg
+set /a argCount+=1
+set program=%~1
+shift
+if not "%~1"=="" goto GetLastArg
+
+set delim_char=":"
+set arg_cnt=0
+
+for %%i in (%*) do (
+   set /a arg_cnt = !arg_cnt!+1
+   set optargset=yes
+   set arg_val=%%~i
+
+   if !arg_val! == !program! goto NEXT
+   echo %%~i|find %delim_char% >nul
+   if errorlevel 1 set optargset=no
+   for /F "tokens=1,2 delims=: " %%a in ("%%i") do (
+      set vararg=%%~a
+      set optarg=%%~b
+      )
+   REM echo vararg=!vararg!
+   if !optargset! == yes (
+       echo !usage_options!|find "!vararg!" >nul
+       if errorlevel 1 (
+          echo Invalid option !vararg!
+          exit /b )
+
+       if !vararg! == --verify-rounds (
+       set rounds=!optarg!
+       )
+       if !vararg! == --verify-tolerance (
+       set tolerance=!optarg!
+       )
+       if !vararg! == --keep-going (
+       set keep_going=!optarg!
+       )
+       if !vararg! == -k (
+       set keep_going=!optarg!
+       )
+       if !vararg! == --accuracy-rounds (
+       set arounds=!optarg!
+       )
+       if !vararg! == --time-min (
+       set time_min=!optarg!
+       )
+       if !vararg! == --maxn (
+       set maxn=!optarg!
+       )
+       if !vararg! == --maxnd (
+       set maxnd=!optarg!
+       )
+       if !vararg! == --user-option (
+       set useropt="%useropt% --user-option=!optarg!"
+       )
+    ) else (
+       if !vararg! == --verify-rounds (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --verify-tolerance (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --accuracy-rounds (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --time-min (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --maxn (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --maxnd (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --keep-going (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == -k (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --user-option (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !arg_val! == --help (
+       echo !usage_txt!
+       exit /b
+       )
+       if !arg_val! == --h (
+       echo !usage_txt!
+       exit /b
+       )
+       if !arg_val! == --keep-going (
+       set keep-going=yes
+       )
+       if !arg_val! == --k (
+       set keep-going=yes
+       )
+       if !arg_val! == --accuracy (
+       set accuracy=yes
+       set speed=no
+       )
+       if !arg_val! == --verify (
+       set verify=yes
+       )
+       if !arg_val! == --verify-only (
+       set verify=yes
+       set speed=no
+       )
+       echo !usage_options!|find "!arg_val!" >nul
+       if errorlevel 1 (
+          echo Invalid option !vararg!
+          exit /b )
+   )
+)
+
+:NEXT
+if !arg_cnt! == 1 (
+   if !arg_val! == --help echo !usage_txt!
+   if !arg_val! == -h echo !usage_txt!
+   if !arg_val! == !program! goto :NEXT_1
+   echo Try '!me! --help' for more information.
+   exit /b )
+
+:NEXT_1
+set SIZES_1D=2 4 6 8 9 12 15 16 18 24 32 36 64 80 108 128 210 256 504 512 1000 1024 1960 2048 4096 4725 8192 10368 16384 27000 32768 65536 75600 131072 165375 262144 362880 524288 1048576 1594323 1953125 2097152 4194304 4782969 5764801 8388608
+
+set SIZES_2D=4x4 5x5 6x6 7x7 8x8 4x8 8x4 9x9 10x10 11x11 12x12 13x13 14x14 15x15 16x16 25x24 32x32 48x48 49x49 60x60 72x56 64x64 75x75 80x80 84x84 128x64 16x512 96x96 100x100 105x105 112x112 120x120 128x128 144x144 180x180 512x64 256x128 240x240 256x256 64x1024 360x360 512x512 1000x1000 1024x1024 1960x1960 2048x2048 3360x3360 4096x4096 4725x4725 8192x8192 10368x10368 16384x16384 27000x27000 32768x32768
+
+set SIZES_3D=4x4x4 5x5x5 6x6x6 7x7x7 8x8x8 9x9x9 10x10x10 11x11x11 12x12x12 13x13x13 14x14x14 15x15x15 16x16x16 4x8x16 24x25x28 32x32x32 48x48x48 49x49x49 60x60x60 72x60x56 64x64x64 75x75x75 80x80x80 256x64x32 84x84x84 96x96x96 100x100x100 16x1024x64 105x105x105 112x112x112 120x120x120 128x128x128 144x144x144 512x128x64 180x180x180 256x128x256 240x240x240 256x256x256 512x64x1024 360x360x360 512x512x512
+
+set DIRECTIONS=f
+set ALL_SIZES=!SIZES_1D! !SIZES_2D! !SIZES_3D!
+
+set PLACE=i
+set REALITY=c
+set CORES=2 4 8 16 32 64 128
+set Threads=128
+
+REM openMP env variables settings-bind process in specified nodes
+set OMP_PROC_BIND=TRUE
+set OMP_PLACES=cores
+
+
+if not exist "!program!" (
+   echo !program! file does not exist
+   exit /b
+)
+
+REM test "$speed" = "no" || test -n "$time_min" || time_min=`$program --print-time-min`
+if !speed! == no (
+for /f %%i in ('%program% --print-time-min') do set time_min=%%~i )
+
+if !time_min! == "" (
+   for /f %%i in ('!program! --print-time-min') do set time_min=%%~i
+)
+
+REM precision=`$program --print-precision`
+for /f %%i in ('!program! --print-precision') do set precision=%%~i
+
+REM shorten the name
+if !precision! == single set precision=s
+if !precision! == double set precision=d
+
+REM name=`$program --info name`
+for /f %%i in ('!program! --info name') do set name=%%~i
+
+if not %tolerance%=="" (
+set tolerance=--verify-tolerance %tolerance%
+set vflags=!vflags! !tolerance!
+)
+
+if not !rounds! == "" (
+set rounds=--verify-rounds !rounds!
+set vflags=!vflags! !rounds!
+)
+
+if not !arounds! == "" (
+   set arounds=--accuracy-rounds !arounds!
+   set vflags=!vflags! !arounds!
+   set aflags=!arounds!
+)
+
+for %%t in (%Threads%) do (
+set thr_var=%%~t
+for %%p in (%PLACE%) do (
+   set plc_var=%%~p
+   for %%r in (%REALITY%) do (
+      set r_val=%%~r
+      for %%s in (%ALL_SIZES%) do (
+         REM :NEXT_LOOP
+         set size_var=%%~s
+         set size_val=%%~s
+         set oned=no
+         REM #for core in $CORES; do
+         set curmaxn=%maxnd%
+         echo !size_var!|find "x">nul
+         if errorlevel 1 set curmaxn=%maxn%
+
+         echo !size_var!|find "x">nul
+         if errorlevel 1 set oned=yes
+
+         set size_val=!size_val:x=*!
+         set /a size_val=!size_val!
+
+         if !size_val! LSS !curmaxn! (
+             for %%d in (%DIRECTIONS%) do (
+                set d_val=%%~d
+                set problem=!plc_var!!r_val!!d_val!!size_var!
+                REM #doable=`$program $useropt --can-do $problem`
+                REM set doable="#t"
+                set print=yes
+                set /A ITERS=3
+                for /l %%v in (1,1,!ITERS!) do (
+                   set acc=""
+                   if !verify! == yes (
+                      for /f "tokens=*" %%i in ('!program! !vflags! --verbose --verify !problem!') do set acc=%%i
+                      REM echo !acc!
+                      if !acc! == 'FAILED FAILED FAILED' if not !keep_going! == yes exit /b
+                    ) else ( set acc="" )
+                   if !speed! == yes (
+                      if !useropt! == "" set useropt=
+                      if !thr_var! == 16 (
+                         set thread_arg=-onthreads=!thr_var!
+                         set start_opts=1
+                         for /f "tokens=*" %%j in ('start /b /wait /node !start_opts! !program! !useropt! --report-benchmark --time-min !time_min! -opatient !thread_arg! --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      )
+                      if !thr_var! == 32 (
+                         set thread_arg=-onthreads=!thr_var!
+                         set start_opts=1,2
+                         for /f "tokens=*" %%j in ('start /b /wait /node !start_opts! !program! !useropt! --report-benchmark --time-min !time_min! -opatient !thread_arg! --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      )
+                      if !thr_var! == 64 (
+                         set thread_arg=-onthreads=!thr_var!
+                         set start_opts=1,2,3,4
+                         for /f "tokens=*" %%j in ('start /b /wait /node !start_opts! !program! !useropt! --report-benchmark --time-min !time_min! -opatient !thread_arg! --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      )
+                      if !thr_var! == 128 (
+                         set start_opts=1,2,3,4,5,6,7,8
+                         set thread_arg=-onthreads=!thr_var!
+                         for /f "tokens=*" %%j in ('start /b /wait /node !start_opts! !program! --report-benchmark --time-min !time_min! -opatient !thread_arg! --speed !problem!') do set time_val=%%j
+                         if !time_val! == "" set time_val="FAILED FAILED"
+                      )
+                    ) else ( set time_val="" )
+                   if !accuracy! == yes (
+                      if !oned! == yes (
+                        for /f "tokens=*" %%k in ('!program! !useropt! !aflags! --accuracy !problem!') do set acc=%%k
+                    ) else ( set print="no" )
+                   )
+                   if !print! == yes echo !thr_var! !name! !precision!!r_val!!plc_var!!d_val! !size_var! !time_val! !acc!
+                )
+                )
+           )
+        )
+    )
+)
+)
+endlocal
\ No newline at end of file
diff --git a/win/tests/fftw_st_benchmark.bat b/win/tests/fftw_st_benchmark.bat
new file mode 100644
index 00000000..cca23747
--- /dev/null
+++ b/win/tests/fftw_st_benchmark.bat
@@ -0,0 +1,282 @@
+@echo OFF
+setlocal EnableDelayedExpansion
+set NL=^
+
+
+REM two empty line required
+REM FFTW-ST Lib Batch Test
+REM This shell script benchmarks a program on a series of test problems
+REM
+REM #set -ix
+set me=%0
+
+
+if "%1" == "" (
+   echo Try '!me! --help' for more information.
+   exit /b )
+
+set usage_options=--h --help --verify --verify-only --verify-tolerance --accuracy --accuracy-rounds --k --keep-going --time-min --max --maxnd -o --user-option
+
+set usage_txt=   Usage: !me! [OPTION] program !NL! ^
+  -h, --help         print this help, then exit !NL! ^
+  --verify           verify each transform before timing !NL! ^
+  --verify-only      verify each transform but do not time !NL! ^
+  --verify-tolerance set error tolerance for --verify !NL! ^
+  --accuracy         run accuracy test !NL! ^
+  --accuracy-rounds  set number of rounds for --accuracy !NL! ^
+  -k, --keep-going   continue after verification error !NL! ^
+  --time-min:X       set minimum measurement time !NL! ^
+  --maxn:N           set maximum allowed problem size !NL! ^
+  --maxnd:N          set maximum allowed multi-dimensional problem size !NL! ^
+  -o:X, --user-option:X    undocumented !NL!
+
+set verify=no
+set accuracy=no
+set useropt=""
+set speed=yes
+set maxn=16777216
+set maxnd=1048576
+set time_min=""
+set keep_going=no
+set tolerance=""
+set rounds=""
+set arounds=""
+
+:GetLastArg
+set /a argCount+=1
+set program=%~1
+shift
+if not "%~1"=="" goto GetLastArg
+
+set delim_char=":"
+set arg_cnt=0
+
+for %%i in (%*) do (
+   set /a arg_cnt = !arg_cnt!+1
+   set optargset=yes
+   set arg_val=%%~i
+
+   if !arg_val! == !program! goto NEXT
+   echo %%~i|find %delim_char% >nul
+   if errorlevel 1 set optargset=no
+   for /F "tokens=1,2 delims=: " %%a in ("%%i") do (
+      set vararg=%%~a
+      set optarg=%%~b
+      )
+   REM echo vararg=!vararg!
+   if !optargset! == yes (
+       echo !usage_options!|find "!vararg!" >nul
+       if errorlevel 1 (
+          echo Invalid option !vararg!
+          exit /b )
+
+       if !vararg! == --verify-rounds (
+       set rounds=!optarg!
+       )
+       if !vararg! == --verify-tolerance (
+       set tolerance=!optarg!
+       )
+       if !vararg! == --keep-going (
+       set keep_going=!optarg!
+       )
+       if !vararg! == -k (
+       set keep_going=!optarg!
+       )
+       if !vararg! == --accuracy-rounds (
+       set arounds=!optarg!
+       )
+       if !vararg! == --time-min (
+       set time_min=!optarg!
+       )
+       if !vararg! == --maxn (
+       set maxn=!optarg!
+       )
+       if !vararg! == --maxnd (
+       set maxnd=!optarg!
+       )
+       if !vararg! == --user-option (
+       set useropt="%useropt% --user-option=!optarg!"
+       )
+    ) else (
+       if !vararg! == --verify-rounds (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --verify-tolerance (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --accuracy-rounds (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --time-min (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --maxn (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --maxnd (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --keep-going (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == -k (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !vararg! == --user-option (
+       echo error: missing argument to !vararg!
+       exit /b
+       )
+       if !arg_val! == --help (
+       echo !usage_txt!
+       exit /b
+       )
+       if !arg_val! == --h (
+       echo !usage_txt!
+       exit /b
+       )
+       if !arg_val! == --keep-going (
+       set keep-going=yes
+       )
+       if !arg_val! == --k (
+       set keep-going=yes
+       )
+       if !arg_val! == --accuracy (
+       set accuracy=yes
+       set speed=no
+       )
+       if !arg_val! == --verify (
+       set verify=yes
+       )
+       if !arg_val! == --verify-only (
+       set verify=yes
+       set speed=no
+       )
+       echo !usage_options!|find "!arg_val!" >nul
+       if errorlevel 1 (
+          echo Invalid option !vararg!
+          exit /b )
+   )
+)
+
+:NEXT
+if !arg_cnt! == 1 (
+   if !arg_val! == --help echo !usage_txt!
+   if !arg_val! == -h echo !usage_txt!
+   if !arg_val! == !program! goto :NEXT_1
+   echo Try '!me! --help' for more information.
+   exit /b )
+
+:NEXT_1
+set SIZES_1D=2 4 6 8 9 12 15 16 18 24 32 36 64 80 108 128 210 256 504 512 1000 1024 1960 2048 4096 4725 8192 10368 16384 27000 32768 65536 75600 131072 165375 262144 362880 524288 1048576 1594323 1953125 2097152 4194304 4782969 5764801 8388608
+
+set SIZES_2D=4x4 5x5 6x6 7x7 8x8 4x8 8x4 9x9 10x10 11x11 12x12 13x13 14x14 15x15 16x16 25x24 32x32 48x48 49x49 60x60 72x56 64x64 75x75 80x80 84x84 128x64 16x512 96x96 100x100 105x105 112x112 120x120 128x128 144x144 180x180 512x64 256x128 240x240 256x256 64x1024 360x360 512x512 1000x1000 1024x1024 1960x1960 2048x2048 3360x3360 4096x4096 4725x4725 8192x8192 10368x10368 16384x16384 27000x27000 32768x32768
+
+set SIZES_3D=4x4x4 5x5x5 6x6x6 7x7x7 8x8x8 9x9x9 10x10x10 11x11x11 12x12x12 13x13x13 14x14x14 15x15x15 16x16x16 4x8x16 24x25x28 32x32x32 48x48x48 49x49x49 60x60x60 72x60x56 64x64x64 75x75x75 80x80x80 256x64x32 84x84x84 96x96x96 100x100x100 16x1024x64 105x105x105 112x112x112 120x120x120 128x128x128 144x144x144 512x128x64 180x180x180 256x128x256 240x240x240 256x256x256 512x64x1024 360x360x360 512x512x512
+
+set DIRECTIONS=f
+set ALL_SIZES=!SIZES_1D! !SIZES_2D! !SIZES_3D!
+
+set PLACE=i
+set REALITY=c
+set CORES=2 4 8 16 32 64 128
+
+if not exist "!program!" (
+   echo !program! file does not exist
+   exit /b
+)
+
+REM test "$speed" = "no" || test -n "$time_min" || time_min=`$program --print-time-min`
+if !speed! == no (
+for /f %%i in ('%program% --print-time-min') do set time_min=%%~i )
+
+if !time_min! == "" (
+   for /f %%i in ('!program! --print-time-min') do set time_min=%%~i
+)
+
+REM precision=`$program --print-precision`
+for /f %%i in ('!program! --print-precision') do set precision=%%~i
+
+REM shorten the name
+if !precision! == single set precision=s
+if !precision! == double set precision=d
+
+REM name=`$program --info name`
+for /f %%i in ('!program! --info name') do set name=%%~i
+
+if not %tolerance%=="" (
+set tolerance=--verify-tolerance %tolerance%
+set vflags=!vflags! !tolerance!
+)
+
+if not !rounds! == "" (
+set rounds=--verify-rounds !rounds!
+set vflags=!vflags! !rounds!
+)
+
+if not !arounds! == "" (
+   set arounds=--accuracy-rounds !arounds!
+   set vflags=!vflags! !arounds!
+   set aflags=!arounds!
+)
+
+for %%p in (%PLACE%) do (
+   set plc_var=%%~p
+   for %%r in (%REALITY%) do (
+      set r_val=%%~r
+      for %%s in (%ALL_SIZES%) do (
+         REM :NEXT_LOOP
+         set size_var=%%~s
+         set size_val=%%~s
+         set oned=no
+         REM #for core in $CORES; do
+         set curmaxn=%maxnd%
+         echo !size_var!|find "x">nul
+         if errorlevel 1 set curmaxn=%maxn%
+
+         echo !size_var!|find "x">nul
+         if errorlevel 1 set oned=yes
+
+         set size_val=!size_val:x=*!
+         set /a size_val=!size_val!
+
+         if !size_val! LSS !curmaxn! (
+             for %%d in (%DIRECTIONS%) do (
+                set d_val=%%~d
+                set problem=!plc_var!!r_val!!d_val!!size_var!
+                REM #doable=`$program $useropt --can-do $problem`
+                REM set doable="#t"
+                set print=yes
+                set /A ITERS=3
+                for /l %%v in (1,1,!ITERS!) do (
+                   set acc=""
+                   if !verify! == yes (
+                      for /f "tokens=*" %%i in ('!program! !vflags! --verbose --verify !problem!') do set acc=%%i
+                      REM echo !acc!
+                      if !acc! == 'FAILED FAILED FAILED' if not !keep_going! == yes exit /b
+                    ) else ( set acc="" )
+                   if !speed! == yes (
+                      if !useropt! == "" set useropt=
+                      for /f "tokens=*" %%f in ('!program! !useropt! --report-benchmark --time-min !time_min! -opatient --speed !problem!') do set time_val=%%f
+                      if !time_val! == 'FAILED FAILED' if not !keep_going! == yes exit /b
+                    ) else ( set time_val="" )
+                   if !accuracy! == yes (
+                      if !oned! == yes (
+                        for /f "tokens=*" %%k in ('!program! !useropt! !aflags! --accuracy !problem!') do set acc=%%k
+                    ) else ( set print="no" )
+                   )
+                   if !print! == yes echo !name! !precision!!r_val!!plc_var!!d_val! !size_var! !time_val! !acc!
+                )
+                )
+           )
+        )
+    )
+)
+endlocal
\ No newline at end of file

From 7fd2e222fa47d0ab9591958a5e80d78375f9094c Mon Sep 17 00:00:00 2001
From: Eashan Dash <Eashan.Dash@amd.com>
Date: Wed, 26 May 2021 17:17:29 +0530
Subject: [PATCH 3/6] This change contains a fix to ensure AMD Top N and Fast
 Planner features are disabled for qaud and long double

1) The new AMD FFTW planner features like the Top N planner and Fast planner are supported only for double-precision and single-precision.

2) Files configure.ac and configure are modified to enable the fix.

This code change relates to Jira task AMD-Internal : [CPUPL-1554]

Change-Id: Icc28a17ce3c054a9f82eac86e0fb21b35064e4b9
---
 configure    | 12 ++++++++----
 configure.ac | 14 +++++++++-----
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/configure b/configure
index 5a6d02b0..73dfb29a 100755
--- a/configure
+++ b/configure
@@ -17623,20 +17623,24 @@ fi
 if test "$have_amd_fast_planner" = yes && test "$have_amd_top_n_planner" = yes; then
 	as_fn_error $? "AMD_FAST_PLANNER and AMD_TOP_N_PLANNER can not be enabled together" "$LINENO" 5
 else
-        if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+        if (test "$have_amd_fast_planner" = yes && (test "$quad_precision_supported" = yes || test "$long_double_supported" = yes)); then
+		as_fn_error $? "AMD_FAST_PLANNER can not be enabled for Quad or Long double" "$LINENO" 5
+	elif (test "$have_amd_fast_planner" = yes); then
 
 $as_echo "#define AMD_OPT_FAST_PLANNER 1" >>confdefs.h
 
         fi
 	# Check if amd-top-n-planner is enabled with mpi, openmp or threads
-	if (test "$enable_mpi" = "yes" && test "$have_amd_top_n_planner" = yes ) || (test "$enable_openmp" = "yes" && test "$have_amd_top_n_planner" = yes) || (test "$enable_threads" = "yes" && test "$have_amd_top_n_planner" = yes); then
+	if (test "$have_amd_top_n_planner" = yes && (test "$enable_threads" = yes || test "$enable_openmp" = yes || test "$enable_mpi" = yes)); then
 		as_fn_error $? "AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode" "$LINENO" 5
 	else
-                if test "$have_amd_top_n_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+               if (test "$have_amd_top_n_planner" = yes && (test "$quad_precision_supported" = yes || test "$long_double_supported" = yes)); then
+	 	       as_fn_error $? "AMD_TOP_N_PLANNER can not be enabled for Quad or Long double" "$LINENO" 5
+	       elif (test "$have_amd_top_n_planner" = yes); then
 
 $as_echo "#define AMD_OPT_TOP_N_PLANNER 1" >>confdefs.h
 
-                fi
+               fi
         fi
 fi
 
diff --git a/configure.ac b/configure.ac
index e9442832..a47576ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -734,16 +734,20 @@ AC_ARG_ENABLE(amd-top-n-planner, [AC_HELP_STRING([--enable-amd-top-n-planner],[e
 if test "$have_amd_fast_planner" = yes && test "$have_amd_top_n_planner" = yes; then 
 	AC_MSG_ERROR([AMD_FAST_PLANNER and AMD_TOP_N_PLANNER can not be enabled together])
 else
-        if test "$have_amd_fast_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
+        if (test "$have_amd_fast_planner" = yes && (test "$quad_precision_supported" = yes || test "$long_double_supported" = yes)); then		
+		AC_MSG_ERROR([AMD_FAST_PLANNER can not be enabled for Quad or Long double])
+	elif (test "$have_amd_fast_planner" = yes); then
 	        AC_DEFINE(AMD_OPT_FAST_PLANNER,1,[Define to enable AMD Fast Planner for AMD cpus.])
         fi
 	# Check if amd-top-n-planner is enabled with mpi, openmp or threads
-	if (test "$enable_mpi" = "yes" && test "$have_amd_top_n_planner" = yes ) || (test "$enable_openmp" = "yes" && test "$have_amd_top_n_planner" = yes) || (test "$enable_threads" = "yes" && test "$have_amd_top_n_planner" = yes); then
+	if (test "$have_amd_top_n_planner" = yes && (test "$enable_threads" = yes || test "$enable_openmp" = yes || test "$enable_mpi" = yes)); then
 		AC_MSG_ERROR([AMD_TOP_N_PLANNER can not be enabled with mpi, openmp or threads as it is supported only for single threaded mode])
 	else	
-                if test "$have_amd_top_n_planner" = yes && test "$quad_precision_supported" = no && test "$long_double_supported" = no; then
-                        AC_DEFINE(AMD_OPT_TOP_N_PLANNER,1,[Define to enable AMD Top N Planner for AMD cpus.])
-                fi
+               if (test "$have_amd_top_n_planner" = yes && (test "$quad_precision_supported" = yes || test "$long_double_supported" = yes)); then 
+	 	       AC_MSG_ERROR([AMD_TOP_N_PLANNER can not be enabled for Quad or Long double])
+	       elif (test "$have_amd_top_n_planner" = yes); then
+                       AC_DEFINE(AMD_OPT_TOP_N_PLANNER,1,[Define to enable AMD Top N Planner for AMD cpus.])
+               fi
         fi		
 fi
 

From 8b7f06e99ad0d18547cede2c91c1438e9b2dd327 Mon Sep 17 00:00:00 2001
From: sraut <Biplab.Raut@amd.com>
Date: Fri, 4 Jun 2021 19:03:54 +0530
Subject: [PATCH 4/6] This change adds AMD optimizations to improve MPI and ST
 performance for double-precision and single-precision.

1) A new block based MPI transpose algorithm/solver is implemented that does not require a memcpy operation and
   the auxiliary memory space for the in-place MPI transpose.
   This new algorithm makes use of the VADER LIMIT from the underlying MPI framework.
2) Added configure option --enable-amd-vader-limit that enables the new block based MPI transpose algorithm.
   If --enable-amd-vader-limit is not used in configure then this new algorithm is disabled.
   If --enable-amd-vader-limit is used in configure then this new algorithm is enabled.
   When using --enable-amd-vader-limit as configure option, the user needs to set --mca btl_vader_eager_limit
   appropriately (current preference is 65536) in the MPIRUN command.
   When using --enable-amd-vader-limit as configure option, mpi/Makefile needs to be edited appropriately,
   with --mca btl_vader_eager_limit 65536, to pass the make check verification related to mpi tests.
3) New optimizations are added to routines in cpy2d.c/cpy2d, transpose.c/transpose, vranl3-transpose/transpose_toms513
   under switches AMD_OPT_UNROLL_CPY2D, AMD_OPT_IN_PLACE_SQU_TRANS and AMD_OPT_TOMS513_TRANS respectively.
4) AOCL Version is updated to 3.0.1

This code change relates to Jira task AMD-Internal: [CPUPL-1027]

Change-Id: I383790999f8ec97a01ffbae169defa7f7e1272e9
---
 CMakeLists.txt              |   2 +-
 README_AMD.md               |  11 +-
 config.h.in                 |   3 +
 configure                   |  20 +-
 configure.ac                |   9 +-
 kernel/cpy2d.c              | 113 +++++++++-
 kernel/ifftw.h              |  24 +-
 kernel/transpose.c          |  68 +++++-
 mpi/Makefile.am             |   2 +-
 mpi/Makefile.in             |   5 +-
 mpi/conf.c                  |   5 +-
 mpi/mpi-transpose.h         |   3 +-
 mpi/transpose-blk-scheme1.c | 424 ++++++++++++++++++++++++++++++++++++
 rdft/vrank3-transpose.c     | 201 +++++++++++++++++
 14 files changed, 872 insertions(+), 18 deletions(-)
 create mode 100644 mpi/transpose-blk-scheme1.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30577056..f4d2e623 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -218,7 +218,7 @@ if (MSVC)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 endif(MSVC)
 
-add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.1")
+add_compile_definitions(AOCL_FFTW_VERSION="AOCL FFTW 3.0.1")
 
 find_library (LIBM_LIBRARY NAMES m)
 if (LIBM_LIBRARY)
diff --git a/README_AMD.md b/README_AMD.md
index c5b52b05..80daa63f 100644
--- a/README_AMD.md
+++ b/README_AMD.md
@@ -12,9 +12,11 @@ functions (cpy2d and cpy2d_pair used in rank-0 transform and buffering plan),
 improved 256-bit kernels selection by Planner and an optional in-place 
 transpose for large problem sizes. AMD Optimized FFTW improves the performance
 of in-place MPI FFT over FFTW 3.3.8 by employing a faster in-place MPI
-transpose function. As of AMD FFTW 3.0, a new fast planner is added as an
+transpose function. AMD Optimized FFTW provides a new fast planner as an
 extension to the original planner that improves planning time of various
-planning modes in general and PATIENT mode in particular.
+planning modes in general and PATIENT mode in particular. As of AMD FFTW 3.0.1,
+a new feature called Top N planner is introduced that minimizes single-threaded
+run-to-run variations.
 
 FFTW is a free collection of fast C routines for computing the
 Discrete Fourier Transform and various special cases thereof in one or more
@@ -53,6 +55,11 @@ configure option "--enable-generic-simd128" or "--enable-generic-simd256".
 The optional configure option "--enable-amd-mpifft" enables the MPI FFT
 related optimizations.
 
+An optional configure option "--enable-amd-mpi-vader-limit" is supported that 
+controls enabling of AMD's new MPI transpose algorithms. When using this 
+configure option, the user needs to set --mca btl_vader_eager_limit
+appropriately (current preference is 65536) in the MPIRUN command.
+
 The new fast planner can be enabled using optional configure option 
 "--enable-amd-fast-planner". It is supported for single and double precisions.
 
diff --git a/config.h.in b/config.h.in
index feb022df..e600cd77 100644
--- a/config.h.in
+++ b/config.h.in
@@ -1,5 +1,8 @@
 /* config.h.in.  Generated from configure.ac by autoheader.  */
 
+/* Set VADER LIMIT in order to enable new AMD MPI transpose algorithms. */
+#undef AMD_MPI_VADER_LIMIT_SET
+
 /* Define to enable AMD cpu specific optimizations. */
 #undef AMD_OPT_ALL
 
diff --git a/configure b/configure
index 73dfb29a..64bbf28c 100755
--- a/configure
+++ b/configure
@@ -875,6 +875,7 @@ with_sysroot
 enable_libtool_lock
 enable_mpi
 enable_amd_opt
+enable_amd_mpi_vader_limit
 enable_amd_trans
 enable_amd_mpifft
 enable_openmp
@@ -1586,6 +1587,9 @@ Optional Features:
   --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-mpi            compile FFTW MPI library
   --enable-amd-opt        enable AMD cpu specific optimizations
+  --enable-amd-mpi-vader-limit
+                          enable setting of VADER LIMIT that controls enabling
+                          of new AMD MPI transpose algorithms
   --enable-amd-trans      enable AMD cpu optimized Transpose
   --enable-amd-mpifft     enable AMD cpu optimized MPI FFT
   --enable-openmp         use OpenMP directives for parallelism
@@ -17563,6 +17567,20 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then
 
 $as_echo "#define AMD_OPT_ALL 1" >>confdefs.h
 
+
+		# Check whether --enable-amd-mpi-vader-limit was given.
+if test "${enable_amd_mpi_vader_limit+set}" = set; then :
+  enableval=$enable_amd_mpi_vader_limit; have_amd_mpi_vl=$enableval
+else
+  have_amd_mpi_vl=no
+fi
+
+	if test "$have_amd_mpi_vl" = yes ; then
+
+$as_echo "#define AMD_MPI_VADER_LIMIT_SET 1" >>confdefs.h
+
+	fi
+
 fi
 # Check whether --enable-amd-trans was given.
 if test "${enable_amd_trans+set}" = set; then :
@@ -17603,7 +17621,7 @@ else
 fi
 
 
-$as_echo "#define AOCL_FFTW_VERSION \"AOCL-3.0\"" >>confdefs.h
+$as_echo "#define AOCL_FFTW_VERSION \"AOCL FFTW 3.0.1\"" >>confdefs.h
 
 # Check whether --enable-amd-fast-planner was given.
 if test "${enable_amd_fast_planner+set}" = set; then :
diff --git a/configure.ac b/configure.ac
index a47576ac..7e18f610 100644
--- a/configure.ac
+++ b/configure.ac
@@ -711,6 +711,13 @@ if test "$have_amd_opt" = yes && test "${enable_debug+set}" != "set"; then
 		fi
 	fi
 	AC_DEFINE(AMD_OPT_ALL,1,[Define to enable AMD cpu specific optimizations.])
+	
+	dnl amd switch for VADER LIMIT that controls enabling of AMD's new MPI transpose algorithms --enable-amd-mpi-vader-limit
+	AC_ARG_ENABLE(amd-mpi-vader-limit, [AC_HELP_STRING([--enable-amd-mpi-vader-limit],[enable setting of VADER LIMIT that controls enabling of new AMD MPI transpose algorithms])], have_amd_mpi_vl=$enableval, have_amd_mpi_vl=no)
+	if test "$have_amd_mpi_vl" = yes ; then
+		AC_DEFINE(AMD_MPI_VADER_LIMIT_SET,1,[Set VADER LIMIT in order to enable new AMD MPI transpose algorithms.])
+	fi
+
 fi
 dnl amd optimization switch to enable amd cpu optimized transpose --enable-amd-trans
 AC_ARG_ENABLE(amd-trans, [AC_HELP_STRING([--enable-amd-trans],[enable AMD cpu optimized Transpose])], have_amd_trans=$enableval, have_amd_trans=no)
@@ -725,7 +732,7 @@ fi
 AC_ARG_ENABLE(openmp, [AC_HELP_STRING([--enable-openmp],[use OpenMP directives for parallelism])], enable_openmp=$enableval, enable_openmp=no)
 AC_ARG_ENABLE(threads, [AC_HELP_STRING([--enable-threads],[compile FFTW SMP threads library])], enable_threads=$enableval, enable_threads=no)
 dnl aocl version number of amd-fftw
-AC_DEFINE(AOCL_FFTW_VERSION,"AOCL-3.0",[AOCL Version of AMD-FFTW])
+AC_DEFINE(AOCL_FFTW_VERSION,"AOCL FFTW 3.0.1",[AOCL Version of AMD-FFTW])
 dnl amd optimization switch to enable AMD Fast Planner for AMD cpus --enable-amd-fast-planner
 AC_ARG_ENABLE(amd-fast-planner, [AC_HELP_STRING([--enable-amd-fast-planner],[enable AMD Fast Planner for a faster planning time on AMD cpus])], have_amd_fast_planner=$enableval, have_amd_fast_planner=no)
 dnl amd optimization switch to enable AMD Top N Planner for AMD cpus --enable-amd-top-n-planner
diff --git a/kernel/cpy2d.c b/kernel/cpy2d.c
index 46819ac1..9c616716 100644
--- a/kernel/cpy2d.c
+++ b/kernel/cpy2d.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -604,7 +604,6 @@ void X(cpy2d)(R *I, R *O,
 	      INT vl)
 {
      INT i0, i1, v;
-     
      switch (vl) {
 	 case 1:
 	      for (i1 = 0; i1 < n1; ++i1)
@@ -649,16 +648,23 @@ void X(cpy2d)(R *I, R *O,
 				  *(double *)&I[i0 * is0 + i1 * is1];
 			}
 	      } else {
+#ifdef AMD_OPT_UNROLL_CPY2D
+		      __m256d in1, in2, in3, in4, in5, in6, in7, in8;
+#else
 		      __m256d in1, in2, in3, in4;
+#endif
 		      __m256d out1, out2;
 		      __m128d in1_128, in2_128;
 		      INT t0, t1, t2, t3;
 		      INT n0_rem = n0&0x1, n1_rem = n1&0x1;
+#ifdef AMD_OPT_UNROLL_CPY2D
+			  INT n0_8, n0_8_rem, n0_16, n0_16_rem;
+#endif
 		      t0 = (is0==2);
 		      t1 = (os0==2);
 		      t2 = (is1==2);
 		      t3 = (os1==2);
-		      
+			  
 		      switch(t0 | (t1 << 1) | (t2 << 2) | (t3 << 3))
 		      {
 			      case 1://only is0 is 2. 256-bit contiguous read possible
@@ -768,8 +774,79 @@ void X(cpy2d)(R *I, R *O,
 			      break;
 			      
 			      case 9://is0=2 and os1=2. Both 256-bit read and 256-bit write possible
+#ifdef AMD_OPT_UNROLL_CPY2D
+			      n0_8_rem = n0&0x7;
+			      n0_8 = n0 - n0_8_rem;
+#endif
 			      n0 = n0 - n0_rem;
 			      n1 = n1 - n1_rem;
+#ifdef AMD_OPT_UNROLL_CPY2D
+			      for (i1 = 0; i1 < n1; i1+=2)
+			      {
+				      for (i0 = 0; i0 < n0_8; i0+=8) {
+					      in1 = _mm256_loadu_pd((double const *)&I[i0 * is0 + i1 * is1]);
+					      in2 = _mm256_loadu_pd((double const *)&I[i0 * is0 + (i1+1) * is1]);
+					      in3 = _mm256_loadu_pd((double const *)&I[(i0+2) * is0 + i1 * is1]);
+					      in4 = _mm256_loadu_pd((double const *)&I[(i0+2) * is0 + (i1+1) * is1]);
+					      in5 = _mm256_loadu_pd((double const *)&I[(i0+4) * is0 + i1 * is1]);
+					      in6 = _mm256_loadu_pd((double const *)&I[(i0+4) * is0 + (i1+1) * is1]);
+					      in7 = _mm256_loadu_pd((double const *)&I[(i0+6) * is0 + i1 * is1]);
+					      in8 = _mm256_loadu_pd((double const *)&I[(i0+6) * is0 + (i1+1) * is1]);
+
+					      //out1 = _mm256_shuffle_pd(in1, in2, 0x33);
+					      //out2 = _mm256_shuffle_pd(in1, in2, 0x11);
+					      out1 = _mm256_permute2f128_pd(in1, in2, 0x20);
+					      out2 = _mm256_permute2f128_pd(in1, in2, 0x31);
+					      in1 = _mm256_permute2f128_pd(in3, in4, 0x20);
+					      in2 = _mm256_permute2f128_pd(in3, in4, 0x31);
+					      in3 = _mm256_permute2f128_pd(in5, in6, 0x20);
+					      in4 = _mm256_permute2f128_pd(in5, in6, 0x31);
+					      in5 = _mm256_permute2f128_pd(in7, in8, 0x20);
+					      in6 = _mm256_permute2f128_pd(in7, in8, 0x31);
+
+					      _mm256_storeu_pd((double *)&O[i0 * os0 + i1 * os1], out1);
+					      _mm256_storeu_pd((double *)&O[(i0+1) * os0 + i1 * os1], out2);
+					      _mm256_storeu_pd((double *)&O[(i0+2) * os0 + i1 * os1], in1);
+					      _mm256_storeu_pd((double *)&O[(i0+3) * os0 + i1 * os1], in2);
+					      _mm256_storeu_pd((double *)&O[(i0+4) * os0 + i1 * os1], in3);
+					      _mm256_storeu_pd((double *)&O[(i0+5) * os0 + i1 * os1], in4);
+					      _mm256_storeu_pd((double *)&O[(i0+6) * os0 + i1 * os1], in5);
+					      _mm256_storeu_pd((double *)&O[(i0+7) * os0 + i1 * os1], in6);
+				      }
+				      for (; i0 < n0; i0+=2) {
+					      in1 = _mm256_loadu_pd((double const *)&I[i0 * is0 + i1 * is1]);
+					      in2 = _mm256_loadu_pd((double const *)&I[i0 * is0 + (i1+1) * is1]);
+
+					      //out1 = _mm256_shuffle_pd(in1, in2, 0x33);
+					      //out2 = _mm256_shuffle_pd(in1, in2, 0x11);
+					      out1 = _mm256_permute2f128_pd(in1, in2, 0x20);
+					      out2 = _mm256_permute2f128_pd(in1, in2, 0x31);
+					      _mm256_storeu_pd((double *)&O[i0 * os0 + i1 * os1], out1);
+					      _mm256_storeu_pd((double *)&O[(i0+1) * os0 + i1 * os1], out2);
+				      }
+				      if (n0_rem)
+				      {
+					      R x0 = I[i0 * is0 + i1 * is1];
+					      R x1 = I[i0 * is0 + i1 * is1 + 1];
+					      R x2 = I[i0 * is0 + (i1+1) * is1];
+					      R x3 = I[i0 * is0 + (i1+1) * is1 + 1];
+					      O[i0 * os0 + i1 * os1] = x0;
+					      O[i0 * os0 + i1 * os1 + 1] = x1;
+					      O[i0 * os0 + (i1+1) * os1] = x2;
+					      O[i0 * os0 + (i1+1) * os1 + 1] = x3;
+				      }
+			      }
+			      if (n1_rem)
+			      {
+				      n0 += n0_rem;
+				      for (i0 = 0; i0 < n0; ++i0) {
+					      R x0 = I[i0 * is0 + i1 * is1];
+					      R x1 = I[i0 * is0 + i1 * is1 + 1];
+					      O[i0 * os0 + i1 * os1] = x0;
+					      O[i0 * os0 + i1 * os1 + 1] = x1;
+				      }
+			      }
+#else
 			      for (i1 = 0; i1 < n1; i1+=2)
 			      {
 				      for (i0 = 0; i0 < n0; i0+=2) {
@@ -805,19 +882,49 @@ void X(cpy2d)(R *I, R *O,
 					      O[i0 * os0 + i1 * os1 + 1] = x1;
 				      }
 			      }
+#endif
 			      break;
 
 			      case 3://is0=2 and os0=2. Both 256-bit read and 256-bit write possible
 			      case 7://is0=2 and os0=2. Also is1=2. Both 256-bit read and 256-bit write possible
 			      case 11://is0=2 and os0=2. Also os1=2. Both 256-bit read and 256-bit write possible
 			      case 15://is0=2 and os0=2. Also is1=2, os1=2. Both 256-bit read and 256-bit write possible
+#ifdef AMD_OPT_UNROLL_CPY2D
+			      n0_16_rem = n0&0xF;
+			      n0_16 = n0 - n0_16_rem;
+#endif
 			      n0 = n0 - n0_rem;
 			      for (i1 = 0; i1 < n1; ++i1)
 			      {
+#ifdef AMD_OPT_UNROLL_CPY2D
+				      for (i0 = 0; i0 < n0_16; i0+=16) {
+					      in1 = _mm256_loadu_pd((double const *)&I[i0 * is0 + i1 * is1]);
+					      in2 = _mm256_loadu_pd((double const *)&I[(i0+2) * is0 + i1 * is1]);
+					      in3 = _mm256_loadu_pd((double const *)&I[(i0+4) * is0 + i1 * is1]);
+					      in4 = _mm256_loadu_pd((double const *)&I[(i0+6) * is0 + i1 * is1]);
+					      in5 = _mm256_loadu_pd((double const *)&I[(i0+8) * is0 + i1 * is1]);
+					      in6 = _mm256_loadu_pd((double const *)&I[(i0+10) * is0 + i1 * is1]);
+					      in7 = _mm256_loadu_pd((double const *)&I[(i0+12) * is0 + i1 * is1]);
+					      in8 = _mm256_loadu_pd((double const *)&I[(i0+14) * is0 + i1 * is1]);
+					      _mm256_storeu_pd((double *)&O[i0 * os0 + i1 * os1], in1);
+					      _mm256_storeu_pd((double *)&O[(i0+2) * os0 + i1 * os1], in2);
+					      _mm256_storeu_pd((double *)&O[(i0+4) * os0 + i1 * os1], in3);
+					      _mm256_storeu_pd((double *)&O[(i0+6) * os0 + i1 * os1], in4);
+					      _mm256_storeu_pd((double *)&O[(i0+8) * os0 + i1 * os1], in5);
+					      _mm256_storeu_pd((double *)&O[(i0+10) * os0 + i1 * os1], in6);
+					      _mm256_storeu_pd((double *)&O[(i0+12) * os0 + i1 * os1], in7);
+					      _mm256_storeu_pd((double *)&O[(i0+14) * os0 + i1 * os1], in8);
+				      }
+				      for (; i0 < n0; i0+=2) {
+					      in1 = _mm256_loadu_pd((double const *)&I[i0 * is0 + i1 * is1]);
+					      _mm256_storeu_pd((double *)&O[i0 * os0 + i1 * os1], in1);
+				      }
+#else
 				      for (i0 = 0; i0 < n0; i0+=2) {
 					      in1 = _mm256_loadu_pd((double const *)&I[i0 * is0 + i1 * is1]);
 					      _mm256_storeu_pd((double *)&O[i0 * os0 + i1 * os1], in1);
 				      }
+#endif
 				      if (n0_rem)
 				      {
 					      R x0 = I[i0 * is0 + i1 * is1];
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index 62a9d90b..9b30f692 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -96,6 +96,10 @@ extern "C"
 #define AMD_OPT_IN_PLACE_1D_CPY2D_STABLE_INTRIN
 //Below switch enables the use of memcpy function in cpy2d_pair routine instead of SIMD 256-bit load and store
 #define AMD_OPT_USE_MEMCPY_TO_CPY
+//Below switch enables the unrolling of memory read and write SIMD operations in cpy2d routine.
+#if (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+#define AMD_OPT_UNROLL_CPY2D
+#endif
 //--------------------------------
 //In-place Transpose related optimization switches :-
 //The below switches are defined through config.h using configure script run-time feature arg --enable-amd-trans
@@ -114,6 +118,10 @@ extern "C"
 //(ii) enables new auto-tuned cache-efficient raster order tiled transpose for squared sized matrix
 //     (for this optimization switch, AMD_OPT_AUTO_TUNED_TRANS_BLK_SIZE should also be enabled)
 //#define AMD_OPT_AUTO_TUNED_RASTER_TILED_TRANS_METHOD
+//The below switch enables AMD optimizations for the in-place square transpose routine.
+#define AMD_OPT_IN_PLACE_SQU_TRANS
+//The below switch enables AMD optimizations for the in-situ Toms513 algorithm.
+#define AMD_OPT_TOMS513_TRANS
 //--------------------------------
 //Kernel new implementations and optimization enable/disable switch by AMD_OPT_KERNEL_256SIMD_PERF
 #define AMD_OPT_KERNEL_256SIMD_PERF
@@ -126,6 +134,10 @@ extern "C"
 //#define AMD_MPI_MALLOC_ONCE
 //Enables debug logs for MPI FFT/Transpose solvers
 //#define AMD_MPI_TRANSPOSE_LOGS
+#ifdef AMD_MPI_VADER_LIMIT_SET
+//Below switch enables new MPI fast in-place transpose algorithms and solvers.
+#define AMD_OPT_MPIFFT_FAST_BLK_BASED_TRANSPOSE
+#endif
 #endif
 //--------------------------------
 //NEW FAST PLANNER for AMD CPUs can be enabled with the below switch AMD_FAST_PLANNER.
@@ -141,7 +153,7 @@ extern "C"
 //#define AMD_FAST_PLANNING_HASH_V2
 #define AMD_HASH_UNBLESS_MAX_SIZE 10485760
 #endif
-
+//--------------------------------
 //NEW TOP N PLANNER feature for AMD CPUs can be enabled with the below switch AMD_TOP_N_PLANNER.
 //The new Top N planner improves the run-to-run variations by using a dynamic wisdom (preset) plan functionality.
 //This feature implements the mechanism to search and store top N plans into the wisdom file and then use these plans to find the best plan for execution in the consecutive runs.
@@ -150,9 +162,13 @@ extern "C"
 #define AMD_TOP_N_PLANNER
 #define AMD_OPT_TOP_N 3 //The value of AMD_OPT_TOP_N is fixed as 3, enabling the search, store and re-use of Top 3 plans. This value should not be changed by the user.
 #endif
-
+//--------------------------------
 #endif//#ifdef AMD_OPT_ALL  
- //============================================================
+//Below is a manual switch to control VADER LIMIT
+//This is upper limit that each process/rank can send in bytes to the receiver process/rank with buffers for receiving them
+//without any synchronization on completion status.
+#define VADER_LIMIT 8000//8000//4000//500
+//============================================================
 //AMD OPTIMIZATIONS :- end
 
 /*
diff --git a/kernel/transpose.c b/kernel/transpose.c
index c5c10554..ca83fd73 100644
--- a/kernel/transpose.c
+++ b/kernel/transpose.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,10 +21,22 @@
 
 #include "kernel/ifftw.h"
 
+#if defined(AMD_OPT_IN_PLACE_SQU_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD))
+#include "immintrin.h"
+#endif
+
 /* in place square transposition, iterative */
 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
 {
      INT i0, i1, v;
+#if defined(AMD_OPT_IN_PLACE_SQU_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD))
+	 int vl_8factor = vl - (vl & 0xF);
+#ifdef FFTW_SINGLE
+ 	 __m256 in1, in2, in3, in4, in5, in6, in7, in8;
+#else
+	 __m256d in1, in2, in3, in4, in5, in6, in7, in8;
+#endif
+#endif
 
      switch (vl) {
 	 case 1:
@@ -52,6 +64,59 @@ void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
 	      }
 	      break;
 	 default:
+#if defined(AMD_OPT_IN_PLACE_SQU_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD))
+#ifdef FFTW_SINGLE
+	      for (i1 = 1; i1 < n; ++i1) {
+		      for (i0 = 0; i0 < i1; ++i0) {
+			      for (v = 0; v < vl_8factor; v+=16) {
+				      in1 = _mm256_loadu_ps((float const *)&I[i1 * s0 + i0 * s1 + v + 0]);
+				      in2 = _mm256_loadu_ps((float const *)&I[i1 * s1 + i0 * s0 + v + 0]);
+				      in3 = _mm256_loadu_ps((float const *)&I[i1 * s0 + i0 * s1 + v + 8]);
+				      in4 = _mm256_loadu_ps((float const *)&I[i1 * s1 + i0 * s0 + v + 8]);
+				      _mm256_storeu_ps((double *)&I[i1 * s1 + i0 * s0 + v + 0], in1);
+				      _mm256_storeu_ps((double *)&I[i1 * s0 + i0 * s1 + v + 0], in2);
+				      _mm256_storeu_ps((double *)&I[i1 * s1 + i0 * s0 + v + 8], in3);
+				      _mm256_storeu_ps((double *)&I[i1 * s0 + i0 * s1 + v + 8], in4);
+			      }
+			      for (; v < vl; ++v) {
+				      R x0 = I[i1 * s0 + i0 * s1 + v];
+				      R y0 = I[i1 * s1 + i0 * s0 + v];
+				      I[i1 * s1 + i0 * s0 + v] = x0;
+				      I[i1 * s0 + i0 * s1 + v] = y0;
+			      }
+		      }
+	      }
+#else
+	      for (i1 = 1; i1 < n; ++i1) {
+		      for (i0 = 0; i0 < i1; ++i0) {
+			      for (v = 0; v < vl_8factor; v+=16) {
+				      in1 = _mm256_loadu_pd((double const *)&I[i1 * s0 + i0 * s1 + v + 0]);
+				      in2 = _mm256_loadu_pd((double const *)&I[i1 * s1 + i0 * s0 + v + 0]);
+				      in3 = _mm256_loadu_pd((double const *)&I[i1 * s0 + i0 * s1 + v + 4]);
+				      in4 = _mm256_loadu_pd((double const *)&I[i1 * s1 + i0 * s0 + v + 4]);
+				      in5 = _mm256_loadu_pd((double const *)&I[i1 * s0 + i0 * s1 + v + 8]);
+				      in6 = _mm256_loadu_pd((double const *)&I[i1 * s1 + i0 * s0 + v + 8]);
+				      in7 = _mm256_loadu_pd((double const *)&I[i1 * s0 + i0 * s1 + v + 12]);
+				      in8 = _mm256_loadu_pd((double const *)&I[i1 * s1 + i0 * s0 + v + 12]);
+				      _mm256_storeu_pd((double *)&I[i1 * s1 + i0 * s0 + v + 0], in1);
+				      _mm256_storeu_pd((double *)&I[i1 * s0 + i0 * s1 + v + 0], in2);
+				      _mm256_storeu_pd((double *)&I[i1 * s1 + i0 * s0 + v + 4], in3);
+				      _mm256_storeu_pd((double *)&I[i1 * s0 + i0 * s1 + v + 4], in4);
+				      _mm256_storeu_pd((double *)&I[i1 * s1 + i0 * s0 + v + 8], in5);
+				      _mm256_storeu_pd((double *)&I[i1 * s0 + i0 * s1 + v + 8], in6);
+				      _mm256_storeu_pd((double *)&I[i1 * s1 + i0 * s0 + v + 12], in7);
+				      _mm256_storeu_pd((double *)&I[i1 * s0 + i0 * s1 + v + 12], in8);
+			      }
+			      for (; v < vl; ++v) {
+				      R x0 = I[i1 * s0 + i0 * s1 + v];
+				      R y0 = I[i1 * s1 + i0 * s0 + v];
+				      I[i1 * s1 + i0 * s0 + v] = x0;
+				      I[i1 * s0 + i0 * s1 + v] = y0;
+			      }
+		      }
+	      }
+#endif
+#else
 	      for (i1 = 1; i1 < n; ++i1) {
 		   for (i0 = 0; i0 < i1; ++i0) {
 			for (v = 0; v < vl; ++v) {
@@ -62,6 +127,7 @@ void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
 			}
 		   }
 	      }
+#endif
 	      break;
      }
 }
diff --git a/mpi/Makefile.am b/mpi/Makefile.am
index 04d32b25..fb1ae4c7 100644
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -16,7 +16,7 @@ EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in f
 BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
 CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
 
-TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-pairwise-omc.c transpose-problem.c transpose-solve.c mpi-transpose.h
+TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-pairwise-omc.c transpose-blk-scheme1.c transpose-problem.c transpose-solve.c mpi-transpose.h
 DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
 RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
 RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
diff --git a/mpi/Makefile.in b/mpi/Makefile.in
index 4620dd66..4cf7d366 100644
--- a/mpi/Makefile.in
+++ b/mpi/Makefile.in
@@ -147,7 +147,7 @@ libfftw3@PREC_SUFFIX@_mpi_la_DEPENDENCIES =  \
 am__objects_1 = any-true.lo api.lo block.lo choose-radix.lo conf.lo \
 	dtensor.lo rearrange.lo wisdom-api.lo f03-wrap.lo
 am__objects_2 = transpose-alltoall.lo transpose-pairwise.lo \
-	transpose-recurse.lo transpose-pairwise-omc.lo transpose-problem.lo transpose-solve.lo
+	transpose-recurse.lo transpose-pairwise-omc.lo transpose-blk-scheme1.lo transpose-problem.lo transpose-solve.lo
 am__objects_3 = dft-serial.lo dft-rank-geq2.lo \
 	dft-rank-geq2-transposed.lo dft-rank1.lo dft-rank1-bigvec.lo \
 	dft-problem.lo dft-solve.lo
@@ -410,7 +410,7 @@ AM_CPPFLAGS = -I $(top_srcdir) -I $(top_srcdir)/api
 EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in fftw3l-mpi.f03.in
 BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
 CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
-TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-pairwise-omc.c transpose-problem.c transpose-solve.c mpi-transpose.h
+TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-pairwise-omc.c transpose-blk-scheme1.c transpose-problem.c transpose-solve.c mpi-transpose.h
 DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
 RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
 RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
@@ -554,6 +554,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-problem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-recurse.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-pairwise-omc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-blk-scheme1.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/transpose-solve.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wisdom-api.Plo@am__quote@
 
diff --git a/mpi/conf.c b/mpi/conf.c
index 7607b702..2333621b 100644
--- a/mpi/conf.c
+++ b/mpi/conf.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -32,6 +32,9 @@ static const solvtab s =
      SOLVTAB(XM(transpose_recurse_register)),
 #ifdef AMD_OPT_MPIFFT_OVERLAP_MEMCPY_MPICOMM
      SOLVTAB(XM(transpose_pairwise_omc_register)),
+#endif
+#ifdef AMD_OPT_MPIFFT_FAST_BLK_BASED_TRANSPOSE
+     SOLVTAB(XM(transpose_blk_based_scheme1_register)),
 #endif
      SOLVTAB(XM(dft_rank_geq2_register)),
      SOLVTAB(XM(dft_rank_geq2_transposed_register)),
diff --git a/mpi/mpi-transpose.h b/mpi/mpi-transpose.h
index e9525352..cdfb9323 100644
--- a/mpi/mpi-transpose.h
+++ b/mpi/mpi-transpose.h
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -58,6 +58,7 @@ int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
 			      INT *rest_Ioff, INT *rest_Ooff);
 /* various solvers */
 void XM(transpose_pairwise_omc_register)(planner *p);
+void XM(transpose_blk_based_scheme1_register)(planner *p);
 void XM(transpose_pairwise_register)(planner *p);
 void XM(transpose_alltoall_register)(planner *p);
 void XM(transpose_recurse_register)(planner *p);
diff --git a/mpi/transpose-blk-scheme1.c b/mpi/transpose-blk-scheme1.c
new file mode 100644
index 00000000..6712239c
--- /dev/null
+++ b/mpi/transpose-blk-scheme1.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2021, Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Distributed transpose that performs block based send receive operations
+   without requiring memcpy operation and without any additional memory. */
+
+#include "mpi-transpose.h"
+#include <string.h>
+#include <stdio.h>
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+     INT rest_Ioff, rest_Ooff;
+     
+     int n_pes, my_pe, *sched;
+     INT *send_block_sizes, *send_block_offsets;
+     INT *recv_block_sizes, *recv_block_offsets;
+     MPI_Comm comm;
+     int preserve_input;
+} P;
+
+static void transpose_chunks(int *sched, int n_pes, int my_pe,
+                             INT *sbs, INT *sbo, INT *rbs, INT *rbo,
+                             MPI_Comm comm,
+                             R *I, R *O)
+{
+	int i;
+	MPI_Status status;
+	if (!sched)
+		return;
+
+	if (I == O) 
+	{
+		int other, size, bufSize_s, blksize_s, totSize_s, bufSize_r, blksize_r, totSize_r;
+		R *sendbuf, *recvbuf;
+		MPI_Status status;
+		struct timeval t1,t2;
+
+		//n_pes is the no. of ranks tp paralelly communicate -> blocksize
+		//blksize = VADER_LIMIT;//(n_pes*VADER_LIMIT)/(bufSize*sizeof(R));;
+		//blksize = bufSize > blksize ? blksize : bufSize;
+
+		for (i=0; i<n_pes; i++) 
+		{
+			int other = sched[i];
+			if(my_pe != other)
+			{
+				bufSize_s = sbs[other];
+				bufSize_r = rbs[other];
+				blksize_s = bufSize_s > VADER_LIMIT ? VADER_LIMIT : bufSize_s;
+				blksize_r = bufSize_r > VADER_LIMIT ? VADER_LIMIT : bufSize_r;
+
+				for (totSize_s = 0; blksize_s && ((bufSize_s - totSize_s) >= blksize_s) ; totSize_s += blksize_s)
+				{
+					MPI_Send(I+sbo[other]+totSize_s,blksize_s,FFTW_MPI_TYPE,other,0,comm);
+
+				}
+
+				for (totSize_r = 0; blksize_r && ((bufSize_r - totSize_r) >= blksize_r) ; totSize_r += blksize_r)
+				{
+					MPI_Recv(I+rbo[other]+totSize_r,blksize_r,FFTW_MPI_TYPE,other,0,comm,&status);
+				}
+
+				if (totSize_s < bufSize_s)
+				{
+					blksize_s = bufSize_s - totSize_s;
+					MPI_Send(I+sbo[other]+totSize_s,blksize_s,FFTW_MPI_TYPE,other,0,comm);
+
+				}
+				if (totSize_r < bufSize_r)
+				{
+					blksize_r = bufSize_r - totSize_r;
+					MPI_Recv(I+rbo[other]+totSize_r,blksize_r,FFTW_MPI_TYPE,other,0,comm,&status);
+
+				}
+			}
+			else {
+				if (rbo[other] != sbo[other])
+					memmove(O + rbo[other], O + sbo[other],
+							sbs[other] * sizeof(R));
+			}				
+		}
+	}
+	else { /* I != O */
+		for (i = 0; i < n_pes; ++i) {
+			int pe = sched[i];
+			if (my_pe == pe)
+				memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
+			else
+				MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
+						FFTW_MPI_TYPE,
+						pe, (my_pe * n_pes + pe) & 0xffff,
+						O + rbo[pe], (int) (rbs[pe]),
+						FFTW_MPI_TYPE,
+						pe, (pe * n_pes + my_pe) & 0xffff,
+						comm, &status);
+		}
+	}
+}
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cld2rest, *cld3;
+
+     /* transpose locally to get contiguous chunks */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) {
+	  cld1->apply(ego->cld1, I, O);
+	  
+	  if (ego->preserve_input) I = O;
+
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, O, I);
+     }
+     else if (ego->preserve_input) {
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, I, O);
+
+	  I = O;
+     }
+     else {
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, I, I);
+     }
+
+     /* transpose locally, again, to get ordinary row-major;
+	this may take two transposes if the block sizes are unequal
+	(3 subplans, two of which operate on disjoint data) */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+     cld2rest = (plan_rdft *) ego->cld2rest;
+     if (cld2rest) {
+	  cld2rest->apply(ego->cld2rest,
+			  I + ego->rest_Ioff, O + ego->rest_Ooff);
+	  cld3 = (plan_rdft *) ego->cld3;
+	  if (cld3)
+	       cld3->apply(ego->cld3, O, O);
+	  /* else TRANSPOSED_OUT is true and user wants O transposed */
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     /* Note: this is *not* UGLY for out-of-place, destroy-input plans;
+	the planner often prefers transpose-pairwise schemes to transpose-alltoall,
+	at least with LAM MPI on my machine. */
+     return (1
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && ONLY_TRANSPOSEDP(p->flags));
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->sched);
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-blk-scheme1%s%(%p%)%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+/* Given a process which_pe and a number of processes npes, fills
+   the array sched[npes] with a sequence of processes to communicate
+   with for a deadlock-free, optimum-overlap all-to-all communication.
+   (All processes must call this routine to get their own schedules.)
+   The schedule can be re-ordered arbitrarily as long as all processes
+   apply the same permutation to their schedules.
+
+   The algorithm here is based upon the one described in:
+       J. A. M. Schreuder, "Constructing timetables for sport
+       competitions," Mathematical Programming Study 13, pp. 58-67 (1980). 
+   In a sport competition, you have N teams and want every team to
+   play every other team in as short a time as possible (maximum overlap
+   between games).  This timetabling problem is therefore identical
+   to that of an all-to-all communications problem.  In our case, there
+   is one wrinkle: as part of the schedule, the process must do
+   some data transfer with itself (local data movement), analogous
+   to a requirement that each team "play itself" in addition to other
+   teams.  With this wrinkle, it turns out that an optimal timetable
+   (N parallel games) can be constructed for any N, not just for even
+   N as in the original problem described by Schreuder.
+*/
+static void fill1_comm_sched(int *sched, int which_pe, int npes)
+{
+     int pe, i, n, s = 0;
+     A(which_pe >= 0 && which_pe < npes);
+     if (npes % 2 == 0) {
+	  n = npes;
+	  sched[s++] = which_pe;
+     }
+     else
+	  n = npes + 1;
+     for (pe = 0; pe < n - 1; ++pe) {
+	  if (npes % 2 == 0) {
+	       if (pe == which_pe) sched[s++] = npes - 1;
+	       else if (npes - 1 == which_pe) sched[s++] = pe;
+	  }
+	  else if (pe == which_pe) sched[s++] = pe;
+
+	  if (pe != which_pe && which_pe < n - 1) {
+	       i = (pe - which_pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe + i) % (n - 1);
+	       
+	       i = (which_pe - pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
+	  }
+     }
+     A(s == npes);
+}
+
+/* Sort the communication schedule sched for npes so that the schedule
+   on process sortpe is ascending or descending (!ascending).  This is
+   necessary to allow in-place transposes when the problem does not
+   divide equally among the processes.  In this case there is one
+   process where the incoming blocks are bigger/smaller than the
+   outgoing blocks and thus have to be received in
+   descending/ascending order, respectively, to avoid overwriting data
+   before it is sent. */
+static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
+{
+     int *sortsched, i;
+     sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
+     fill1_comm_sched(sortsched, sortpe, npes);
+     if (ascending)
+	  for (i = 0; i < npes; ++i)
+	       sortsched[npes + sortsched[i]] = sched[i];
+     else
+	  for (i = 0; i < npes; ++i)
+	       sortsched[2*npes - 1 - sortsched[i]] = sched[i];
+     for (i = 0; i < npes; ++i)
+	  sched[i] = sortsched[npes + i];
+     X(ifree)(sortsched);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     INT *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
+     R *I, *O;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+     I = p->I; O = p->O;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     b = XM(block)(p->nx, p->block, my_pe);
+     
+     if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */
+	  cld1 = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (b, p->ny * vn, vn,
+						      p->ny, vn, b * vn,
+						      vn, 1, 1),
+						     I, O),
+			       0, 0, NO_SLOW);
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
+
+     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
+						 &cld2, &cld2rest, &cld3,
+						 &rest_Ioff, &rest_Ooff),
+		      p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
+			   XM(num_blocks)(p->ny, p->tblock));
+
+     /* Compute sizes/offsets of blocks to exchange between processors */
+     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT db, dbt; /* destination block sizes */
+	  db = XM(block)(p->nx, p->block, pe);
+	  dbt = XM(block)(p->ny, p->tblock, pe);
+
+	  sbs[pe] = b * dbt * vn;
+	  sbo[pe] = pe * (b * p->tblock) * vn;
+	  rbs[pe] = db * bt * vn;
+	  rbo[pe] = pe * (p->block * bt) * vn;
+
+	  if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
+	       A(sort_pe == -1); /* only one process should need sorting */
+	       sort_pe = pe;
+	       ascending = db * p->tblock > p->block * dbt;
+	  }
+     }
+     pln->n_pes = n_pes;
+     pln->my_pe = my_pe;
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+
+     if (my_pe >= n_pes) {
+	  pln->sched = 0; /* this process is not doing anything */
+     }
+     else {
+	  pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
+	  fill1_comm_sched(pln->sched, my_pe, n_pes);
+	  if (sort_pe >= 0)
+	       sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
+     }
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(transpose_blk_based_scheme1_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff --git a/rdft/vrank3-transpose.c b/rdft/vrank3-transpose.c
index 62d7d479..2a1a5e9d 100644
--- a/rdft/vrank3-transpose.c
+++ b/rdft/vrank3-transpose.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,6 +25,10 @@
 
 #include "rdft/rdft.h"
 
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+#include "immintrin.h"
+#endif
+
 #ifdef HAVE_STRING_H
 #include <string.h>		/* for memcpy() */
 #endif
@@ -520,6 +525,11 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
      R *b, *c, *d;
      INT ncount;
      INT k;
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+     INT mod16 = (N & 0xF), mod8 = (mod16 & 0x7);
+     INT ii, N_16mod = N - mod16, N_8mod = mod16 - mod8, N_4mod = mod8 - (mod8 & 0x3);
+     __m256d in1, in2, in3, in4, in5, in6, in7, in8;
+#endif
      
      /* check arguments and initialize: */
      A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
@@ -543,8 +553,14 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
      im = ny;
      
      while (1) {
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+	  INT i1, i2, i1c, i2c, i1_2;
+	  INT kmi;
+	  INT N_i1, N_i1c, N_i2, N_i2c;
+#else
 	  INT i1, i2, i1c, i2c;
 	  INT kmi;
+#endif
 	  
 	  /** Rearrange the elements of a loop
 	      and its companion loop: **/
@@ -552,6 +568,9 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
 	  i1 = i;
 	  kmi = k - i;
 	  i1c = kmi;
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+	  i1_2 = i1 / nx;
+#endif
 	  switch (N) {
 	      case 1:
 		   b[0] = a[i1];
@@ -564,12 +583,73 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
 		   c[1] = a[2*i1c+1];
 		   break;
 	      default:
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+		   N_i1 = N * i1;
+		   N_i1c = N * i1c;
+		   ii = 0;
+		   for (; ii < N_16mod; ii += 16)
+		   {
+			   in1 = _mm256_loadu_pd((double const *)&a[N_i1 + ii]);
+			   in2 = _mm256_loadu_pd((double const *)&a[N_i1 + ii + 4]);
+			   in3 = _mm256_loadu_pd((double const *)&a[N_i1 + ii + 8]);
+			   in4 = _mm256_loadu_pd((double const *)&a[N_i1 + ii + 12]);
+			   in5 = _mm256_loadu_pd((double const *)&a[N_i1c + ii]);
+			   in6 = _mm256_loadu_pd((double const *)&a[N_i1c + ii + 4]);
+			   in7 = _mm256_loadu_pd((double const *)&a[N_i1c + ii + 8]);
+			   in8 = _mm256_loadu_pd((double const *)&a[N_i1c + ii + 12]);
+			   
+			   _mm256_storeu_pd((double *)&b[ii], in1);
+			   _mm256_storeu_pd((double *)&b[ii+4], in2);
+			   _mm256_storeu_pd((double *)&b[ii+8], in3);
+			   _mm256_storeu_pd((double *)&b[ii+12], in4);
+			   _mm256_storeu_pd((double *)&c[ii], in5);
+			   _mm256_storeu_pd((double *)&c[ii+4], in6);
+			   _mm256_storeu_pd((double *)&c[ii+8], in7);
+			   _mm256_storeu_pd((double *)&c[ii+12], in8);
+		   }
+		   
+		   if (N_8mod)
+		   {
+			   in1 = _mm256_loadu_pd((double const *)&a[N_i1 + ii]);
+			   in2 = _mm256_loadu_pd((double const *)&a[N_i1 + ii + 4]);
+			   in5 = _mm256_loadu_pd((double const *)&a[N_i1c + ii]);
+			   in6 = _mm256_loadu_pd((double const *)&a[N_i1c + ii + 4]);
+			   
+			   _mm256_storeu_pd((double *)&b[ii], in1);
+			   _mm256_storeu_pd((double *)&b[ii+4], in2);
+			   _mm256_storeu_pd((double *)&c[ii], in5);
+			   _mm256_storeu_pd((double *)&c[ii+4], in6);
+			    ii += 8;
+		   }
+		   
+		   if (N_4mod)
+		   {
+			   in1 = _mm256_loadu_pd((double const *)&a[N_i1 + ii]);
+			   in5 = _mm256_loadu_pd((double const *)&a[N_i1c + ii]);
+			   
+			   _mm256_storeu_pd((double *)&b[ii], in1);
+			   _mm256_storeu_pd((double *)&c[ii], in5);
+			    ii += 4;
+		   }
+		   
+		   for (; ii < N; ii++)
+		   {
+			   b[ii] = a[N_i1 + ii];
+			   c[ii] = a[N_i1c + ii];
+		   }
+#else
 		   memcpy(b, &a[N * i1], N * sizeof(R));
 		   memcpy(c, &a[N * i1c], N * sizeof(R));
+#endif
 	  }
+	  
 	  while (1) {
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+	       i2 = ny * i1 - k * (i1_2);
+#else
 	       i2 = ny * i1 - k * (i1 / nx);
 	       i2c = k - i2;
+#endif
 	       if (i1 < move_size)
 		    move[i1] = 1;
 	       if (i1c < move_size)
@@ -583,6 +663,10 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
 		    c = d;
 		    break;
 	       }
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+	       i1_2 = i2 / nx;
+	       i2c = k - i2;
+#endif
 	       switch (N) {
 		   case 1:
 			a[i1] = a[i2];
@@ -595,10 +679,68 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
 			a[2*i1c+1] = a[2*i2c+1];
 			break;
 		   default:
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+			N_i1 = N * i1;
+			N_i1c = N * i1c;
+			N_i2 = N * i2;
+			N_i2c = N * i2c;
+			ii = 0;
+			for (; ii < N_16mod; ii += 16)
+			{
+				in1 = _mm256_loadu_pd((double const *)&a[N_i2 + ii]);
+				in2 = _mm256_loadu_pd((double const *)&a[N_i2 + ii + 4]);
+				in3 = _mm256_loadu_pd((double const *)&a[N_i2 + ii + 8]);
+				in4 = _mm256_loadu_pd((double const *)&a[N_i2 + ii + 12]);
+				in5 = _mm256_loadu_pd((double const *)&a[N_i2c + ii]);
+				in6 = _mm256_loadu_pd((double const *)&a[N_i2c + ii + 4]);
+				in7 = _mm256_loadu_pd((double const *)&a[N_i2c + ii + 8]);
+				in8 = _mm256_loadu_pd((double const *)&a[N_i2c + ii + 12]);
+
+				_mm256_storeu_pd((double *)&a[N_i1 + ii], in1);
+				_mm256_storeu_pd((double *)&a[N_i1 + ii+4], in2);
+				_mm256_storeu_pd((double *)&a[N_i1 + ii+8], in3);
+				_mm256_storeu_pd((double *)&a[N_i1 + ii+12], in4);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii], in5);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii+4], in6);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii+8], in7);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii+12], in8);
+			}
+
+			if (N_8mod)
+			{
+				in1 = _mm256_loadu_pd((double const *)&a[N_i2 + ii]);
+				in2 = _mm256_loadu_pd((double const *)&a[N_i2 + ii + 4]);
+				in5 = _mm256_loadu_pd((double const *)&a[N_i2c + ii]);
+				in6 = _mm256_loadu_pd((double const *)&a[N_i2c + ii + 4]);
+
+				_mm256_storeu_pd((double *)&a[N_i1 + ii], in1);
+				_mm256_storeu_pd((double *)&a[N_i1 + ii+4], in2);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii], in5);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii+4], in6);
+				ii += 8;
+			}
+
+			if (N_4mod)
+			{
+				in1 = _mm256_loadu_pd((double const *)&a[N_i2 + ii]);
+				in5 = _mm256_loadu_pd((double const *)&a[N_i2c + ii]);
+
+				_mm256_storeu_pd((double *)&a[N_i1 + ii], in1);
+				_mm256_storeu_pd((double *)&a[N_i1c + ii], in5);
+				ii += 4;
+			}
+
+			for (; ii < N; ii++)
+			{
+				a[N_i1 + ii] = a[N_i2 + ii];
+				a[N_i1c + ii] = a[N_i2c + ii];
+			}
+#else
 			memcpy(&a[N * i1], &a[N * i2], 
 			       N * sizeof(R));
 			memcpy(&a[N * i1c], &a[N * i2c], 
 			       N * sizeof(R));
+#endif
 	       }
 	       i1 = i2;
 	       i1c = i2c;
@@ -615,8 +757,64 @@ static void transpose_toms513(R *a, INT nx, INT ny, INT N,
 		   a[2*i1c+1] = c[1];
 		   break;
 	      default:
+#if defined(AMD_OPT_TOMS513_TRANS) && (!defined(FFTW_LDOUBLE) && !defined(FFTW_QUAD) && !defined(FFTW_SINGLE))
+		  N_i1 = N * i1;
+		  N_i1c = N * i1c;
+		  ii = 0;
+		  for (; ii < N_16mod; ii += 16)
+		  {
+			   in1 = _mm256_loadu_pd((double const *)&b[ii]);
+			   in2 = _mm256_loadu_pd((double const *)&b[ii+4]);
+			   in3 = _mm256_loadu_pd((double const *)&b[ii+8]);
+			   in4 = _mm256_loadu_pd((double const *)&b[ii+12]);
+			   in5 = _mm256_loadu_pd((double const *)&c[ii]);
+			   in6 = _mm256_loadu_pd((double const *)&c[ii+4]);
+			   in7 = _mm256_loadu_pd((double const *)&c[ii+8]);
+			   in8 = _mm256_loadu_pd((double const *)&c[ii+12]);
+
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii], in1);
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii + 4], in2);
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii + 8], in3);
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii + 12], in4);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii], in5);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii + 4], in6);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii + 8], in7);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii + 12], in8);
+		  }
+
+		  if (N_8mod)
+		  {
+			   in1 = _mm256_loadu_pd((double const *)&b[ii]);
+			   in2 = _mm256_loadu_pd((double const *)&b[ii+4]);
+			   in5 = _mm256_loadu_pd((double const *)&c[ii]);
+			   in6 = _mm256_loadu_pd((double const *)&c[ii+4]);
+
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii], in1);
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii + 4], in2);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii], in5);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii + 4], in6);
+			    ii += 8;
+		  }
+
+		  if (N_4mod)
+		  {
+			   in1 = _mm256_loadu_pd((double const *)&b[ii]);
+			   in5 = _mm256_loadu_pd((double const *)&c[ii]);
+
+			   _mm256_storeu_pd((double *)&a[N_i1 + ii], in1);
+			   _mm256_storeu_pd((double *)&a[N_i1c + ii], in5);
+			    ii += 4;
+		  }
+		  
+		  for (; ii < N; ii++)
+		  {
+			   a[N_i1 + ii] = b[ii];
+			   a[N_i1c + ii] = c[ii];
+		  }
+#else
 		   memcpy(&a[N * i1], b, N * sizeof(R));
 		   memcpy(&a[N * i1c], c, N * sizeof(R));
+#endif		   
 	  }
 	  if (ncount >= mn)
 	       break;	/* we've moved all elements */
@@ -652,8 +850,11 @@ static void apply_toms513(const plan *ego_, R *I, R *O)
      INT n = ego->n, m = ego->m;
      INT vl = ego->vl;
      R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
+	 //R *buf = (R *)MALLOC((sizeof(R) * ego->nbuf)+32, BUFFERS);
+	 //R *buf_aligned = (R *)((ptrdiff_t)buf + (32 - ((ptrdiff_t)buf & 0x1F)));
      UNUSED(O);
      transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
+	 //transpose_toms513(I, n, m, vl, (char *) (buf_aligned + 2*vl), (n+m)/2, buf_aligned);
      X(ifree)(buf);
 }
 

From a1fb3123b3e8aabb763a3e20548354cda1778072 Mon Sep 17 00:00:00 2001
From: Madhusudhan S <madhusudhan.s@amd.com>
Date: Thu, 10 Jun 2021 14:55:36 +0530
Subject: [PATCH 5/6] This code change fixes the linker error with AOCC when
 Top-N planner is enabled.

This patch will fix the linker error that was coming due
to linking multiple files with global variables used by
Top-N planner.

Change-Id: I14f0f92b97d0903909883427ca70ca3f3d011e22
---
 api/apiplan.c    | 11 ++++++-----
 kernel/ifftw.h   |  3 +--
 kernel/planner.c |  5 ++++-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/api/apiplan.c b/api/apiplan.c
index 99921b76..eb8d477d 100644
--- a/api/apiplan.c
+++ b/api/apiplan.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,17 +33,17 @@ void X(set_planner_hooks)(planner_hook_t before, planner_hook_t after)
 plan *plans[AMD_OPT_TOP_N];
 static int find_lowcost_plan()
 {
-    int i, lowcost, lowcost_idx;
+    int i, lowcost, lowcost_id;
     lowcost = plans[0]->pcost;
-    lowcost_idx = 0;
+    lowcost_id = 0;
 
     for (i = 1; i < AMD_OPT_TOP_N; i++) {
          if (plans[i]->pcost < lowcost) {
               lowcost = plans[i]->pcost;
-              lowcost_idx = i;
+              lowcost_id = i;
          }
     }
-    return lowcost_idx;
+    return lowcost_id;
 }
 #endif
 
@@ -52,6 +52,7 @@ static plan *mkplan0(planner *plnr, unsigned flags,
 		     wisdom_state_t wisdom_state)
 {
 #ifdef AMD_TOP_N_PLANNER
+     static int lowcost_idx;	/* to hold the index of the plan which has the least pcost among the top N plans*/     
 /* map API flags into FFTW flags */
      X(mapflags)(plnr, flags);
 
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index 9b30f692..89b38dd4 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -864,8 +864,7 @@ struct planner_s {
 };
 
 #ifdef AMD_TOP_N_PLANNER
-     int wisp_set;     /* flag to identify if the plans for an input problem size is found in the wisdom file or not*/
-     int lowcost_idx;  /* to hold the index of the plan which has the least pcost among the top N plans*/
+     extern int wisp_set;     /* flag to identify if the plans for an input problem size is found in the wisdom file or not*/
 #endif
 
 planner *X(mkplanner)(void);
diff --git a/kernel/planner.c b/kernel/planner.c
index cf90cbd6..06ba7fb7 100644
--- a/kernel/planner.c
+++ b/kernel/planner.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2000 Matteo Frigo
  * Copyright (c) 2000 Massachusetts Institute of Technology
- * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,6 +24,9 @@
 #include "dft/dft.h"
 #endif
 #include <string.h>
+#ifdef AMD_TOP_N_PLANNER
+int wisp_set; //Referring to extern variable declared in kernel/ifftw.h
+#endif
 
 /* GNU Coding Standards, Sec. 5.2: "Please write the comments in a GNU
    program in English, because English is the one language that nearly

From 7147670ce6be8c189f1d84a95491f0677c14be06 Mon Sep 17 00:00:00 2001
From: sraut <Biplab.Raut@amd.com>
Date: Fri, 18 Jun 2021 21:21:42 +0530
Subject: [PATCH 6/6] This code change provides a set of fixes for specific
 build errors observed with GCC and AOCC compilers.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1) GCC10 and AOCC3.0 treat global variables without extern as errors.
   Both compilers would throw "Multiple definitions" error for L1D_blk_size
   and L1Dsize variables when configured with "--enable-avx” SIMD option along
   with the option “--enable-amd-trans”. This has been now fixed.
2) Code fix provided for the "Undefined reference" error to ‘cpuid_all’ when
   configured with "--enable-sse2 --enable-avx2” SIMD options along with the
   option “—enable-amd-trans”.
3) Code fix provided for "Undeclared variables" errors thrown for ‘ALIGNMENT’ and
   ‘ALIGNMENTA’ when configured with “--enable-amd-fast-planner” without any SIMD
   configure options.

Change-Id: I8b5408d72c1bb74a000ee6fdc95d1ea87d4baba3
---
 dft/conf.c                 |  4 ++--
 kernel/ifftw.h             | 17 +++++++++++++++--
 kernel/tile2d.c            |  6 +++++-
 kernel/transpose.c         |  4 ++++
 simd-support/amd64-cpuid.h |  4 ++++
 simd-support/avx.c         |  9 +++++++--
 simd-support/sse2.c        |  5 ++++-
 7 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/dft/conf.c b/dft/conf.c
index 3ed498b7..97ec0e9c 100644
--- a/dft/conf.c
+++ b/dft/conf.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -90,7 +90,7 @@ void X(dft_conf_standard)(planner *p)
 #if HAVE_GENERIC_SIMD256
      X(solvtab_exec)(X(solvtab_dft_generic_simd256), p);
 #endif
-#ifdef AMD_OPT_AUTO_TUNED_TRANS_BLK_SIZE
+#ifdef AMD_OPT_TRANS
      X(enquire_L1DcacheSize)();
 #endif
 }
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index 89b38dd4..098bb4a4 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -107,10 +107,16 @@ extern "C"
 #if defined(HAVE_MPI) || defined(HAVE_OPENMP)
 #undef AMD_OPT_TRANS
 #endif
+#if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
+    defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
+    defined(HAVE_AVX2) || defined(HAVE_AVX512) 
 #ifdef AMD_OPT_TRANS
 #define AMD_OPT_AUTO_TUNED_TRANS_BLK_SIZE
 #define AMD_OPT_AUTO_TUNED_RASTER_TILED_TRANS_METHOD
 #endif
+#else
+#undef AMD_OPT_TRANS
+#endif
 //Here they are again provided for manual override to enable them.
 //(i) enables auto-tuned block sized tiling as per CPU's L1D cache size (applicable for both original 
 //    FFTW's transpose and the new auto-tuned cache-efficient raster order tiled transpose
@@ -148,10 +154,17 @@ extern "C"
 //UNBLESSED HASH table is kept alive till the process/thread life like the BLESSED HASH table.
 //Since UNBLESSED HASH table keeps growing, so it is cleared smartly beyond a MAX SIZE by swapping with BLESSED table.
 #ifdef AMD_OPT_FAST_PLANNER
+
+#if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
+    defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
+    defined(HAVE_AVX2) || defined(HAVE_AVX512) 
+
 #define AMD_FAST_PLANNER
 #define AMD_FAST_PLANNING_HASH_V1
 //#define AMD_FAST_PLANNING_HASH_V2
 #define AMD_HASH_UNBLESS_MAX_SIZE 10485760
+
+#endif
 #endif
 //--------------------------------
 //NEW TOP N PLANNER feature for AMD CPUs can be enabled with the below switch AMD_TOP_N_PLANNER.
@@ -1068,8 +1081,8 @@ void X(rader_tl_delete)(R *W, rader_tl **tl);
 /* upper bound to the cache size based on latest CPU architectures, for AMD optimized tiled routines */
 #define CACHESIZE 32768
 #define BLK_SIZE 32
-unsigned int L1D_blk_size;// = CACHESIZE;
-unsigned int L1Dsize;// = BLK_SIZE;
+extern unsigned int L1D_blk_size;// = CACHESIZE;
+extern unsigned int L1Dsize;// = BLK_SIZE;
 #else
 /* lower bound to the cache size, for tiled routines */
 #define CACHESIZE 8192
diff --git a/kernel/tile2d.c b/kernel/tile2d.c
index e99acc3d..2f962281 100644
--- a/kernel/tile2d.c
+++ b/kernel/tile2d.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,6 +22,10 @@
 /* out of place 2D copy routines */
 #include "kernel/ifftw.h"
 
+#if defined(AMD_OPT_AUTO_TUNED_TRANS_BLK_SIZE)
+unsigned int L1Dsize;// = BLK_SIZE;
+#endif
+
 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
 	       void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
 	       void *args)
diff --git a/kernel/transpose.c b/kernel/transpose.c
index ca83fd73..39732546 100644
--- a/kernel/transpose.c
+++ b/kernel/transpose.c
@@ -25,6 +25,10 @@
 #include "immintrin.h"
 #endif
 
+#if defined(AMD_OPT_AUTO_TUNED_RASTER_TILED_TRANS_METHOD)
+unsigned int L1D_blk_size;// = CACHESIZE;
+#endif
+
 /* in place square transposition, iterative */
 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
 {
diff --git a/simd-support/amd64-cpuid.h b/simd-support/amd64-cpuid.h
index 9b91f497..acbc8288 100644
--- a/simd-support/amd64-cpuid.h
+++ b/simd-support/amd64-cpuid.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,6 +19,8 @@
  *
  */
 
+#ifndef _AMD64_CPUID_H
+#define _AMD64_CPUID_H
 
 #ifdef _MSC_VER
 #ifndef inline
@@ -146,3 +149,4 @@ static inline int xgetbv_eax(int op)
      return eax;
 #endif
 }
+#endif
diff --git a/simd-support/avx.c b/simd-support/avx.c
index 6e57b71b..7e96ee07 100644
--- a/simd-support/avx.c
+++ b/simd-support/avx.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- * Copyright (C) 2019, Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2019-2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -53,7 +53,12 @@ int X(have_simd_avx)(void)
 
 #endif
 
-#ifdef AMD_OPT_AUTO_TUNED_TRANS_BLK_SIZE
+#ifdef AMD_OPT_TRANS
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
+#    include "amd64-cpuid.h"
+#else
+#    include "x86-cpuid.h"
+#endif
 void X(enquire_L1DcacheSize) (void)
 {
 	int eax, ebx, ecx, edx;
diff --git a/simd-support/sse2.c b/simd-support/sse2.c
index c52c852e..d6cdceed 100644
--- a/simd-support/sse2.c
+++ b/simd-support/sse2.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003, 2007-14 Matteo Frigo
  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ * Copyright (C) 2021, Advanced Micro Devices, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -30,7 +31,9 @@
 #if HAVE_SSE2
 
 # if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
-
+#ifdef AMD_OPT_TRANS
+#    include "amd64-cpuid.h"
+#endif
   int X(have_simd_sse2)(void)
   {
        return 1;