Skip to content

Commit

Permalink
This change extends the applicability and functionality of AMD applic…
Browse files Browse the repository at this point in the history
…ation optimization layer.

1) The AMD application optimization layer extends its implementation support (of a separate memory region for the planner)
   for the real DFT problem types that are supported through r2c and c2r interfaces.
2) Use of the wisdom feature in combination with the application optimization layer is now disabled by default.
   This avoids corruption of the wisdom file caused by concurrent reads/writes from multiple processes of the application.
   The corruption is due to known concurrency issues in the planner; fixing it requires some kind of atomic operations for the wisdom feature.
3) README is updated with more information.

This code change relates to Jira IDs AMD-Internal : [CPUPL-1890] [CPUPL-1889]

Change-Id: Icc78a53708a1855b41f451257ef1310a9791b22e
  • Loading branch information
BiplabRaut committed Oct 4, 2021
1 parent 5647dde commit f2c126b
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 9 deletions.
14 changes: 9 additions & 5 deletions README_AMD.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ INSTALLATION FROM AMD Optimized FFTW GIT REPOSITORY:

After downloading the latest stable release from the git repository,
https://github.com/amd/amd-fftw, follow the below steps to configure and
build it for AMD EPYC processor based on Naples, Rome and future
build it for AMD EPYC processor based on Naples, Rome, Milan and future
generation architectures.

./configure --enable-sse2 --enable-avx --enable-avx2
Expand All @@ -65,17 +65,21 @@ configure option, the user needs to set --mca btl_vader_eager_limit
appropriately (current preference is 65536) in the MPIRUN command.

The new fast planner can be enabled using optional configure option
"--enable-amd-fast-planner". It is supported for single and double precisions.
"--enable-amd-fast-planner". It is supported in single and double precisions.

Top N planner mode can be enabled using optional configure option
"--enable-amd-top-n-planner" to minimize run-to-run variations in performance.
It is supported in single-threaded execution in single and double precisions.

An optional configure option "AMD_ARCH" is supported that can be set to CPU
architecture values like "auto" or "znver1" or "znver2" or "znver3" for AMD
EPYC processors.

The optional configure option "--enable-amd-app-opt" turns on AMD's application
optimization layer to benefit performance of HPC and scientific applications.
Currently it is developed for complex DFT problem types in double and single
precisions. It is not supported for MPI FFTs, real DFT problem types, Quad or
Long double precisions, and split array format.
It currently supports complex and real (r2c and c2r) DFT problem types in
double and single precision. It does not support MPI FFTs, r2r real DFT
problem types, quad or long double precision, or the split array format.

An optional configure option "--enable-amd-trans" is provided that may benefit
the performance of transpose operations in case of very large FFT problem sizes.
Expand Down
174 changes: 170 additions & 4 deletions api/apiplan.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,24 @@ typedef struct app_layer_data_s
R *io;
} app_layer_data;

/* Return the larger of the two INT values; yields b when a and b are equal. */
static inline INT imax(INT a, INT b)
{
    if (a > b)
    {
        return a;
    }
    return b;
}

/* Return the smaller of the two INT values; yields b when a and b are equal. */
static inline INT imin(INT a, INT b)
{
    if (a < b)
    {
        return a;
    }
    return b;
}

static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_layer_data *app_layer)
{
int isz, osz, inplace = 0;
INT isz, osz, inplace = 0;
int align_bytes = 0, in_alignment = 1, cur_alloc_alignment = 1, iaddr_changed = 0, oaddr_changed = 0;

*flags &= ~(FFTW_ESTIMATE | FFTW_MEASURE | FFTW_PATIENT | FFTW_EXHAUSTIVE);
*flags |= FFTW_PATIENT;
#ifdef AMD_APP_OPT_USE_WISDOM
if (wisdom_one_time_read == 0)
{
if (!X(import_wisdom_from_filename)("wis.dat"))
Expand All @@ -176,6 +187,7 @@ static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_lay
wisdom_one_time_read = 1;
#endif
}
#endif

if(prb->adt->problem_kind == PROBLEM_DFT)
{
Expand Down Expand Up @@ -274,11 +286,135 @@ static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_lay
printf("start-FFTW: (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->ri, pdft->ii, pdft->ro, pdft->io);
printf("start-FFTW: %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io);
printf("FFTW app_layer: %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io);
#endif
}
else if(prb->adt->problem_kind == PROBLEM_RDFT2) //R2C(forward) and C2R(backward) cases only
{
problem_rdft2 *pdft = (problem_rdft2 *) prb;
INT lb = 0, ub = 1, lb2 = 0, ub2 = 1, stridedLen, stridedLen2, is = 0, os = 0;
int rType = (R2HC_KINDP(pdft->kind));//1: r2c, 0: c2r
isz = 1;
osz = 1;
if (FINITE_RNK(pdft->sz->rnk))
{
int i, rnkGtr1 = 0;
for (i = 0; i < pdft->sz->rnk-1; ++i)
{
const iodim *q = pdft->sz->dims + i;
is = q->is;
os = q->os;
stridedLen = (is>>!rType) * (q->n - 1);
stridedLen2 = (os>>rType) * (q->n - 1);
lb = imin(lb, lb + stridedLen);
ub = imax(ub, ub + stridedLen);
lb2 = imin(lb2, lb2 + stridedLen2);
ub2 = imax(ub2, ub2 + stridedLen2);
rnkGtr1 = 1;
}
if (i < pdft->sz->rnk)
{
const iodim *q = pdft->sz->dims + i;
is = q->is>>1;
os = q->os>>1;
stridedLen = is * (q->n - 1);
stridedLen2 = os * (q->n - 1);
lb = imin(lb, lb + stridedLen);
ub = imax(ub, ub + stridedLen);
lb2 = imin(lb2, lb2 + stridedLen2);
ub2 = imax(ub2, ub2 + stridedLen2);
}

}
if (FINITE_RNK(pdft->vecsz->rnk))
{
for (int i = 0; i < pdft->vecsz->rnk; ++i)
{
const iodim *q = pdft->vecsz->dims + i;
stridedLen = (q->is>>!rType) * (q->n - 1);
stridedLen2 = (q->os>>rType) * (q->n - 1);
lb = imin(lb, lb + stridedLen);
ub = imax(ub, ub + stridedLen);
lb2 = imin(lb2, lb2 + stridedLen2);
ub2 = imax(ub2, ub2 + stridedLen2);
}
}
isz = ub - lb;
osz = ub2 - lb2;
#ifdef AMD_APP_LAYER_API_LOGS
printf("start-App (real): %d*%d*%d*%d\n", pdft->sz->rnk, pdft->vecsz->rnk, pdft->sz->dims->n, pdft->vecsz->dims->n);
printf("start-App (real): %x, %x; %x, %x; %d, %d\n", pdft->r0, pdft->r1, pdft->cr, pdft->ci, isz, osz);
#endif
if (R2HC_KINDP(pdft->kind))
{
isz = isz > (osz<<1) ? isz : (osz<<1);
app_layer->ri = pdft->r0;
app_layer->ii = pdft->r1;
app_layer->ro = pdft->cr;
app_layer->io = pdft->ci;
inplace = (pdft->r0 == pdft->cr);
app_layer->inPtr = (R *) malloc((isz * sizeof(R)) + sizeof(R));
if (((ptrdiff_t)(pdft->r0)) & align_bytes)
in_alignment = 0;
if (((ptrdiff_t)(app_layer->inPtr)) & align_bytes)
cur_alloc_alignment = 0;

if ((in_alignment == 0 && cur_alloc_alignment == 1) ||
(in_alignment == 1 && cur_alloc_alignment == 0))
{
iaddr_changed = 1;
}
pdft->r0 = app_layer->inPtr + iaddr_changed;
pdft->r1 = pdft->r0 + is;
if (inplace)
{
pdft->cr = pdft->r0;
pdft->ci = pdft->cr + 1;
}
}
else
{
osz = osz > (isz<<1) ? osz : (isz<<1);
app_layer->ri = pdft->cr;
app_layer->ii = pdft->ci;
app_layer->ro = pdft->r0;
app_layer->io = pdft->r1;
inplace = (pdft->r0 == pdft->cr);
if (inplace)
{
app_layer->inPtr = (R *) malloc((osz * sizeof(R)) + sizeof(R));
}
else
{
app_layer->inPtr = (R *) malloc((isz * sizeof(R) * 2) + sizeof(R));
}
if (((ptrdiff_t)(pdft->cr)) & align_bytes)
in_alignment = 0;
if (((ptrdiff_t)(app_layer->inPtr)) & align_bytes)
cur_alloc_alignment = 0;

if ((in_alignment == 0 && cur_alloc_alignment == 1) ||
(in_alignment == 1 && cur_alloc_alignment == 0))
{
iaddr_changed = 1;
}
pdft->cr = app_layer->inPtr + iaddr_changed;
pdft->ci = pdft->cr + 1;
if (inplace)
{
pdft->r0 = pdft->cr;
pdft->r1 = pdft->r0 + os;
}
}

#ifdef AMD_APP_LAYER_API_LOGS
printf("start-FFTW (real): (in-place:%d), %x, %x; %x, %x\n", inplace, pdft->r0, pdft->r1, pdft->cr, pdft->ci);
printf("start-FFTW (real): %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io);
printf("FFTW app_layer (real): %x, %x; %x, %x\n", app_layer->ri, app_layer->ii, app_layer->ro, app_layer->io);
#endif
}
else
{
fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind\n");
fprintf(stderr, "apiplan: UNSUPPORTED problem type/kind [%d]\n", prb->adt->problem_kind);
return -1;
}
return 0;
Expand All @@ -287,20 +423,21 @@ static int create_amd_app_layer(int sign, unsigned *flags, problem *prb, app_lay
static void destroy_amd_app_layer(problem *prb, app_layer_data *app_layer)
{
int inplace = 0;
problem_dft *pdft = (problem_dft *) prb;

#ifdef AMD_APP_OPT_USE_WISDOM
if (wisdom_one_time_read == 0)
{
#ifdef AMD_APP_OPT_GENERATE_WISDOM
wisdom_one_time_read = 1;
#endif
X(export_wisdom_to_filename)("wis.dat");
}
#endif

inplace = (pdft->ri == pdft->ro);
if(prb->adt->problem_kind == PROBLEM_DFT)
{
problem_dft *pdft = (problem_dft *) prb;
inplace = (pdft->ri == pdft->ro);
free(app_layer->inPtr);
pdft->ri = app_layer->ri;
pdft->ii = app_layer->ii;
Expand All @@ -319,6 +456,35 @@ static void destroy_amd_app_layer(problem *prb, app_layer_data *app_layer)
}
#ifdef AMD_APP_LAYER_API_LOGS
printf("end-App: %x, %x; %x, %x\n", pdft->ri, pdft->ii, pdft->ro, pdft->io);
#endif
}
else if(prb->adt->problem_kind == PROBLEM_RDFT2)
{
problem_rdft2 *pdft = (problem_rdft2 *) prb;
inplace = (pdft->r0 == pdft->cr);
free(app_layer->inPtr);
if (R2HC_KINDP(pdft->kind))
{
pdft->r0 = app_layer->ri;
pdft->r1 = app_layer->ii;
if (inplace)
{
pdft->cr = app_layer->ro;
pdft->ci = app_layer->io;
}
}
else
{
pdft->cr = app_layer->ri;
pdft->ci = app_layer->ii;
if (inplace)
{
pdft->r0 = app_layer->ro;
pdft->r1 = app_layer->io;
}
}
#ifdef AMD_APP_LAYER_API_LOGS
printf("end-App: %x, %x; %x, %x\n", pdft->r0, pdft->r1, pdft->cr, pdft->ci);
#endif
}
}
Expand Down
2 changes: 2 additions & 0 deletions kernel/ifftw.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ extern "C"
#ifdef AMD_APP_OPT_LAYER //AMD's application optimization layer
//Enable/disable separate memory even for output buffer in case of out-of-place FFT
//#define AMD_APP_OPT_OUT_BUFFER_MEM
//Enable this switch to use the wisdom feature in combination with the application optimization layer.
//#define AMD_APP_OPT_USE_WISDOM
//Enable this switch to generate wisdom file for the first time for the application.
//#define AMD_APP_OPT_GENERATE_WISDOM
//Debug print logs for the application optimization layer
Expand Down

0 comments on commit f2c126b

Please sign in to comment.