Skip to content

Commit

Permalink
[flang][OpenMP] Add support for target ... private
Browse files Browse the repository at this point in the history
Adds support for the `private` clause in the `target` directive. In order to support that, the `DataSharingProcessor` was also restructured in order to separate the collection of prviate symbols from their actual privatization code-gen.

The commit adds both a code-gen and an offloading tests.
  • Loading branch information
ergawy committed Mar 18, 2024
1 parent cdc03d8 commit aeed526
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 30 deletions.
10 changes: 9 additions & 1 deletion flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,20 @@ namespace omp {
void DataSharingProcessor::processStep1() {
collectSymbolsForPrivatization();
collectDefaultSymbols();
}

void DataSharingProcessor::processStep2() {
if (privatizationDone)
return;

privatize();
defaultPrivatize();
insertBarrier();

privatizationDone = true;
}

void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
void DataSharingProcessor::processStep3(mlir::Operation *op, bool isLoop) {
insPt = firOpBuilder.saveInsertionPoint();
copyLastPrivatize(op);
firOpBuilder.restoreInsertionPoint(insPt);
Expand Down
41 changes: 31 additions & 10 deletions flang/lib/Lower/OpenMP/DataSharingProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class DataSharingProcessor {
fir::FirOpBuilder &firOpBuilder;
const Fortran::parser::OmpClauseList &opClauseList;
Fortran::lower::pft::Evaluation &eval;
bool privatizationDone = false;

bool useDelayedPrivatization;
Fortran::lower::SymMap *symTable;
DelayedPrivatizationInfo delayedPrivatizationInfo;
Expand Down Expand Up @@ -90,25 +92,44 @@ class DataSharingProcessor {
eval(eval), useDelayedPrivatization(useDelayedPrivatization),
symTable(symTable) {}

// Privatisation is split into two steps.
// Step1 performs cloning of all privatisation clauses and copying for
// firstprivates. Step1 is performed at the place where process/processStep1
// Privatisation is split into 3 steps:
//
// * Step1: collects all symbols that should be privatized.
//
// * Step2: performs cloning of all privatisation clauses and copying for
// firstprivates. Step2 is performed at the place where process/processStep2
// is called. This is usually inside the Operation corresponding to the OpenMP
// construct, for looping constructs this is just before the Operation. The
// split into two steps was performed basically to be able to call
// privatisation for looping constructs before the operation is created since
// the bounds of the MLIR OpenMP operation can be privatised.
// Step2 performs the copying for lastprivates and requires knowledge of the
// MLIR operation to insert the last private update. Step2 adds
// construct, for looping constructs this is just before the Operation.
//
// * Step3: performs the copying for lastprivates and requires knowledge of
// the MLIR operation to insert the last private update. Step3 adds
// dealocation code as well.
//
// The split was performed for the following reasons:
//
// 1. Step1 was split so that the `target` op knows which symbols should not
// be mapped into the target region due to being `private`. The implicit
// mapping happens before the op body is generated so we need to to collect
// the private symbols first and then later in the body actually privatize
// them.
//
// 2. Step2 was split in order to call privatisation for looping constructs
// before the operation is created since the bounds of the MLIR OpenMP
// operation can be privatised.
void processStep1();
void processStep2(mlir::Operation *op, bool isLoop);
void processStep2();
void processStep3(mlir::Operation *op, bool isLoop);

void setLoopIV(mlir::Value iv) {
assert(!loopIV && "Loop iteration variable already set");
loopIV = iv;
}

const llvm::SetVector<const Fortran::semantics::Symbol *> &
getPrivatizedSymbols() const {
return privatizedSymbols;
}

const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const {
return delayedPrivatizationInfo;
}
Expand Down
61 changes: 42 additions & 19 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
if (!info.dsp) {
tempDsp.emplace(info.converter, *info.clauses, info.eval);
tempDsp->processStep1();
tempDsp->processStep2();
}
}

Expand Down Expand Up @@ -593,11 +594,11 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
if (privatize) {
if (!info.dsp) {
assert(tempDsp.has_value());
tempDsp->processStep2(op, isLoop);
tempDsp->processStep3(op, isLoop);
} else {
if (isLoop && regionArgs.size() > 0)
info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0]));
info.dsp->processStep2(op, isLoop);
info.dsp->processStep3(op, isLoop);
}
}
}
Expand Down Expand Up @@ -816,8 +817,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
DataSharingProcessor dsp(converter, clauseList, eval,
/*useDelayedPrivatization=*/true, &symTable);

if (privatize)
if (privatize) {
dsp.processStep1();
dsp.processStep2();
}


const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo();

Expand Down Expand Up @@ -1093,7 +1097,9 @@ static void genBodyOfTargetOp(
const llvm::SmallVector<mlir::Type> &mapSymTypes,
const llvm::SmallVector<mlir::Location> &mapSymLocs,
const llvm::SmallVector<const Fortran::semantics::Symbol *> &mapSymbols,
const mlir::Location &currentLocation) {
const mlir::Location &currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
DataSharingProcessor &dsp) {
assert(mapSymTypes.size() == mapSymLocs.size());

fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
Expand All @@ -1102,6 +1108,8 @@ static void genBodyOfTargetOp(
auto *regionBlock =
firOpBuilder.createBlock(&region, {}, mapSymTypes, mapSymLocs);

dsp.processStep2();

// Clones the `bounds` placing them inside the target region and returns them.
auto cloneBound = [&](mlir::Value bound) {
if (mlir::isMemoryEffectFree(bound.getDefiningOp())) {
Expand Down Expand Up @@ -1243,7 +1251,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
llvm::omp::Directive directive, bool outerCombined = false) {
llvm::omp::Directive directive, bool outerCombined = false,
DataSharingProcessor *dsp = nullptr) {
Fortran::lower::StatementContext stmtCtx;
mlir::Value ifClauseOperand, deviceOperand, threadLimitOperand;
mlir::UnitAttr nowaitAttr;
Expand All @@ -1262,9 +1271,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
cp.processDepend(dependTypeOperands, dependOperands);
cp.processMap(currentLocation, directive, stmtCtx, mapOperands, &mapSymTypes,
&mapSymLocs, &mapSymbols);

cp.processTODO<Fortran::parser::OmpClause::Private,
Fortran::parser::OmpClause::Firstprivate,
cp.processTODO<Fortran::parser::OmpClause::Firstprivate,
Fortran::parser::OmpClause::IsDevicePtr,
Fortran::parser::OmpClause::HasDeviceAddr,
Fortran::parser::OmpClause::InReduction,
Expand All @@ -1273,6 +1280,10 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
Fortran::parser::OmpClause::Defaultmap>(
currentLocation, llvm::omp::Directive::OMPD_target);

DataSharingProcessor localDSP(converter, clauseList, eval);
DataSharingProcessor &actualDSP = dsp ? *dsp : localDSP;
actualDSP.processStep1();

// Process host-only clauses.
if (!llvm::cast<mlir::omp::OffloadModuleInterface>(*converter.getModuleOp())
.getIsTargetDevice())
Expand All @@ -1283,9 +1294,14 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,

// 5.8.1 Implicit Data-Mapping Attribute Rules
// The following code follows the implicit data-mapping rules to map all the
// symbols used inside the region that have not been explicitly mapped using
// the map clause.
// symbols used inside the region that do not have explicit data-environment
// attribute clauses (neither data-sharing; e.g. `private`, nor `map`
// clauses).
auto captureImplicitMap = [&](const Fortran::semantics::Symbol &sym) {
if (actualDSP.getPrivatizedSymbols().contains(&sym)) {
return;
}

if (llvm::find(mapSymbols, &sym) == mapSymbols.end()) {
mlir::Value baseOp = converter.getSymbolAddress(sym);
if (!baseOp)
Expand Down Expand Up @@ -1383,7 +1399,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
/*teams_thread_limit=*/nullptr, /*num_threads=*/nullptr);

genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSymTypes,
mapSymLocs, mapSymbols, currentLocation);
mapSymLocs, mapSymbols, currentLocation, clauseList,
actualDSP);

return targetOp;
}
Expand Down Expand Up @@ -1459,7 +1476,8 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
bool outerCombined = false) {
bool outerCombined = false,
DataSharingProcessor *dsp = nullptr) {
// TODO Process clauses
// ClauseProcessor cp(converter, clauseList);
// cp.processAllocate(allocatorOperands, allocateOperands);
Expand All @@ -1469,7 +1487,8 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter,
OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setOuterCombined(outerCombined)
.setClauses(&clauseList),
.setClauses(&clauseList)
.setDataSharingProcessor(dsp),
/*dist_schedule_static=*/nullptr,
/*chunk_size=*/nullptr,
/*allocate_vars=*/mlir::ValueRange(),
Expand Down Expand Up @@ -1780,6 +1799,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
DataSharingProcessor dsp(converter, loopOpClauseList, eval);
dsp.processStep1();
dsp.processStep2();

Fortran::lower::StatementContext stmtCtx;
mlir::Value scheduleChunkClauseOperand, ifClauseOperand;
Expand Down Expand Up @@ -1836,10 +1856,10 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
llvm::omp::Directive ompDirective,
const Fortran::parser::OmpClauseList &beginClauseList,
const Fortran::parser::OmpClauseList *endClauseList,
mlir::Location loc) {
mlir::Location loc, DataSharingProcessor &dsp) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
DataSharingProcessor dsp(converter, beginClauseList, eval);
dsp.processStep1();
dsp.processStep2();

Fortran::lower::StatementContext stmtCtx;
mlir::Value scheduleChunkClauseOperand;
Expand Down Expand Up @@ -1962,8 +1982,9 @@ static void createSimdWsLoop(
// When support for vectorization is enabled, then we need to add handling of
// if clause. Currently if clause can be skipped because we always assume
// SIMD length = 1.
DataSharingProcessor dsp(converter, beginClauseList, eval);
createWsLoop(converter, semaCtx, eval, ompDirective, beginClauseList,
endClauseList, loc);
endClauseList, loc, dsp);
}

static void genOMP(Fortran::lower::AbstractConverter &converter,
Expand Down Expand Up @@ -1992,6 +2013,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
}();

bool validDirective = false;
DataSharingProcessor dsp(converter, loopOpClauseList, eval);

if (llvm::omp::topTaskloopSet.test(ompDirective)) {
validDirective = true;
TODO(currentLocation, "Taskloop construct");
Expand All @@ -2002,7 +2025,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
validDirective = true;
genTargetOp(converter, semaCtx, eval, /*genNested=*/false,
currentLocation, loopOpClauseList, ompDirective,
/*outerCombined=*/true);
/*outerCombined=*/true, &dsp);
}
if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
Expand All @@ -2014,7 +2037,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
validDirective = true;
bool outerCombined = llvm::omp::topDistributeSet.test(ompDirective);
genDistributeOp(converter, semaCtx, eval, /*genNested=*/false,
currentLocation, loopOpClauseList, outerCombined);
currentLocation, loopOpClauseList, outerCombined, &dsp);
}
if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
Expand Down Expand Up @@ -2045,7 +2068,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
genOpenMPReduction(converter, semaCtx, loopOpClauseList);
} else {
createWsLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
endClauseList, currentLocation);
endClauseList, currentLocation, dsp);
}
}

Expand Down
30 changes: 30 additions & 0 deletions flang/test/Lower/OpenMP/target_private.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
!Test data-sharing attribute clauses for the `target` directive.

!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s

!CHECK-LABEL: func.func @_QPomp_target_private()
subroutine omp_target_private
implicit none
integer :: x(1)

!$omp target private(x)
x(1) = 42
!$omp end target
!CHECK: omp.target {
!CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
!CHECK-DAG: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<1xi32> {bindc_name = "x",
!CHECK-SAME: pinned, uniq_name = "_QFomp_target_privateEx"}
!CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C1]] : (index) -> !fir.shape<1>
!CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[SHAPE]])
!CHECK-SAME: {uniq_name = "_QFomp_target_privateEx"} :
!CHECK-SAME: (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) ->
!CHECK-SAME: (!fir.ref<!fir.array<1xi32>>, !fir.ref<!fir.array<1xi32>>)
!CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
!CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index
!CHECK-NEXT: %[[PRIV_BINDING:.*]] = hlfir.designate %[[PRIV_DECL]]#0 (%[[C1_2]])
!CHECK-SAME: : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32>
!CHECK-NEXT: hlfir.assign %[[C42]] to %[[PRIV_BINDING]] : i32, !fir.ref<i32>
!CHECK-NEXT: omp.terminator
!CHECK-NEXT: }

end subroutine omp_target_private
29 changes: 29 additions & 0 deletions openmp/libomptarget/test/offloading/fortran/target_private.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
! Basic offloading test with a target region
! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
! UNSUPPORTED: x86_64-pc-linux-gnu
! UNSUPPORTED: x86_64-pc-linux-gnu-LTO

! RUN: %libomptarget-compile-fortran-generic
! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
program target_update
implicit none
integer :: x(1)
integer :: y(1)

x(1) = 42

!$omp target private(x) map(tofrom: y)
x(1) = 84
y(1) = x(1)
!$omp end target

print *, "x =", x(1)
print *, "y =", y(1)

end program target_update
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
! CHECK: x = 42
! CHECK: y = 84

0 comments on commit aeed526

Please sign in to comment.