Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[flang][OpenMP] Add support for target ... private #39

Merged
merged 1 commit into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,20 @@ namespace omp {
void DataSharingProcessor::processStep1() {
collectSymbolsForPrivatization();
collectDefaultSymbols();
}

void DataSharingProcessor::processStep2() {
if (privatizationDone)
return;

privatize();
defaultPrivatize();
insertBarrier();

privatizationDone = true;
}

void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
void DataSharingProcessor::processStep3(mlir::Operation *op, bool isLoop) {
insPt = firOpBuilder.saveInsertionPoint();
copyLastPrivatize(op);
firOpBuilder.restoreInsertionPoint(insPt);
Expand Down
41 changes: 31 additions & 10 deletions flang/lib/Lower/OpenMP/DataSharingProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class DataSharingProcessor {
fir::FirOpBuilder &firOpBuilder;
const Fortran::parser::OmpClauseList &opClauseList;
Fortran::lower::pft::Evaluation &eval;
bool privatizationDone = false;

bool useDelayedPrivatization;
Fortran::lower::SymMap *symTable;
DelayedPrivatizationInfo delayedPrivatizationInfo;
Expand Down Expand Up @@ -90,25 +92,44 @@ class DataSharingProcessor {
eval(eval), useDelayedPrivatization(useDelayedPrivatization),
symTable(symTable) {}

// Privatisation is split into two steps.
// Step1 performs cloning of all privatisation clauses and copying for
// firstprivates. Step1 is performed at the place where process/processStep1
// Privatisation is split into 3 steps:
//
// * Step1: collects all symbols that should be privatized.
//
// * Step2: performs cloning of all privatisation clauses and copying for
// firstprivates. Step2 is performed at the place where process/processStep2
// is called. This is usually inside the Operation corresponding to the OpenMP
// construct, for looping constructs this is just before the Operation. The
// split into two steps was performed basically to be able to call
// privatisation for looping constructs before the operation is created since
// the bounds of the MLIR OpenMP operation can be privatised.
// Step2 performs the copying for lastprivates and requires knowledge of the
// MLIR operation to insert the last private update. Step2 adds
// construct, for looping constructs this is just before the Operation.
//
// * Step3: performs the copying for lastprivates and requires knowledge of
// the MLIR operation to insert the last private update. Step3 adds
// dealocation code as well.
//
// The split was performed for the following reasons:
//
// 1. Step1 was split so that the `target` op knows which symbols should not
// be mapped into the target region due to being `private`. The implicit
// mapping happens before the op body is generated so we need to to collect
// the private symbols first and then later in the body actually privatize
// them.
//
// 2. Step2 was split in order to call privatisation for looping constructs
// before the operation is created since the bounds of the MLIR OpenMP
// operation can be privatised.
void processStep1();
void processStep2(mlir::Operation *op, bool isLoop);
void processStep2();
void processStep3(mlir::Operation *op, bool isLoop);

void setLoopIV(mlir::Value iv) {
assert(!loopIV && "Loop iteration variable already set");
loopIV = iv;
}

const llvm::SetVector<const Fortran::semantics::Symbol *> &
getPrivatizedSymbols() const {
return privatizedSymbols;
}

const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const {
return delayedPrivatizationInfo;
}
Expand Down
61 changes: 42 additions & 19 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
if (!info.dsp) {
tempDsp.emplace(info.converter, *info.clauses, info.eval);
tempDsp->processStep1();
tempDsp->processStep2();
}
}

Expand Down Expand Up @@ -593,11 +594,11 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
if (privatize) {
if (!info.dsp) {
assert(tempDsp.has_value());
tempDsp->processStep2(op, isLoop);
tempDsp->processStep3(op, isLoop);
} else {
if (isLoop && regionArgs.size() > 0)
info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0]));
info.dsp->processStep2(op, isLoop);
info.dsp->processStep3(op, isLoop);
}
}
}
Expand Down Expand Up @@ -816,8 +817,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
DataSharingProcessor dsp(converter, clauseList, eval,
/*useDelayedPrivatization=*/true, &symTable);

if (privatize)
if (privatize) {
dsp.processStep1();
dsp.processStep2();
}


const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo();

Expand Down Expand Up @@ -1093,7 +1097,9 @@ static void genBodyOfTargetOp(
const llvm::SmallVector<mlir::Type> &mapSymTypes,
const llvm::SmallVector<mlir::Location> &mapSymLocs,
const llvm::SmallVector<const Fortran::semantics::Symbol *> &mapSymbols,
const mlir::Location &currentLocation) {
const mlir::Location &currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
DataSharingProcessor &dsp) {
assert(mapSymTypes.size() == mapSymLocs.size());

fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
Expand All @@ -1102,6 +1108,8 @@ static void genBodyOfTargetOp(
auto *regionBlock =
firOpBuilder.createBlock(&region, {}, mapSymTypes, mapSymLocs);

dsp.processStep2();

// Clones the `bounds` placing them inside the target region and returns them.
auto cloneBound = [&](mlir::Value bound) {
if (mlir::isMemoryEffectFree(bound.getDefiningOp())) {
Expand Down Expand Up @@ -1243,7 +1251,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
llvm::omp::Directive directive, bool outerCombined = false) {
llvm::omp::Directive directive, bool outerCombined = false,
DataSharingProcessor *dsp = nullptr) {
Fortran::lower::StatementContext stmtCtx;
mlir::Value ifClauseOperand, deviceOperand, threadLimitOperand;
mlir::UnitAttr nowaitAttr;
Expand All @@ -1262,9 +1271,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
cp.processDepend(dependTypeOperands, dependOperands);
cp.processMap(currentLocation, directive, stmtCtx, mapOperands, &mapSymTypes,
&mapSymLocs, &mapSymbols);

cp.processTODO<Fortran::parser::OmpClause::Private,
Fortran::parser::OmpClause::Firstprivate,
cp.processTODO<Fortran::parser::OmpClause::Firstprivate,
Fortran::parser::OmpClause::IsDevicePtr,
Fortran::parser::OmpClause::HasDeviceAddr,
Fortran::parser::OmpClause::InReduction,
Expand All @@ -1273,6 +1280,10 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
Fortran::parser::OmpClause::Defaultmap>(
currentLocation, llvm::omp::Directive::OMPD_target);

DataSharingProcessor localDSP(converter, clauseList, eval);
DataSharingProcessor &actualDSP = dsp ? *dsp : localDSP;
actualDSP.processStep1();

// Process host-only clauses.
if (!llvm::cast<mlir::omp::OffloadModuleInterface>(*converter.getModuleOp())
.getIsTargetDevice())
Expand All @@ -1283,9 +1294,14 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,

// 5.8.1 Implicit Data-Mapping Attribute Rules
// The following code follows the implicit data-mapping rules to map all the
// symbols used inside the region that have not been explicitly mapped using
// the map clause.
// symbols used inside the region that do not have explicit data-environment
// attribute clauses (neither data-sharing; e.g. `private`, nor `map`
// clauses).
auto captureImplicitMap = [&](const Fortran::semantics::Symbol &sym) {
if (actualDSP.getPrivatizedSymbols().contains(&sym)) {
return;
}

if (llvm::find(mapSymbols, &sym) == mapSymbols.end()) {
mlir::Value baseOp = converter.getSymbolAddress(sym);
if (!baseOp)
Expand Down Expand Up @@ -1383,7 +1399,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
/*teams_thread_limit=*/nullptr, /*num_threads=*/nullptr);

genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSymTypes,
mapSymLocs, mapSymbols, currentLocation);
mapSymLocs, mapSymbols, currentLocation, clauseList,
actualDSP);

return targetOp;
}
Expand Down Expand Up @@ -1459,7 +1476,8 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
bool outerCombined = false) {
bool outerCombined = false,
DataSharingProcessor *dsp = nullptr) {
// TODO Process clauses
// ClauseProcessor cp(converter, clauseList);
// cp.processAllocate(allocatorOperands, allocateOperands);
Expand All @@ -1469,7 +1487,8 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter,
OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setOuterCombined(outerCombined)
.setClauses(&clauseList),
.setClauses(&clauseList)
.setDataSharingProcessor(dsp),
/*dist_schedule_static=*/nullptr,
/*chunk_size=*/nullptr,
/*allocate_vars=*/mlir::ValueRange(),
Expand Down Expand Up @@ -1780,6 +1799,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
DataSharingProcessor dsp(converter, loopOpClauseList, eval);
dsp.processStep1();
dsp.processStep2();

Fortran::lower::StatementContext stmtCtx;
mlir::Value scheduleChunkClauseOperand, ifClauseOperand;
Expand Down Expand Up @@ -1836,10 +1856,10 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
llvm::omp::Directive ompDirective,
const Fortran::parser::OmpClauseList &beginClauseList,
const Fortran::parser::OmpClauseList *endClauseList,
mlir::Location loc) {
mlir::Location loc, DataSharingProcessor &dsp) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
DataSharingProcessor dsp(converter, beginClauseList, eval);
dsp.processStep1();
dsp.processStep2();

Fortran::lower::StatementContext stmtCtx;
mlir::Value scheduleChunkClauseOperand;
Expand Down Expand Up @@ -1962,8 +1982,9 @@ static void createSimdWsLoop(
// When support for vectorization is enabled, then we need to add handling of
// if clause. Currently if clause can be skipped because we always assume
// SIMD length = 1.
DataSharingProcessor dsp(converter, beginClauseList, eval);
createWsLoop(converter, semaCtx, eval, ompDirective, beginClauseList,
endClauseList, loc);
endClauseList, loc, dsp);
}

static void genOMP(Fortran::lower::AbstractConverter &converter,
Expand Down Expand Up @@ -1992,6 +2013,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
}();

bool validDirective = false;
DataSharingProcessor dsp(converter, loopOpClauseList, eval);

if (llvm::omp::topTaskloopSet.test(ompDirective)) {
validDirective = true;
TODO(currentLocation, "Taskloop construct");
Expand All @@ -2002,7 +2025,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
validDirective = true;
genTargetOp(converter, semaCtx, eval, /*genNested=*/false,
currentLocation, loopOpClauseList, ompDirective,
/*outerCombined=*/true);
/*outerCombined=*/true, &dsp);
}
if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
Expand All @@ -2014,7 +2037,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
validDirective = true;
bool outerCombined = llvm::omp::topDistributeSet.test(ompDirective);
genDistributeOp(converter, semaCtx, eval, /*genNested=*/false,
currentLocation, loopOpClauseList, outerCombined);
currentLocation, loopOpClauseList, outerCombined, &dsp);
}
if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
Expand Down Expand Up @@ -2045,7 +2068,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
genOpenMPReduction(converter, semaCtx, loopOpClauseList);
} else {
createWsLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
endClauseList, currentLocation);
endClauseList, currentLocation, dsp);
}
}

Expand Down
30 changes: 30 additions & 0 deletions flang/test/Lower/OpenMP/target_private.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
!Test data-sharing attribute clauses for the `target` directive.

!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s

!CHECK-LABEL: func.func @_QPomp_target_private()
subroutine omp_target_private
implicit none
integer :: x(1)

!$omp target private(x)
x(1) = 42
!$omp end target
!CHECK: omp.target {
!CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
!CHECK-DAG: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<1xi32> {bindc_name = "x",
!CHECK-SAME: pinned, uniq_name = "_QFomp_target_privateEx"}
!CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C1]] : (index) -> !fir.shape<1>
!CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[SHAPE]])
!CHECK-SAME: {uniq_name = "_QFomp_target_privateEx"} :
!CHECK-SAME: (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) ->
!CHECK-SAME: (!fir.ref<!fir.array<1xi32>>, !fir.ref<!fir.array<1xi32>>)
!CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
!CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index
!CHECK-NEXT: %[[PRIV_BINDING:.*]] = hlfir.designate %[[PRIV_DECL]]#0 (%[[C1_2]])
!CHECK-SAME: : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32>
!CHECK-NEXT: hlfir.assign %[[C42]] to %[[PRIV_BINDING]] : i32, !fir.ref<i32>
!CHECK-NEXT: omp.terminator
!CHECK-NEXT: }

end subroutine omp_target_private
29 changes: 29 additions & 0 deletions openmp/libomptarget/test/offloading/fortran/target_private.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
! Basic offloading test with a target region
! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
! UNSUPPORTED: x86_64-pc-linux-gnu
! UNSUPPORTED: x86_64-pc-linux-gnu-LTO

! RUN: %libomptarget-compile-fortran-generic
! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
program target_update
implicit none
integer :: x(1)
integer :: y(1)

x(1) = 42

!$omp target private(x) map(tofrom: y)
x(1) = 84
y(1) = x(1)
!$omp end target

print *, "x =", x(1)
print *, "y =", y(1)

end program target_update
! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
! CHECK: x = 42
! CHECK: y = 84