
Commit

Merge pull request #484 from LLNL/task/rhornung67/rm-ompt-reduction-tunings

Remove old reduction "tuning" from OpenMP Target variants
rhornung67 authored Oct 10, 2024
2 parents dba431f + b6bbf64 commit 9af20b3
Showing 27 changed files with 136 additions and 369 deletions.
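
For context, the tuning being removed here let the RAJA_OpenMPTarget variants choose at run time between the two reduction interfaces exercised by these kernels; after this change only the newer interface remains. The sketch below contrasts the two, loosely following the REDUCE_SUM and PI_REDUCE kernels in this diff. It is a minimal standalone example, not code from this repository: it assumes a RAJA build with OpenMP target offload enabled, the function name and data are placeholders, and device data mapping is not shown.

#include "RAJA/RAJA.hpp"

// Hypothetical example contrasting the two RAJA OpenMP Target reduction interfaces.
void sum_example(const double* x, RAJA::Index_type len, double& result)
{
  constexpr size_t threads_per_team = 256;
  using exec_pol = RAJA::omp_target_parallel_for_exec<threads_per_team>;

  // The "default" tuning: reducer-object interface (the path this PR deletes).
  RAJA::ReduceSum<RAJA::omp_target_reduce, double> sum(0.0);
  RAJA::forall<exec_pol>(RAJA::RangeSegment(0, len),
    [=](RAJA::Index_type i) {
      sum += x[i];
    });
  result = sum.get();

  // The "new" tuning, now the only RAJA_OpenMPTarget path: expt::Reduce
  // parameter interface, which passes the reduction target into the lambda.
  double tsum = 0.0;
  RAJA::forall<exec_pol>(RAJA::RangeSegment(0, len),
    RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
    [=](RAJA::Index_type i, double& s) {
      s += x[i];
    });
  result = tsum;
}

With a single code path left per variant, the per-kernel setOpenMPTargetTuningDefinitions overrides and the tune_idx branching serve no purpose, which is what the file changes below remove.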
6 changes: 5 additions & 1 deletion src/algorithm/ATOMIC.hpp
@@ -74,24 +74,28 @@ class ATOMIC : public KernelBase
void setOpenMPTuningDefinitions(VariantID vid);
void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);
void setOpenMPTargetTuningDefinitions(VariantID vid);

template < size_t replication >
void runSeqVariantReplicate(VariantID vid);

template < size_t replication >
void runOpenMPVariantReplicate(VariantID vid);

template < size_t block_size, size_t replication >
void runCudaVariantReplicateGlobal(VariantID vid);
template < size_t block_size, size_t replication >
void runHipVariantReplicateGlobal(VariantID vid);

template < size_t block_size, size_t replication >
void runCudaVariantReplicateWarp(VariantID vid);
template < size_t block_size, size_t replication >
void runHipVariantReplicateWarp(VariantID vid);

template < size_t block_size, size_t replication >
void runCudaVariantReplicateBlock(VariantID vid);
template < size_t block_size, size_t replication >
void runHipVariantReplicateBlock(VariantID vid);

template < size_t replication >
void runOpenMPTargetVariantReplicate(VariantID vid);

1 change: 1 addition & 0 deletions src/algorithm/HISTOGRAM.hpp
@@ -105,6 +105,7 @@ class HISTOGRAM : public KernelBase

void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);

void runCudaVariantLibrary(VariantID vid);
void runHipVariantLibrary(VariantID vid);

1 change: 1 addition & 0 deletions src/algorithm/MEMCPY.hpp
@@ -58,6 +58,7 @@ class MEMCPY : public KernelBase
void setSeqTuningDefinitions(VariantID vid);
void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);

void runSeqVariantDefault(VariantID vid);
void runSeqVariantLibrary(VariantID vid);

1 change: 1 addition & 0 deletions src/algorithm/MEMSET.hpp
@@ -58,6 +58,7 @@ class MEMSET : public KernelBase
void setSeqTuningDefinitions(VariantID vid);
void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);

void runSeqVariantDefault(VariantID vid);
void runSeqVariantLibrary(VariantID vid);

58 changes: 13 additions & 45 deletions src/algorithm/REDUCE_SUM-OMPTarget.cpp
@@ -27,7 +27,7 @@ namespace algorithm
const size_t threads_per_team = 256;


-void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
{
const Index_type run_reps = getRunReps();
const Index_type ibegin = 0;
@@ -56,62 +56,30 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)

} else if ( vid == RAJA_OpenMPTarget ) {

-    if (tune_idx == 0) {
-
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-        RAJA::ReduceSum<RAJA::omp_target_reduce, Real_type> sum(m_sum_init);
-
-        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-          RAJA::RangeSegment(ibegin, iend),
-          [=](Index_type i) {
-            REDUCE_SUM_BODY;
-        });
-
-        m_sum = sum.get();
-
-      }
-      stopTimer();
-
-    } else if (tune_idx == 1) {
-
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-        Real_type tsum = m_sum_init;
-
-        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-          RAJA::RangeSegment(ibegin, iend),
-          RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
-          [=] (Index_type i, Real_type& sum) {
-            REDUCE_SUM_BODY;
-          }
-        );
-
-        m_sum = static_cast<Real_type>(tsum);
-
-      }
-      stopTimer();
-
-    } else {
-      getCout() << "\n REDUCE_SUM : Unknown OMP Target tuning index = " << tune_idx << std::endl;
-    }
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tsum = m_sum_init;
+
+      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
+        [=] (Index_type i, Real_type& sum) {
+          REDUCE_SUM_BODY;
+        }
+      );
+
+      m_sum = static_cast<Real_type>(tsum);
+
+    }
+    stopTimer();

} else {
getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl;
}

}

-void REDUCE_SUM::setOpenMPTargetTuningDefinitions(VariantID vid)
-{
-  addVariantTuningName(vid, "default");
-  if (vid == RAJA_OpenMPTarget) {
-    addVariantTuningName(vid, "new");
-  }
-}

} // end namespace algorithm
} // end namespace rajaperf

1 change: 0 additions & 1 deletion src/algorithm/REDUCE_SUM.hpp
@@ -64,7 +64,6 @@ class REDUCE_SUM : public KernelBase
void setOpenMPTuningDefinitions(VariantID vid);
void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);
-void setOpenMPTargetTuningDefinitions(VariantID vid);
void setSyclTuningDefinitions(VariantID vid);

void runCudaVariantCub(VariantID vid);
2 changes: 2 additions & 0 deletions src/algorithm/SCAN.hpp
@@ -65,8 +65,10 @@ class SCAN : public KernelBase

void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);

void runCudaVariantLibrary(VariantID vid);
void runHipVariantLibrary(VariantID vid);

template < size_t block_size, size_t items_per_thread >
void runCudaVariantImpl(VariantID vid);
template < size_t block_size, size_t items_per_thread >
4 changes: 1 addition & 3 deletions src/apps/EDGE3D-OMP.cpp
@@ -58,8 +58,6 @@ void EDGE3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx
startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      auto edge3d_lam = [=](Index_type i) { EDGE3D_BODY; };
-
#pragma omp parallel for
for (Index_type i = ibegin ; i < iend ; ++i ) {
edge3d_lam(i);
@@ -86,7 +84,7 @@ void EDGE3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx
}

default : {
getCout() << "\n EDGE3D : Unknown variant id = " << vid << std::endl;
getCout() << "\n EDGE3D : Unknown OpenMP variant id = " << vid << std::endl;
}

}
4 changes: 2 additions & 2 deletions src/apps/EDGE3D.cpp
@@ -58,7 +58,7 @@ EDGE3D::EDGE3D(const RunParams& params)

setFLOPsPerRep(number_of_elements * flops_per_element);

-  checksum_scale_factor = 0.001 *
+  m_checksum_scale_factor = 0.001 *
( static_cast<Checksum_type>(getDefaultProblemSize()) /
getActualProblemSize() );

@@ -116,7 +116,7 @@ void EDGE3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))

void EDGE3D::updateChecksum(VariantID vid, size_t tune_idx)
{
-  checksum[vid][tune_idx] += calcChecksum(m_sum, m_array_length, checksum_scale_factor, vid );
+  checksum[vid][tune_idx] += calcChecksum(m_sum, m_array_length, m_checksum_scale_factor, vid );
}

void EDGE3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
2 changes: 2 additions & 0 deletions src/apps/EDGE3D.hpp
@@ -441,6 +441,8 @@ class EDGE3D : public KernelBase

ADomain* m_domain;
Index_type m_array_length;

Real_type m_checksum_scale_factor;
};

} // end namespace apps
1 change: 0 additions & 1 deletion src/apps/PRESSURE.hpp
@@ -82,7 +82,6 @@ class PRESSURE : public KernelBase
void runCudaVariantImpl(VariantID vid);
template < size_t block_size >
void runHipVariantImpl(VariantID vid);

template < size_t work_group_size >
void runSyclVariantImpl(VariantID vid);

1 change: 1 addition & 0 deletions src/basic/MULTI_REDUCE.hpp
@@ -100,6 +100,7 @@ class MULTI_REDUCE : public KernelBase

void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);

template < Index_type block_size,
Index_type preferred_global_replication,
Index_type preferred_shared_replication,
58 changes: 13 additions & 45 deletions src/basic/PI_REDUCE-OMPTarget.cpp
@@ -27,7 +27,7 @@ namespace basic
const size_t threads_per_team = 256;


-void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
{
const Index_type run_reps = getRunReps();
const Index_type ibegin = 0;
@@ -56,62 +56,30 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)

} else if ( vid == RAJA_OpenMPTarget ) {

-    if (tune_idx == 0) {
-
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-        RAJA::ReduceSum<RAJA::omp_target_reduce, Real_type> pi(m_pi_init);
-
-        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-          RAJA::RangeSegment(ibegin, iend),
-          [=](Index_type i) {
-            PI_REDUCE_BODY;
-        });
-
-        m_pi = 4.0 * pi.get();
-
-      }
-      stopTimer();
-
-    } else if (tune_idx == 1) {
-
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-        Real_type tpi = m_pi_init;
-
-        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-          RAJA::RangeSegment(ibegin, iend),
-          RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
-          [=] (Index_type i, Real_type& pi) {
-            PI_REDUCE_BODY;
-          }
-        );
-
-        m_pi = static_cast<Real_type>(tpi) * 4.0;
-
-      }
-      stopTimer();
-
-    } else {
-      getCout() << "\n PI_REDUCE : Unknown OMP Target tuning index = " << tune_idx << std::endl;
-    }
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tpi = m_pi_init;
+
+      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
+        [=] (Index_type i, Real_type& pi) {
+          PI_REDUCE_BODY;
+        }
+      );
+
+      m_pi = static_cast<Real_type>(tpi) * 4.0;
+
+    }
+    stopTimer();

} else {
getCout() << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl;
}

}

-void PI_REDUCE::setOpenMPTargetTuningDefinitions(VariantID vid)
-{
-  addVariantTuningName(vid, "default");
-  if (vid == RAJA_OpenMPTarget) {
-    addVariantTuningName(vid, "new");
-  }
-}

} // end namespace basic
} // end namespace rajaperf

1 change: 0 additions & 1 deletion src/basic/PI_REDUCE.hpp
@@ -62,7 +62,6 @@ class PI_REDUCE : public KernelBase
void setOpenMPTuningDefinitions(VariantID vid);
void setCudaTuningDefinitions(VariantID vid);
void setHipTuningDefinitions(VariantID vid);
-void setOpenMPTargetTuningDefinitions(VariantID vid);
void setSyclTuningDefinitions(VariantID vid);

template < size_t block_size, typename MappingHelper >