Skip to content

Commit

Permalink
*: try a 2D decomposition of GPU SpMM
Browse files Browse the repository at this point in the history
  • Loading branch information
rohany authored and Rohan Yadav committed Mar 1, 2022
1 parent e58a72a commit 8899065
Show file tree
Hide file tree
Showing 11 changed files with 572 additions and 33 deletions.
4 changes: 3 additions & 1 deletion legion/include/taco_mapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,9 @@ class TACOMapper : public Legion::Mapping::DefaultMapper {
// InFlightTask represents a task currently being executed.
struct InFlightTask {
// Unique identifier of the task instance.
Legion::UniqueID id;
// Legion::UniqueID id;
// TODO (rohany): Comment.
std::pair<Legion::Domain, size_t> id;
// An event that will be triggered when the task finishes.
Legion::Mapping::MapperEvent event;
// A clock measurement from when the task was scheduled.
Expand Down
4 changes: 2 additions & 2 deletions legion/spinnerprod/taco-generated.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,13 +458,13 @@ double computeLegionDDS(Legion::Context ctx, Legion::Runtime* runtime, LegionTen
void registerTacoTasks() {
{
TaskVariantRegistrar registrar(taskID(1), "task_1");
registrar.add_constraint(ProcessorConstraint(Processor::OMP_PROC));
registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
registrar.set_leaf();
Runtime::preregister_task_variant<double,task_1>(registrar, "task_1");
}
{
TaskVariantRegistrar registrar(taskID(2), "task_2");
registrar.add_constraint(ProcessorConstraint(Processor::OMP_PROC));
registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
registrar.set_leaf();
Runtime::preregister_task_variant<double,task_2>(registrar, "task_2");
}
Expand Down
55 changes: 42 additions & 13 deletions legion/spmm/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,38 @@
#include "legion_utils.h"
#include "legion_string_utils.h"
#include "error.h"
#ifdef TACO_USE_CUDA
#include "taco-generated.cuh"
#else
#include "taco-generated.h"
#endif

using namespace Legion;
typedef double valType;

void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, Runtime* runtime) {
std::string csrFileName;
bool dump = false;
bool dump = false, consMem = false;
// The j-dimension of the computation will commonly have a small value
// that is divisible by 32, as per Stephen and Chang-wan.
int n = 10, pieces = 0, warmup = 5, jDim = 32;
int n = 10, pieces = 0, warmup = 5, jDim = 32, gx = 0, gy = 0;
Realm::CommandLineParser parser;
parser.add_option_string("-tensor", csrFileName);
parser.add_option_bool("-dump", dump);
parser.add_option_int("-n", n);
parser.add_option_int("-pieces", pieces);
parser.add_option_int("-warmup", warmup);
parser.add_option_int("-jdim", jDim);
// Arguments for memory-conserving schedule.
parser.add_option_bool("-consMem", consMem);
parser.add_option_int("-gx", gx);
parser.add_option_int("-gy", gy);
auto args = Runtime::get_input_args();
taco_uassert(parser.parse_command_line(args.argc, args.argv)) << "Parse failure.";
taco_uassert(!csrFileName.empty()) << "Provide a matrix with -tensor";

// Figure out how many pieces to chop up the data into.
if (pieces == 0) {
if (!consMem && pieces == 0) {
pieces = getNumPieces(ctx, runtime);
taco_uassert(pieces != 0) << "Please provide a number of pieces to split into with -pieces. Unable to automatically find.";
}
Expand All @@ -41,20 +49,41 @@ void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions
runtime->fill_field(ctx, A.vals, A.valsParent, FID_VAL, valType(0));
runtime->fill_field(ctx, C.vals, C.valsParent, FID_VAL, valType(1));

auto pack = partitionForcomputeLegion(ctx, runtime, &A, &B, &C, pieces);
// #ifdef TACO_USE_CUDA
partitionPackForcomputeLegion pack;
partitionPackForcomputeLegionConsMem packCons;
if (consMem) {
packCons = partitionForcomputeLegionConsMem(ctx, runtime, &A, &B, &C, gx, gy);
} else {
pack = partitionForcomputeLegion(ctx, runtime, &A, &B, &C, pieces);
}
// #else
// auto pack = partitionForcomputeLegion(ctx, runtime, &A, &B, &C, pieces);
// taco_iassert(!consMem);
// #endif

auto commPart = createSparseAliasingPartitions(ctx, runtime, A.vals.get_index_space(), pack.APartition.valsPartition.get_index_partition());
auto commLPart = runtime->get_logical_partition(ctx, A.vals, commPart);

LogicalPartition commLPart;
if (!consMem) {
auto commPart = createSparseAliasingPartitions(ctx, runtime, A.vals.get_index_space(), pack.APartition.valsPartition.get_index_partition());
commLPart = runtime->get_logical_partition(ctx, A.vals, commPart);
}

auto avgTime = benchmarkAsyncCallWithWarmup(ctx, runtime, warmup, n, [&]() {
if (dump) { runtime->fill_field(ctx, A.vals, A.valsParent, FID_VAL, valType(0)); }
computeLegion(ctx, runtime, &A, &B, &C, &pack, pieces);
#ifdef TACO_USE_CUDA
// Collapse our reduction buffers. We use sparse instances to force just the communication
// that we want. We only do this for the GPU schedule, as the CPU schedule does not
// use Legion reductions.
launchDummyReadOverPartition(ctx, runtime, A.vals, commLPart, FID_VAL, Rect<1>(0, pieces - 1), false /* wait */, true /* untrack */, false /* cpuOnly */, true /* sparse */);
#endif
// #ifdef TACO_USE_CUDA
if (consMem) {
computeLegionConsMem(ctx, runtime, &A, &B, &C, &packCons, gx, gy);
} else {
computeLegion(ctx, runtime, &A, &B, &C, &pack, pieces);
// Collapse our reduction buffers. We use sparse instances to force just the communication
// that we want. We only do this for the GPU schedule, as the CPU schedule does not
// use Legion reductions.
launchDummyReadOverPartition(ctx, runtime, A.vals, commLPart, FID_VAL, Rect<1>(0, pieces - 1), false /* wait */, true /* untrack */, false /* cpuOnly */, true /* sparse */);
}
// #else
// computeLegion(ctx, runtime, &A, &B, &C, &pack, pieces);
// #endif
});
LEGION_PRINT_ONCE(runtime, ctx, stdout, "Average execution time: %lf ms\n", avgTime);

Expand Down
190 changes: 189 additions & 1 deletion legion/spmm/taco-generated.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ struct task_1Args {
int32_t gx;
};

// Arguments passed by value to every point task of the task_2 index launch.
// computeLegionConsMem fills this struct and task_2 reads it back from task->args.
struct task_2Args {
// Copied from A->dims[1] by computeLegionConsMem.
int64_t A2_dimension;
// Copied from B->dims[0]; task_2 uses it to bound the i loop.
int64_t B1_dimension;
// Copied from C->dims[1]; task_2 uses it to bound the j loop.
int64_t C2_dimension;
// Extent of the launch grid along its first dimension (index point 0 is in [0, gx)).
int32_t gx;
// Extent of the launch grid along its second dimension (index point 1 is in [0, gy)).
int32_t gy;
};


partitionPackForcomputeLegion partitionForcomputeLegion(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor* A, LegionTensor* B, LegionTensor* C, int32_t gx) {
RegionWrapper A_vals = A->vals;
Expand Down Expand Up @@ -164,12 +172,192 @@ void computeLegion(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor*
launcher.add_region_requirement(RegionRequirement(get_logical_region(C_vals), READ_ONLY, EXCLUSIVE, C_vals_parent).add_field(FID_VAL));
runtime->execute_index_space(ctx, launcher);

}

// Builds the partitions of A, B and C consumed by computeLegionConsMem, which
// launches one task per point of a gx x gy grid.
//
// For grid point (io, jo), with row tiles of ceil(B1_dimension / gx) rows and
// column tiles of ceil(C2_dimension / gy) columns:
//   * A gets the (row tile io) x (column tile jo) rectangle; pieces are disjoint.
//   * B gets row tile io. The piece does not depend on jo, so the partition is
//     aliased. B's pos, crd and vals partitions are derived from it.
//   * C gets rows [0, min(B2_dimension, CDomain.hi()[0])] of column tile jo.
//     The piece does not depend on io, so the partition is aliased.
// A rectangle whose corners fall outside the tensor's domain is replaced by an
// empty rectangle.
//
// Returns the pack holding the vals / indices / dense-run partitions of A, B, C.
partitionPackForcomputeLegionConsMem partitionForcomputeLegionConsMem(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor* A, LegionTensor* B, LegionTensor* C, int32_t gx, int32_t gy) {
  RegionWrapper A_vals = A->vals;
  IndexSpace A_dense_run_0 = A->denseLevelRuns[0];
  int B1_dimension = B->dims[0];
  int B2_dimension = B->dims[1];
  RegionWrapper B2_pos = B->indices[1][0];
  RegionWrapper B2_crd = B->indices[1][1];
  auto B2_pos_parent = B->indicesParents[1][0];
  RegionWrapper B_vals = B->vals;
  IndexSpace B_dense_run_0 = B->denseLevelRuns[0];
  int C2_dimension = C->dims[1];
  RegionWrapper C_vals = C->vals;
  IndexSpace C_dense_run_0 = C->denseLevelRuns[0];

  // The color space is a dense gx x gy rectangle, so it is built directly from
  // the Rect instead of creating an index space that was never destroyed.
  const Rect<2> gridBounds(Point<2>(0, 0), Point<2>((gx - 1), (gy - 1)));
  DomainT<2> domain(gridBounds);

  auto ADomain = runtime->get_index_space_domain(ctx, A_dense_run_0);
  auto BDomain = runtime->get_index_space_domain(ctx, B_dense_run_0);
  auto CDomain = runtime->get_index_space_domain(ctx, C_dense_run_0);

  // Ceiling-division tile extents. When gx or gy is not positive the grid is
  // empty and the loop below never runs, so the guard only avoids dividing by
  // zero while hoisting these values out of the loop.
  const int iTile = gx > 0 ? (B1_dimension + (gx - 1)) / gx : 0;
  const int jTile = gy > 0 ? (C2_dimension + (gy - 1)) / gy : 0;

  DomainPointColoring AColoring = DomainPointColoring();
  DomainPointColoring BColoring = DomainPointColoring();
  DomainPointColoring CColoring = DomainPointColoring();
  for (PointInDomainIterator<2> itr = PointInDomainIterator<2>(domain); itr.valid(); ++itr) {
    const int64_t io = (*itr)[0];
    const int64_t jo = (*itr)[1];
    const int64_t iLo = io * iTile;
    const int64_t jLo = jo * jTile;
    const int64_t iHi = iLo + (iTile - 1);
    const int64_t jHi = jLo + (jTile - 1);

    // A: the (io, jo) tile, clamped to A's upper bounds.
    Point<2> AStart = Point<2>(iLo, jLo);
    Point<2> AEnd = Point<2>(TACO_MIN(iHi, ADomain.hi()[0]), TACO_MIN(jHi, ADomain.hi()[1]));
    Rect<2> ARect = Rect<2>(AStart, AEnd);
    if (!ADomain.contains(ARect.lo) || !ADomain.contains(ARect.hi)) {
      ARect = ARect.make_empty();
    }
    AColoring[(*itr)] = ARect;

    // B: the rows of tile io (independent of jo).
    Point<1> BStart = Point<1>(iLo);
    Point<1> BEnd = Point<1>(TACO_MIN(iHi, BDomain.hi()[0]));
    Rect<1> BRect = Rect<1>(BStart, BEnd);
    if (!BDomain.contains(BRect.lo) || !BDomain.contains(BRect.hi)) {
      BRect = BRect.make_empty();
    }
    BColoring[(*itr)] = BRect;

    // C: every row, restricted to the columns of tile jo (independent of io).
    Point<2> CStart = Point<2>(0, jLo);
    Point<2> CEnd = Point<2>(TACO_MIN(B2_dimension, CDomain.hi()[0]), TACO_MIN(jHi, CDomain.hi()[1]));
    Rect<2> CRect = Rect<2>(CStart, CEnd);
    if (!CDomain.contains(CRect.lo) || !CDomain.contains(CRect.hi)) {
      CRect = CRect.make_empty();
    }
    CColoring[(*itr)] = CRect;
  }

  auto A_dense_run_0_Partition = runtime->create_index_partition(ctx, A_dense_run_0, domain, AColoring, LEGION_DISJOINT_COMPLETE_KIND);
  auto A_vals_partition = copyPartition(ctx, runtime, A_dense_run_0_Partition, get_logical_region(A_vals));

  // B's row partition is pushed down through the compressed level:
  // rows -> pos entries -> crd entries -> values.
  auto B_dense_run_0_Partition = runtime->create_index_partition(ctx, B_dense_run_0, domain, BColoring, LEGION_ALIASED_COMPLETE_KIND);
  LogicalPartition posPartB2 = copyPartition(ctx, runtime, B_dense_run_0_Partition, B2_pos);
  LogicalPartition crdPartB2 = runtime->get_logical_partition(ctx, B2_crd, RectCompressedPosPartitionDownwards::apply(ctx, runtime, B2_crd.get_index_space(), posPartB2, B2_pos_parent, FID_RECT_1));
  auto B_vals_partition = copyPartition(ctx, runtime, crdPartB2, get_logical_region(B_vals));

  auto C_dense_run_0_Partition = runtime->create_index_partition(ctx, C_dense_run_0, domain, CColoring, LEGION_ALIASED_COMPLETE_KIND);
  auto C_vals_partition = copyPartition(ctx, runtime, C_dense_run_0_Partition, get_logical_region(C_vals));

  auto computePartitions = partitionPackForcomputeLegionConsMem();

  computePartitions.APartition.indicesPartitions = std::vector<std::vector<LogicalPartition>>(2);
  computePartitions.APartition.denseLevelRunPartitions = std::vector<IndexPartition>(2);
  computePartitions.APartition.valsPartition = A_vals_partition;
  computePartitions.APartition.denseLevelRunPartitions[0] = A_dense_run_0_Partition;

  computePartitions.BPartition.indicesPartitions = std::vector<std::vector<LogicalPartition>>(2);
  computePartitions.BPartition.denseLevelRunPartitions = std::vector<IndexPartition>(2);
  computePartitions.BPartition.indicesPartitions[1].push_back(posPartB2);
  computePartitions.BPartition.indicesPartitions[1].push_back(crdPartB2);
  computePartitions.BPartition.valsPartition = B_vals_partition;
  computePartitions.BPartition.denseLevelRunPartitions[0] = B_dense_run_0_Partition;

  computePartitions.CPartition.indicesPartitions = std::vector<std::vector<LogicalPartition>>(2);
  computePartitions.CPartition.denseLevelRunPartitions = std::vector<IndexPartition>(2);
  computePartitions.CPartition.valsPartition = C_vals_partition;
  computePartitions.CPartition.denseLevelRunPartitions[0] = C_dense_run_0_Partition;

  return computePartitions;
}

void task_2(const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, Runtime* runtime) {
PhysicalRegion A_vals = regions[0];
LogicalRegion A_vals_parent = regions[0].get_logical_region();
PhysicalRegion B2_pos = regions[1];
LogicalRegion B2_pos_parent = regions[1].get_logical_region();
PhysicalRegion B2_crd = regions[2];
LogicalRegion B2_crd_parent = regions[2].get_logical_region();
PhysicalRegion B_vals = regions[3];
LogicalRegion B_vals_parent = regions[3].get_logical_region();
PhysicalRegion C_vals = regions[4];
LogicalRegion C_vals_parent = regions[4].get_logical_region();

int64_t distFused = task->index_point[0];
task_2Args* args = (task_2Args*)(task->args);
int64_t A2_dimension = args->A2_dimension;
int64_t B1_dimension = args->B1_dimension;
int64_t C2_dimension = args->C2_dimension;
int32_t gx = args->gx;
int32_t gy = args->gy;

auto B_vals_ro_accessor = createAccessor<AccessorROdouble1>(B_vals, FID_VAL);
auto C_vals_ro_accessor = createAccessor<AccessorROdouble2>(C_vals, FID_VAL);
auto A_vals_rw_accessor = createAccessor<AccessorRWdouble2>(A_vals, FID_VAL);
auto B2_pos_accessor = createAccessor<AccessorRORect_1_1>(B2_pos, FID_RECT_1);
auto B2_crd_accessor = createAccessor<AccessorROint32_t1>(B2_crd, FID_COORD);

int64_t io = getIndexPoint(task, 0);
int64_t jo = getIndexPoint(task, 1);
int64_t pointID1 = io;
int64_t pointID2 = pointID1 * gy + jo;
for (int64_t ii = 0; ii < ((B1_dimension + (gx - 1)) / gx); ii++) {
int64_t i = io * ((B1_dimension + (gx - 1)) / gx) + ii;
if (i >= B1_dimension)
continue;

if (i >= (io + 1) * ((B1_dimension + (gx - 1)) / gx))
continue;

int64_t pointID3 = pointID2 * ((B1_dimension + (gx - 1)) / gx) + ii;
int64_t iA = i;
int64_t iB = i;
for (int64_t ji = 0; ji < ((C2_dimension + (gy - 1)) / gy); ji++) {
int64_t j = jo * ((C2_dimension + (gy - 1)) / gy) + ji;
if (j >= C2_dimension)
continue;

if (j >= (jo + 1) * ((C2_dimension + (gy - 1)) / gy))
continue;

int64_t pointID4 = pointID3 * ((C2_dimension + (gy - 1)) / gy) + ji;
int64_t jA = iA * A2_dimension + j;
for (int64_t kB = B2_pos_accessor[Point<1>(i)].lo; kB < (B2_pos_accessor[Point<1>(i)].hi + 1); kB++) {
int64_t k = B2_crd_accessor[(kB * 1)];
int64_t kC = k;
int64_t jC = kC * C2_dimension + j;
A_vals_rw_accessor[Point<2>(i, j)] = A_vals_rw_accessor[Point<2>(i, j)] + B_vals_ro_accessor[Point<1>(kB)] * C_vals_ro_accessor[Point<2>(k, j)];
}
}
}
}

// Launches the 2D SpMM computation: one task_2 point task per point of the
// gx x gy grid, each given its pieces of A, B and C from partitionPack (as
// produced by partitionForcomputeLegionConsMem with the same gx and gy).
//
// The launch is issued with execute_index_space and its result is not waited
// on here. The launcher is tagged TACOMapper::BACKPRESSURE_TASK.
void computeLegionConsMem(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor* A, LegionTensor* B, LegionTensor* C, partitionPackForcomputeLegionConsMem* partitionPack, int32_t gx, int32_t gy) {
  int A2_dimension = A->dims[1];
  auto A_vals_parent = A->valsParent;
  int B1_dimension = B->dims[0];
  auto B2_pos_parent = B->indicesParents[1][0];
  auto B2_crd_parent = B->indicesParents[1][1];
  auto B_vals_parent = B->valsParent;
  int C2_dimension = C->dims[1];
  auto C_vals_parent = C->valsParent;

  // The launch domain is a dense gx x gy rectangle, so it is built directly
  // from the Rect. The previous version created a new index space on every
  // call and never destroyed it, and this function runs once per benchmark
  // iteration. It also queried the size of B's crd index space without using
  // the result.
  const Rect<2> launchBounds(Point<2>(0, 0), Point<2>((gx - 1), (gy - 1)));
  DomainT<2> domain(launchBounds);

  task_2Args taskArgsRaw2;
  taskArgsRaw2.A2_dimension = A2_dimension;
  taskArgsRaw2.B1_dimension = B1_dimension;
  taskArgsRaw2.C2_dimension = C2_dimension;
  taskArgsRaw2.gx = gx;
  taskArgsRaw2.gy = gy;
  TaskArgument taskArgs = TaskArgument(&taskArgsRaw2, sizeof(task_2Args));

  IndexLauncher launcher = IndexLauncher(taskID(2), domain, taskArgs, ArgumentMap());
  // The requirement order below is the region order task_2 indexes into.
  launcher.add_region_requirement(RegionRequirement(partitionPack->APartition.valsPartition, 0, READ_WRITE, EXCLUSIVE, A_vals_parent).add_field(FID_VAL));
  launcher.add_region_requirement(RegionRequirement(partitionPack->BPartition.indicesPartitions[1][0], 0, READ_ONLY, EXCLUSIVE, get_logical_region(B2_pos_parent)).add_field(FID_RECT_1));
  launcher.add_region_requirement(RegionRequirement(partitionPack->BPartition.indicesPartitions[1][1], 0, READ_ONLY, EXCLUSIVE, get_logical_region(B2_crd_parent)).add_field(FID_COORD));
  launcher.add_region_requirement(RegionRequirement(partitionPack->BPartition.valsPartition, 0, READ_ONLY, EXCLUSIVE, B_vals_parent).add_field(FID_VAL));
  launcher.add_region_requirement(RegionRequirement(partitionPack->CPartition.valsPartition, 0, READ_ONLY, EXCLUSIVE, C_vals_parent).add_field(FID_VAL));
  launcher.tag |= TACOMapper::BACKPRESSURE_TASK;
  runtime->execute_index_space(ctx, launcher);
}
// Pre-registers the leaf task variants defined in this file with the Legion
// runtime. Both variants are constrained to LOC_PROC processors. task_1
// previously also carried a stale OMP_PROC constraint ahead of the LOC_PROC
// one; only the LOC_PROC constraint is kept so that it matches task_2.
void registerTacoTasks() {
  {
    TaskVariantRegistrar registrar(taskID(1), "task_1");
    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
    registrar.set_leaf();
    Runtime::preregister_task_variant<task_1>(registrar, "task_1");
  }
  {
    TaskVariantRegistrar registrar(taskID(2), "task_2");
    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
    registrar.set_leaf();
    Runtime::preregister_task_variant<task_2>(registrar, "task_2");
  }
}
Loading

0 comments on commit 8899065

Please sign in to comment.