From bd36277648ddd6a0c8ee4fb371f5419baa14abc1 Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Wed, 14 Oct 2020 14:51:00 -0700 Subject: [PATCH 01/14] Add in hoisted workspace reuse and remove guard for divisible bound and split --- include/taco/index_notation/index_notation.h | 4 + include/taco/lower/lowerer_impl.h | 9 +- src/index_notation/index_notation.cpp | 16 ++ src/lower/lower.cpp | 1 + src/lower/lowerer_impl.cpp | 85 +++++++-- test/tests-scheduling.cpp | 15 +- test/tests-workspaces.cpp | 186 +++++++++++++++++++ 7 files changed, 291 insertions(+), 25 deletions(-) create mode 100644 test/tests-workspaces.cpp diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h index ebc710e28..7d408f027 100644 --- a/include/taco/index_notation/index_notation.h +++ b/include/taco/index_notation/index_notation.h @@ -949,6 +949,10 @@ std::vector getArguments(IndexStmt stmt); /// Returns the temporaries in the index statement, in the order they appear. std::vector getTemporaries(IndexStmt stmt); +// [Olivia] +/// Returns the temporaries in the index statement, in the order they appear. +std::map getTemporaryLocations(IndexStmt stmt); + /// Returns the tensors in the index statement. std::vector getTensorVars(IndexStmt stmt); diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index 62ac9e5fe..a3ffd6173 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -333,17 +333,19 @@ class LowererImpl : public util::Uncopyable { ir::Stmt codeToInitializeIteratorVars(std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coord, IndexVar coordinateVar); ir::Stmt codeToInitializeIteratorVar(Iterator iterator, std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coordinate, IndexVar coordinateVar); + /// Initializes a temporary workspace + std::vector codeToInitializeTemporary(Where where); /// Recovers a derived indexvar from an underived variable. ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl); - /// Conditionally increment iterator position variables. + /// Conditionally increment iterator position variables. ir::Stmt codeToIncIteratorVars(ir::Expr coordinate, IndexVar coordinateVar, std::vector iterators, std::vector mergers); ir::Stmt codeToLoadCoordinatesFromPosIterators(std::vector iterators, bool declVars); - /// Create statements to append coordinate to result modes. + /// Create statements to append coordinate to result modes. ir::Stmt appendCoordinate(std::vector appenders, ir::Expr coord); /// Create statements to append positions to result modes. 
@@ -363,6 +365,9 @@ class LowererImpl : public util::Uncopyable { int markAssignsAtomicDepth = 0; ParallelUnit atomicParallelUnit; + /// Map used to hoist temporary workspace initialization + std::map temporaryInitialization; + /// Map from tensor variables in index notation to variables in the IR std::map tensorVars; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 1f857a5fc..e9fa934a4 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2332,6 +2332,22 @@ vector getArguments(IndexStmt stmt) { return result; } +std::map getTemporaryLocations(IndexStmt stmt) { + map temporaryLocs; + Forall f = Forall(); + match(stmt, + function([&](const ForallNode* op, Matcher* ctx) { + f = op; + ctx->match(op->stmt); + }), + function([&](const WhereNode* w, Matcher* ctx) { + if (!(f == IndexStmt())) + temporaryLocs.insert({f, Where(w)}); + }) + ); + return temporaryLocs; +} + std::vector getTemporaries(IndexStmt stmt) { vector temporaries; bool firstAssignment = true; diff --git a/src/lower/lower.cpp b/src/lower/lower.cpp index 86389ac1d..e24406543 100644 --- a/src/lower/lower.cpp +++ b/src/lower/lower.cpp @@ -12,6 +12,7 @@ #include "taco/ir/ir.h" #include "taco/ir/simplify.h" #include "ir/ir_generators.h" +#include "taco/ir/ir_printer.h" #include "taco/lower/lowerer_impl.h" #include "taco/lower/iterator.h" diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 7a0af13b3..2297f73c3 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -118,6 +118,9 @@ LowererImpl::lower(IndexStmt stmt, string name, vector arguments = getArguments(stmt); vector temporaries = getTemporaries(stmt); + // Create datastructure needed for temporary workspace hoisting/reuse + temporaryInitialization = getTemporaryLocations(stmt); + // Convert tensor results and arguments IR variables map resultVars; vector resultsIR = createVars(results, &resultVars, unpack); @@ -382,11 +385,29 @@ Stmt LowererImpl::lowerForall(Forall forall) taco_iassert(indexVarToExprMap.count(varToRecover)); recoverySteps.push_back(VarDecl::make(indexVarToExprMap[varToRecover], recoveredValue)); // place underived guard + std::vector iterBounds = provGraph.deriveIterBounds(varToRecover, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); if (forallNeedsUnderivedGuards && underivedBounds.count(varToRecover) && !provGraph.hasPosDescendant(varToRecover)) { - Stmt guard = IfThenElse::make(Gte::make(indexVarToExprMap[varToRecover], underivedBounds[varToRecover][1]), - Break::make()); - recoverySteps.push_back(guard); + + // FIXME: [Olivia] Check this with someone + // Removed underived guard if indexVar is bounded is divisible by its split child indexVar + vector children = provGraph.getChildren(varToRecover); + bool hasDirectDivBound = false; + std::vector iterBoundsInner = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + + for (auto& c: children) { + if (provGraph.hasExactBound(c) && provGraph.derivationPath(varToRecover, c).size() == 2) { + std::vector iterBoundsUnderivedChild = provGraph.deriveIterBounds(c, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + if (iterBoundsUnderivedChild[1].as()->getValue() % iterBoundsInner[1].as()->getValue() == 0) + hasDirectDivBound = true; + break; + } + } + if (!hasDirectDivBound) { + Stmt guard = IfThenElse::make(Gte::make(indexVarToExprMap[varToRecover], 
underivedBounds[varToRecover][1]), + Break::make()); + recoverySteps.push_back(guard); + } } } Stmt recoveryStmt = Block::make(recoverySteps); @@ -414,6 +435,12 @@ Stmt LowererImpl::lowerForall(Forall forall) getArgumentAccesses(forall), reducedAccesses); + // Emit temporary initialization if forall is sequential and leads to a where statement + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + auto temp = temporaryInitialization.find(forall); + if (temp != temporaryInitialization.end() && forall.getParallelUnit() == ParallelUnit::NotParallel && !isScalar(temp->second.getTemporary().getType())) + temporaryValuesInitFree = codeToInitializeTemporary(temp->second); + Stmt loops; // Emit a loop that iterates over over a single iterator (optimization) if (lattice.iterators().size() == 1 && lattice.iterators()[0].isUnique()) { @@ -491,7 +518,9 @@ Stmt LowererImpl::lowerForall(Forall forall) parallelUnitSizes.erase(forall.getParallelUnit()); } return Block::blanks(preInitValues, - loops); + temporaryValuesInitFree[0], + loops, + temporaryValuesInitFree[1]); } Stmt LowererImpl::lowerForallCloned(Forall forall) { @@ -1272,39 +1301,36 @@ Stmt LowererImpl::lowerForallBody(Expr coordinate, IndexStmt stmt, appendCoords); } - -Stmt LowererImpl::lowerWhere(Where where) { +vector LowererImpl::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); - // Declare and initialize the where statement's temporary - Stmt initializeTemporary = Stmt(); Stmt freeTemporary = Stmt(); + Stmt initializeTemporary = Stmt(); if (isScalar(temporary.getType())) { initializeTemporary = defineScalarVariable(temporary, true); - } - else { + } else { if (generateComputeCode()) { Expr values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " << temporary.getType().getOrder(); // TODO + taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " + << temporary.getType().getOrder(); // TODO Dimension temporarySize = temporary.getType().getShape().getDimension(0); Expr size; if (temporarySize.isFixed()) { size = ir::Literal::make(temporarySize.getSize()); - } - else if (temporarySize.isIndexVarSized()) { + } else if (temporarySize.isIndexVarSized()) { IndexVar var = temporarySize.getIndexVarSize(); - vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, + indexVarToExprMap, iterators); size = ir::Sub::make(bounds[1], bounds[0]); - } - else { + } else { taco_ierror; // TODO } // no decl needed for shared memory Stmt decl = Stmt(); - if((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { + if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); } Stmt allocate = Allocate::make(values, size); @@ -1313,17 +1339,36 @@ Stmt LowererImpl::lowerWhere(Where where) { Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); - freeTemporary = Free::make(values); - /// Make a struct object that lowerAssignment and lowerAccess can read /// temporary value arrays from. 
TemporaryArrays arrays; arrays.values = values; this->temporaryArrays.insert({temporary, arrays}); + freeTemporary = Free::make(values); initializeTemporary = Block::make(decl, allocate, zeroInitLoop); } } + return {initializeTemporary, freeTemporary}; +} + +Stmt LowererImpl::lowerWhere(Where where) { + TensorVar temporary = where.getTemporary(); + + // Declare and initialize the where statement's temporary + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + bool temporaryHoisted = false; + for (auto it = temporaryInitialization.begin(); it != temporaryInitialization.end(); ++it) { + if (it->second == where && it->first.getParallelUnit() == ParallelUnit::NotParallel && !isScalar(temporary.getType())) { + temporaryHoisted = true; + } + } + + if (!temporaryHoisted) + temporaryValuesInitFree = codeToInitializeTemporary(where); + + Stmt initializeTemporary = temporaryValuesInitFree[0]; + Stmt freeTemporary = temporaryValuesInitFree[1]; match(where.getConsumer(), std::function([&](const AssignmentNode* op) { @@ -1354,7 +1399,7 @@ Stmt LowererImpl::lowerWhere(Where where) { whereConsumers.pop_back(); whereTemps.pop_back(); whereTempsToResult.erase(where.getTemporary()); - return Block::make(initializeTemporary, producer, markAssignsAtomicDepth > 0 ? capturedLocatePos : ir::Stmt(), consumer, freeTemporary); + return Block::make(initializeTemporary, producer, markAssignsAtomicDepth > 0 ? capturedLocatePos : ir::Stmt(), consumer, freeTemporary); } diff --git a/test/tests-scheduling.cpp b/test/tests-scheduling.cpp index 7adc3ca88..b7ba88ecc 100644 --- a/test/tests-scheduling.cpp +++ b/test/tests-scheduling.cpp @@ -84,6 +84,7 @@ TEST(scheduling, lowerDenseMatrixMul) { } } + cout << "-------PACKING---------" << endl; A.pack(); B.pack(); @@ -96,21 +97,29 @@ TEST(scheduling, lowerDenseMatrixMul) { .split(j, j0, j1, 2) .split(k, k0, k1, 2) .reorder({i0, j0, k0, i1, j1, k1}); + cout << "-------COMPILING---------" << endl; C.compile(stmt); + cout << "-------ASSEMBLING---------" << endl; C.assemble(); + cout << "-------COMPUTING---------" << endl; C.compute(); Tensor expected("expected", {4, 4}, {Dense, Dense}); expected(i, j) = A(i, k) * B(k, j); + IndexStmt expected_stmt = C.getAssignment().concretize(); expected.compile(); expected.assemble(); expected.compute(); ASSERT_TENSOR_EQ(C, expected); - // std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - // ir::Stmt compute = lower(stmt, "compute", false, true); - // codegen->compile(compute, true); + cout << stmt << endl; + + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", true, true); + codegen->compile(compute, true); + ir::Stmt expected_compute = lower(expected_stmt, "compute", false, true); + //codegen->compile(expected_compute, true); } TEST(scheduling, lowerSparseCopy) { diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp new file mode 100644 index 000000000..d2b5157f5 --- /dev/null +++ b/test/tests-workspaces.cpp @@ -0,0 +1,186 @@ +#include +#include +#include +#include "test.h" +#include "test_tensors.h" +#include "taco/tensor.h" +#include "taco/index_notation/index_notation.h" +#include "codegen/codegen.h" +#include "taco/lower/lower.h" + +using namespace taco; + +TEST(workspaces, tile_vecElemMul_NoTail) { + + Tensor A("A", {16}, {Dense}); + Tensor B("B", {16}, {Dense}); + Tensor C("C", {16}, {Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + 
A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); +} + +TEST(workspaces, tile_vecElemMul_Tail1) { + + Tensor A("A", {16}, {Dense}); + Tensor B("B", {16}, {Dense}); + Tensor C("C", {16}, {Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) + .split(i_bounded, i0, i1, 5) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); +} + +TEST(workspaces, tile_vecElemMul_Tail2) { + + Tensor A("A", {17}, {Dense}); + Tensor B("B", {17}, {Dense}); + Tensor C("C", {17}, {Dense}); + + for (int i = 0; i < 17; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {17}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); + +// ir::IRPrinter irp = ir::IRPrinter(cout); +// +// cout << stmt << endl; +// +// std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); +// ir::Stmt compute = lower(stmt, "compute", false, true); +// +// irp.print(compute); +// cout << endl; +// codegen->compile(compute, false); +} + +TEST(workspaces, tile_denseMatMul) { + + Tensor A("A", {16}, {Dense}); + Tensor B("B", {16}, {Dense}); + Tensor C("C", {16}, {Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) + .split(i_bounded, 
i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); + +// ir::IRPrinter irp = ir::IRPrinter(cout); +// +// cout << stmt << endl; +// +// std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); +// ir::Stmt compute = lower(stmt, "compute", false, true); +// +// irp.print(compute); +// cout << endl; +// codegen->compile(compute, false); + +} From e649a67550b718a3a89a15d0ee953e07a404e30c Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Wed, 14 Oct 2020 22:10:39 -0700 Subject: [PATCH 02/14] Fix some workspaces tests --- test/tests-workspaces.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp index d2b5157f5..bd035a8b1 100644 --- a/test/tests-workspaces.cpp +++ b/test/tests-workspaces.cpp @@ -32,11 +32,13 @@ TEST(workspaces, tile_vecElemMul_NoTail) { IndexStmt stmt = A.getAssignment().concretize(); TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); - stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) .split(i_bounded, i0, i1, 4) .precompute(precomputedExpr, i1, i1, precomputed); - A.compile(stmt.concretize()); + cout << stmt << endl; + + A.compile(stmt); A.assemble(); A.compute(); From 01166afa48c77e0b3565126aec4fe8b5db7ce850 Mon Sep 17 00:00:00 2001 From: Rawn Date: Sun, 29 Nov 2020 21:18:00 -0800 Subject: [PATCH 03/14] Prototypes automatically generating code to to have sparse iteration over a dense workspace --- include/taco/ir/ir.h | 10 +- include/taco/ir/ir_printer.h | 1 + include/taco/ir/ir_rewriter.h | 1 + include/taco/ir/ir_visitor.h | 3 + include/taco/lower/lowerer_impl.h | 29 ++ src/codegen/codegen_c.cpp | 1 + src/index_notation/index_notation_printer.cpp | 7 +- src/ir/ir.cpp | 9 + src/ir/ir_printer.cpp | 16 +- src/ir/ir_rewriter.cpp | 18 ++ src/ir/ir_visitor.cpp | 5 + src/lower/lowerer_impl.cpp | 297 ++++++++++++++++-- 12 files changed, 369 insertions(+), 28 deletions(-) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 1f7e17918..bbb36c12b 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -65,7 +65,8 @@ enum class IRNodeType { BlankLine, Print, GetProperty, - Break + Break, + Sort }; enum class TensorProperty { @@ -725,6 +726,13 @@ struct Break : public StmtNode { static const IRNodeType _type_info = IRNodeType::Break; }; +struct Sort : public StmtNode { + std::vector args; + static Stmt make(std::vector args); + + static const IRNodeType _type_info = IRNodeType::Sort; +}; + /** A print statement. * Takes in a printf-style format string and Exprs to pass * for the values. 
diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 759d21ad3..1c264b7f9 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -68,6 +68,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); virtual void visit(const Print*); virtual void visit(const GetProperty*); + virtual void visit(const Sort*); std::ostream &stream; int indent; diff --git a/include/taco/ir/ir_rewriter.h b/include/taco/ir/ir_rewriter.h index efb9eaf89..81ad43705 100644 --- a/include/taco/ir/ir_rewriter.h +++ b/include/taco/ir/ir_rewriter.h @@ -68,6 +68,7 @@ class IRRewriter : public IRVisitorStrict { virtual void visit(const Break* op); virtual void visit(const Print* op); virtual void visit(const GetProperty* op); + virtual void visit(const Sort *op); }; }} diff --git a/include/taco/ir/ir_visitor.h b/include/taco/ir/ir_visitor.h index f6331035b..810e4f758 100644 --- a/include/taco/ir/ir_visitor.h +++ b/include/taco/ir/ir_visitor.h @@ -48,6 +48,7 @@ struct BlankLine; struct Break; struct Print; struct GetProperty; +struct Sort; /// Extend this class to visit every node in the IR. class IRVisitorStrict { @@ -98,6 +99,7 @@ class IRVisitorStrict { virtual void visit(const Break*) = 0; virtual void visit(const Print*) = 0; virtual void visit(const GetProperty*) = 0; + virtual void visit(const Sort*) = 0; }; @@ -151,6 +153,7 @@ class IRVisitor : public IRVisitorStrict { virtual void visit(const Break* op); virtual void visit(const Print* op); virtual void visit(const GetProperty* op); + virtual void visit(const Sort* op); }; }} diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index a3ffd6173..71783c6ad 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -81,6 +81,16 @@ class LowererImpl : public util::Uncopyable { std::set reducedAccesses, ir::Stmt recoveryStmt); + /// Lower a forall that iterates over all the coordinates in the forall index + /// var's dimension, and locates tensor positions from the locate iterators. + virtual ir::Stmt lowerForallDenseAcceleration(Forall forall, + std::vector locaters, + std::vector inserters, + std::vector appenders, + std::set reducedAccesses, + ir::Stmt recoveryStmt); + + /// Lower a forall that iterates over the coordinates in the iterator, and /// locates tensor positions from the locate iterators. virtual ir::Stmt lowerForallCoordinate(Forall forall, Iterator iterator, @@ -333,9 +343,19 @@ class LowererImpl : public util::Uncopyable { ir::Stmt codeToInitializeIteratorVars(std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coord, IndexVar coordinateVar); ir::Stmt codeToInitializeIteratorVar(Iterator iterator, std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coordinate, IndexVar coordinateVar); + /// Returns true iff the temporary used in the where statement is dense and sparse iteration over that + /// temporary can be automaticallty supported by the compiler. + bool canAccelerateDenseTemp(Where where); + /// Initializes a temporary workspace std::vector codeToInitializeTemporary(Where where); + /// Gets the size of a temporary tensorVar + ir::Expr getTemporarySize(TensorVar var); + + /// Initializes helper arrays to give dense workspaces sparse acceleration + std::vector codeToInitializeDenseAcceleratorArrays(Where where); + /// Recovers a derived indexvar from an underived variable. 
ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl); @@ -376,6 +396,15 @@ class LowererImpl : public util::Uncopyable { }; std::map temporaryArrays; + /// Map from temporary to indexList var if accelerating dense workspace + std::map tempToIndexList; + + /// Map from temporary to indexListSize if accelerating dense workspace + std::map tempToIndexListSize; + + /// Map from temporary to bitGuard var if accelerating dense workspace + std::map tempToBitGuard; + /// Map from result tensors to variables tracking values array capacity. std::map capacityVars; diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 204aa1e2d..9611d09f5 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -190,6 +190,7 @@ class CodeGen_C::FindVars : public IRVisitor { if (!util::contains(localVars, op->var)) { localVars.push_back(op->var); } + op->var.accept(this); op->rhs.accept(this); } diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 58305077e..ba633731d 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -81,7 +81,12 @@ void IndexNotationPrinter::visit(const NegNode* op) { Precedence precedence = Precedence::NEG; bool parenthesize = precedence > parentPrecedence; parentPrecedence = precedence; - os << "-"; + if(op->getDataType().isBool()) { + os << "!"; + } else { + os << "-"; + } + if (parenthesize) { os << "("; } diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index e5225f502..a714dddfc 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -817,6 +817,13 @@ Expr GetProperty::make(Expr tensor, TensorProperty property, int mode, return gp; } +// Sort +Stmt Sort::make(std::vector args) { + Sort* sort = new Sort; + sort->args = args; + return sort; +} + // GetProperty Expr GetProperty::make(Expr tensor, TensorProperty property, int mode) { @@ -953,6 +960,8 @@ template<> void StmtNode::accept(IRVisitorStrict *v) const { v->visit((const Print*)this); } template<> void ExprNode::accept(IRVisitorStrict *v) const { v->visit((const GetProperty*)this); } +template<> void StmtNode::accept(IRVisitorStrict *v) + const { v->visit((const Sort*)this); } // printing methods std::ostream& operator<<(std::ostream& os, const Stmt& stmt) { diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index 0fca68786..be654e295 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -131,7 +131,11 @@ void IRPrinter::visit(const Var* op) { } void IRPrinter::visit(const Neg* op) { - stream << "-"; + if(op->type.isBool()) { + stream << "!"; + } else { + stream << "-"; + } parentPrecedence = Precedence::NEG; op->a.accept(this); } @@ -575,6 +579,16 @@ void IRPrinter::visit(const GetProperty* op) { stream << op->name; } +void IRPrinter::visit(const Sort* op) { + doIndent(); + stream << "qsort("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ");"; + stream << endl; +} + + void IRPrinter::resetNameCounters() { // seed the unique names with all C99 keywords // from: http://en.cppreference.com/w/c/keyword diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp index 1a1c91f23..fd1423a00 100644 --- a/src/ir/ir_rewriter.cpp +++ b/src/ir/ir_rewriter.cpp @@ -479,5 +479,23 @@ void IRRewriter::visit(const GetProperty* op) { } } +void IRRewriter::visit(const Sort* op) { + std::vector args; + bool rewritten = false; + for (auto& arg : op->args) { + Expr rewrittenArg = rewrite(arg); + 
args.push_back(rewrittenArg); + if (rewrittenArg != arg) { + rewritten = true; + } + } + if (rewritten) { + stmt = Sort::make(args); + } + else { + stmt = op; + } +} + }} diff --git a/src/ir/ir_visitor.cpp b/src/ir/ir_visitor.cpp index 8a1baf6cf..19fbfbfdf 100644 --- a/src/ir/ir_visitor.cpp +++ b/src/ir/ir_visitor.cpp @@ -236,5 +236,10 @@ void IRVisitor::visit(const Print* op) { e.accept(this); } +void IRVisitor::visit(const Sort* op) { + for (auto e: op->args) + e.accept(this); +} + } // namespace ir } // namespace taco diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 2297f73c3..a4f05a879 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -267,10 +267,11 @@ LowererImpl::lower(IndexStmt stmt, string name, Stmt LowererImpl::lowerAssignment(Assignment assignment) { TensorVar result = assignment.getLhs().getTensorVar(); + Stmt computeStmt; + Expr rhs = lower(assignment.getRhs()); if (generateComputeCode()) { Expr var = getTensorVar(result); - Expr rhs = lower(assignment.getRhs()); // Assignment to scalar variables. if (isScalar(result.getType())) { @@ -288,7 +289,6 @@ Stmt LowererImpl::lowerAssignment(Assignment assignment) Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); - Stmt computeStmt; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs, markAssignsAtomicDepth > 0, atomicParallelUnit); } @@ -296,13 +296,46 @@ Stmt LowererImpl::lowerAssignment(Assignment assignment) computeStmt = compoundStore(values, loc, rhs, markAssignsAtomicDepth > 0, atomicParallelUnit); } taco_iassert(computeStmt.defined()); - return computeStmt; } } - // We're only assembling so defer allocating value memory to the end when - // we'll know exactly how much we need. - else if (generateAssembleCode()) { - // TODO + // TODO: If only assembling so defer allocating value memory to the end when + // we'll know exactly how much we need. 
+ if (generateAssembleCode() || generateComputeCode()) { + + bool temporaryWithSparseAcceleration = util::contains(tempToIndexList, result); + if(generateComputeCode() && !temporaryWithSparseAcceleration) { + taco_iassert(computeStmt.defined()); + return computeStmt; + } + + if(temporaryWithSparseAcceleration) { + Expr values = getValuesArray(result); + Expr loc = generateValueLocExpr(assignment.getLhs()); + Stmt initialStorage = computeStmt; + if(assignment.getOperator().defined()) { + // computeStmt is a compund stmt so we need to emit an initial store into the temporary + initialStorage = Store::make(values, loc, rhs, markAssignsAtomicDepth > 0, atomicParallelUnit); + } + + Expr bitGuardArr = tempToBitGuard.at(result); + Expr indexList = tempToIndexList.at(result); + Expr indexListSize = tempToIndexListSize.at(result); + + Stmt markBitGuardAsTrue = Store::make(bitGuardArr, loc, ir::Literal::make(true), markAssignsAtomicDepth > 0, atomicParallelUnit); + Stmt trackIndex = Store::make(indexList, indexListSize, loc, markAssignsAtomicDepth > 0, atomicParallelUnit); + Expr incrementSize = ir::Add::make(indexListSize, ir::Literal::make(1)); + Stmt incrementStmt = Assign::make(indexListSize, incrementSize, markAssignsAtomicDepth > 0, atomicParallelUnit); + + Stmt firstWriteAtIndex = Block::make(initialStorage, trackIndex, markBitGuardAsTrue, incrementStmt); + if(!generateComputeCode()) { + firstWriteAtIndex = Block::make(trackIndex, markBitGuardAsTrue, incrementStmt); + } + + Expr readBitGuard = Load::make(bitGuardArr, loc); + Stmt finalStmt = IfThenElse::make(ir::Neg::make(readBitGuard), firstWriteAtIndex, computeStmt); + return finalStmt; + } + return Stmt(); } // We're neither assembling or computing so we emit nothing. @@ -473,10 +506,27 @@ Stmt LowererImpl::lowerForall(Forall forall) } } } + + // For now, this only works when consuming a single workspace. + bool canAccelWithSparseIteration = inParallelLoopDepth == 0 && provGraph.isFullyDerived(iterator.getIndexVar()); + if (canAccelWithSparseIteration && iterator.isDimensionIterator() && locators.size() == 1) { + // We are iterating over a dimension and locating into a temporary with a tracker to keep indices. Instead, we + // can just iterate over the indices and locate into the dense workspace. 
+ for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { + if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { + canAccelWithSparseIteration = true; + break; + } + } + } + if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } + else if (canAccelWithSparseIteration) { + loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); + } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) { loops = lowerForallDimension(forall, point.locators(), @@ -853,6 +903,64 @@ Stmt LowererImpl::lowerForallDimension(Forall forall, posAppend); } + Stmt LowererImpl::lowerForallDenseAcceleration(Forall forall, + vector locators, + vector inserters, + vector appenders, + set reducedAccesses, + ir::Stmt recoveryStmt) + { + taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; + taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; + taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; + + + TensorVar var; + for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { + if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { + var = it->first; + break; + } + } + + Expr indexList = tempToIndexList.at(var); + Expr indexListSize = tempToIndexListSize.at(var); + Expr bitGuard = tempToBitGuard.at(var); + Expr loopVar = ir::Var::make(var.getName() + "_index_locator", taco::Int32, false, false); + Expr coordinate = getCoordinateVar(forall.getIndexVar()); + + if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { + markAssignsAtomicDepth++; + atomicParallelUnit = forall.getParallelUnit(); + } + + Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); + Stmt resetGuard = ir::Store::make(bitGuard, loopVar, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); + body = Block::make(declareVar, body, resetGuard); + + if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { + markAssignsAtomicDepth--; + } + + body = Block::make({recoveryStmt, body}); + + Stmt posAppend = generateAppendPositions(appenders); + + LoopKind kind = LoopKind::Serial; + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Vectorized; + } + else if (forall.getParallelUnit() != ParallelUnit::NotParallel + && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { + kind = LoopKind::Runtime; + } + + return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, + ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), + ignoreVectorize ? 
0 : forall.getUnrollFactor()), + posAppend); + } Stmt LowererImpl::lowerForallCoordinate(Forall forall, Iterator iterator, vector locators, @@ -1301,32 +1409,156 @@ Stmt LowererImpl::lowerForallBody(Expr coordinate, IndexStmt stmt, appendCoords); } +Expr LowererImpl::getTemporarySize(TensorVar temporary) { + Dimension temporarySize = temporary.getType().getShape().getDimension(0); + + if (temporarySize.isFixed()) { + return ir::Literal::make(temporarySize.getSize()); + } + + if (temporarySize.isIndexVarSized()) { + IndexVar var = temporarySize.getIndexVarSize(); + vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, + indexVarToExprMap, iterators); + return ir::Sub::make(bounds[1], bounds[0]); + } + + taco_ierror; // TODO + return Expr(); +} + +vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { + TensorVar temporary = where.getTemporary(); + + // TODO: emit as uint64 and manually emit bit pack code + const Datatype bitGuardType = taco::Bool; + const std::string bitGuardName = temporary.getName() + "_already_set"; + const Expr bitGuardSize = getTemporarySize(temporary); + const Expr alreadySetArr = ir::Var::make(bitGuardName, + bitGuardType, + true, false); + + // TODO: TACO should probably keep state on if it can use int32 or if it should switch to + // using int64 for indices. This assumption is made in other places of taco. + const Datatype indexListType = taco::Int32; + const std::string indexListName = temporary.getName() + "_index_list"; + const Expr indexListArr = ir::Var::make(indexListName, + indexListType, + true, false); + + // no decl for shared memory + Stmt alreadySetDecl = Stmt(); + Stmt indexListDecl = Stmt(); + const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); + const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); + Stmt freeTemps = Block::make(Free::make(indexListArr), Free::make(alreadySetArr)); + if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { + alreadySetDecl = VarDecl::make(alreadySetArr, ir::Literal::make(0)); + indexListDecl = VarDecl::make(indexListArr, ir::Literal::make(0)); + } + + tempToIndexList[temporary] = indexListArr; + tempToIndexListSize[temporary] = indexListSizeExpr; + tempToBitGuard[temporary] = alreadySetArr; + + Stmt allocateIndexList = Allocate::make(indexListArr, bitGuardSize); + if(should_use_CUDA_codegen()) { + Stmt allocateAlreadySet = Allocate::make(alreadySetArr, bitGuardSize); + Expr p = Var::make("p" + temporary.getName(), Int()); + Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + + Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); + Stmt inits = Block::make(indexListSizeDecl, alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); + return {inits, freeTemps}; + } else { + Expr sizeOfElt = Sizeof::make(bitGuardType); + Expr callocAlreadySet = ir::Call::make("calloc", {bitGuardSize, sizeOfElt}, Int()); + Stmt allocateAlreadySet = VarDecl::make(alreadySetArr, callocAlreadySet); + Stmt inits = Block::make(indexListSizeDecl, indexListDecl, allocateIndexList, allocateAlreadySet); + return {inits, freeTemps}; + } + +} + +// Returns true if the following conditions are met: +// 1) The temporary is a dense vector +// 2) There is only one value on the right hand side of the consumer +// -- We would need to handle sparse acceleration in the merge lattices for multiple 
operands on the RHS +// 3) There are no reduced accesses +// 4) The left hand side of the where consumer is sparse +// 5) CPU Code is being generated (TEMPORARY - This should be removed) +// -- The sorting calls and calloc call in lower where are CPU specific. We could map calloc to a cudaMalloc +// and use a library like CUB to emit the sort. CUB support is built into CUDA 11 but not prior versions +// of CUDA so in that case, we'd probably need to include the CUB headers in the generated code. +bool LowererImpl::canAccelerateDenseTemp(Where where) { + TensorVar temporary = where.getTemporary(); + // (1) Temporary is dense vector + if(!isDense(temporary.getFormat()) || temporary.getOrder() == 1) return false; + + vector inputAccesses, resultAccesses; + set reducedAccesses; + + inputAccesses = getArgumentAccesses(where.getConsumer()); + // (2) Multiple operands in inputs (need lattice to reason about iteration) + if(inputAccesses.size() > 1 || inputAccesses.empty()) return false; + + std::tie(resultAccesses, reducedAccesses) = getResultAccesses(where.getConsumer()); + // (3) Contains reduced accesses + if(!reducedAccesses.empty()) return false; + + // no or multiple results? + if(resultAccesses.size() > 1 || resultAccesses.empty()) return false; + + // (4) Level of result is sparse + // No check for size of tempVar since we enforced the temporary is a vector and if there is only one RHS value, + // it must (should?) be the temporary + std::vector tempVar = inputAccesses[0].getIndexVars(); + + // Get vars in result. + std::vector resultVars = resultAccesses[0].getIndexVars(); + auto it = std::find(resultVars.begin(), resultVars.end(), tempVar[0]); + int index = it != resultVars.end()? (int)(it - resultVars.begin()) + 1: -1; + + // Var used in input is not in result? Probably would fail earlier but here just in case. + if(index == -1) return false; + + int modeIndex = resultAccesses[0].getTensorVar().getFormat().getModeOrdering()[index]; + ModeFormat varFmt = resultAccesses[0].getTensorVar().getFormat().getModeFormats()[modeIndex]; + + // Actual check for condition (4). 
If the current mode is full, no optimizations necessary + if(varFmt.isFull()) return false; + + // TODO: TEMPORARY -- Needs to be removed + if(should_use_CUDA_codegen()) return false; + + return true; +} + vector LowererImpl::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); + bool accelerateDense = canAccelerateDenseTemp(where); + Stmt freeTemporary = Stmt(); Stmt initializeTemporary = Stmt(); if (isScalar(temporary.getType())) { initializeTemporary = defineScalarVariable(temporary, true); } else { + // When emitting code to accelerate dense workspaces with sparse iteration, we need the following arrays + // to construct the result indices + if(accelerateDense) { + vector initAndFree = codeToInitializeDenseAcceleratorArrays(where); + initializeTemporary = initAndFree[0]; + freeTemporary = initAndFree[1]; + } + if (generateComputeCode()) { Expr values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " << temporary.getType().getOrder(); // TODO - Dimension temporarySize = temporary.getType().getShape().getDimension(0); - Expr size; - if (temporarySize.isFixed()) { - size = ir::Literal::make(temporarySize.getSize()); - } else if (temporarySize.isIndexVarSized()) { - IndexVar var = temporarySize.getIndexVarSize(); - vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, - indexVarToExprMap, iterators); - size = ir::Sub::make(bounds[1], bounds[0]); - } else { - taco_ierror; // TODO - } + Expr size = getTemporarySize(temporary); // no decl needed for shared memory Stmt decl = Stmt(); @@ -1334,10 +1566,14 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { decl = VarDecl::make(values, ir::Literal::make(0)); } Stmt allocate = Allocate::make(values, size); - - Expr p = Var::make("p" + temporary.getName(), Int()); - Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + // If we are using acceleration of the dense workspace, we do not need to initialize the values array + // since the bit guard will take care of setting the value array when necessary + Stmt zeroInitLoop = Stmt(); + if(!accelerateDense) { + Expr p = Var::make("p" + temporary.getName(), Int()); + Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + } /// Make a struct object that lowerAssignment and lowerAccess can read /// temporary value arrays from. 
@@ -1345,8 +1581,8 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { arrays.values = values; this->temporaryArrays.insert({temporary, arrays}); - freeTemporary = Free::make(values); - initializeTemporary = Block::make(decl, allocate, zeroInitLoop); + freeTemporary = Block::make(freeTemporary, Free::make(values)); + initializeTemporary = Block::make(decl, initializeTemporary, allocate, zeroInitLoop); } } return {initializeTemporary, freeTemporary}; @@ -1354,6 +1590,7 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { Stmt LowererImpl::lowerWhere(Where where) { TensorVar temporary = where.getTemporary(); + bool accelarateDenseWorkSpace = canAccelerateDenseTemp(where); // Declare and initialize the where statement's temporary vector temporaryValuesInitFree = {Stmt(), Stmt()}; @@ -1379,6 +1616,16 @@ Stmt LowererImpl::lowerWhere(Where where) { ); Stmt consumer = lower(where.getConsumer()); + if(accelarateDenseWorkSpace) { + // We need to sort the indices array + Expr listOfIndices = tempToIndexList.at(temporary); + Expr listOfIndicesSize = tempToIndexListSize.at(temporary); + Expr sizeOfElt = ir::Sizeof::make(listOfIndices.type()); + Expr cmpName = ir::Var::make("cmp", Int()); + Stmt sortCall = ir::Sort::make( {listOfIndices, listOfIndicesSize, sizeOfElt, cmpName}); + consumer = Block::make(sortCall, consumer); + } + whereConsumers.push_back(consumer); whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; From 12b51f00790895758f9dbc085ab6e3a2ebb04390 Mon Sep 17 00:00:00 2001 From: Rawn Date: Sat, 5 Dec 2020 20:57:26 -0800 Subject: [PATCH 04/14] Fixes bugs in check for accelerating workspace --- src/lower/lowerer_impl.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index a4f05a879..e338f9c4e 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -508,16 +508,19 @@ Stmt LowererImpl::lowerForall(Forall forall) } // For now, this only works when consuming a single workspace. - bool canAccelWithSparseIteration = inParallelLoopDepth == 0 && provGraph.isFullyDerived(iterator.getIndexVar()); - if (canAccelWithSparseIteration && iterator.isDimensionIterator() && locators.size() == 1) { + bool canAccelWithSparseIteration = inParallelLoopDepth == 0 && provGraph.isFullyDerived(iterator.getIndexVar()) && + iterator.isDimensionIterator() && locators.size() == 1; + if (canAccelWithSparseIteration) { + bool indexListsExist = false; // We are iterating over a dimension and locating into a temporary with a tracker to keep indices. Instead, we // can just iterate over the indices and locate into the dense workspace. 
for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { - canAccelWithSparseIteration = true; + indexListsExist = true; break; } } + canAccelWithSparseIteration &= indexListsExist; } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { @@ -1493,7 +1496,7 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { bool LowererImpl::canAccelerateDenseTemp(Where where) { TensorVar temporary = where.getTemporary(); // (1) Temporary is dense vector - if(!isDense(temporary.getFormat()) || temporary.getOrder() == 1) return false; + if(!isDense(temporary.getFormat()) || temporary.getOrder() != 1) return false; vector inputAccesses, resultAccesses; set reducedAccesses; @@ -1517,7 +1520,7 @@ bool LowererImpl::canAccelerateDenseTemp(Where where) { // Get vars in result. std::vector resultVars = resultAccesses[0].getIndexVars(); auto it = std::find(resultVars.begin(), resultVars.end(), tempVar[0]); - int index = it != resultVars.end()? (int)(it - resultVars.begin()) + 1: -1; + int index = it != resultVars.end()? (int)(it - resultVars.begin()): -1; // Var used in input is not in result? Probably would fail earlier but here just in case. if(index == -1) return false; From c8972c052fef31016a42f478d5c55198922ec84e Mon Sep 17 00:00:00 2001 From: Rawn Date: Sat, 5 Dec 2020 22:30:58 -0800 Subject: [PATCH 05/14] Fixes bug in concreteNotation check. All workspace tests pass. --- src/index_notation/index_notation.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 3271e3bda..9c605684c 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2118,8 +2118,23 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return; } + // Handles derived vars on RHS with underived vars on LHS. 
+ Assignment assignPtrWrapper = Assignment(op); + std::vector possibleReductionVars = assignPtrWrapper.getReductionVars(); + std::vector freeVars = assignPtrWrapper.getFreeVars(); + std::set freeVarsSet(freeVars.begin(), freeVars.end()); + + int numReductionVars = 0; + for(const auto& reductionVar : possibleReductionVars) { + std::vector underivedParents = provGraph.getUnderivedAncestors(reductionVar); + for(const auto& parent : underivedParents) { + if(!util::contains(freeVarsSet, parent)) { + ++numReductionVars; + } + } + } // allow introducing precompute loops where we set a temporary to values instead of += - if (Assignment(op).getReductionVars().size() > 0 && + if (numReductionVars > 0 && op->op == IndexExpr() && !inWhereProducer) { *reason = "reduction variables in concrete notation must be dominated " "by compound assignments (such as +=)"; From 4895917f210b93729a4c12b0f70044a90688c9e5 Mon Sep 17 00:00:00 2001 From: Rawn Date: Sat, 5 Dec 2020 23:30:17 -0800 Subject: [PATCH 06/14] Removes print statements --- test/tests-scheduling.cpp | 15 +++------------ test/tests-workspaces.cpp | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/test/tests-scheduling.cpp b/test/tests-scheduling.cpp index b7ba88ecc..7adc3ca88 100644 --- a/test/tests-scheduling.cpp +++ b/test/tests-scheduling.cpp @@ -84,7 +84,6 @@ TEST(scheduling, lowerDenseMatrixMul) { } } - cout << "-------PACKING---------" << endl; A.pack(); B.pack(); @@ -97,29 +96,21 @@ TEST(scheduling, lowerDenseMatrixMul) { .split(j, j0, j1, 2) .split(k, k0, k1, 2) .reorder({i0, j0, k0, i1, j1, k1}); - cout << "-------COMPILING---------" << endl; C.compile(stmt); - cout << "-------ASSEMBLING---------" << endl; C.assemble(); - cout << "-------COMPUTING---------" << endl; C.compute(); Tensor expected("expected", {4, 4}, {Dense, Dense}); expected(i, j) = A(i, k) * B(k, j); - IndexStmt expected_stmt = C.getAssignment().concretize(); expected.compile(); expected.assemble(); expected.compute(); ASSERT_TENSOR_EQ(C, expected); - cout << stmt << endl; - - std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", true, true); - codegen->compile(compute, true); - ir::Stmt expected_compute = lower(expected_stmt, "compute", false, true); - //codegen->compile(expected_compute, true); + // std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + // ir::Stmt compute = lower(stmt, "compute", false, true); + // codegen->compile(compute, true); } TEST(scheduling, lowerSparseCopy) { diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp index bd035a8b1..a6af48f06 100644 --- a/test/tests-workspaces.cpp +++ b/test/tests-workspaces.cpp @@ -36,7 +36,7 @@ TEST(workspaces, tile_vecElemMul_NoTail) { .split(i_bounded, i0, i1, 4) .precompute(precomputedExpr, i1, i1, precomputed); - cout << stmt << endl; +// cout << stmt << endl; A.compile(stmt); A.assemble(); From 1c37ebd4a0cb1e9098937ed0d03150c19bf17f8a Mon Sep 17 00:00:00 2001 From: Rawn Date: Sun, 6 Dec 2020 15:06:51 -0800 Subject: [PATCH 07/14] Only hoists out malloc + free from where statement when possible. Emits loop to zero every element in a temporary when it is hoisted before the producer is called. 
Changes the codegens to keep pointer names constant --- src/codegen/codegen_c.cpp | 2 +- src/codegen/codegen_cuda.cpp | 2 +- src/lower/lowerer_impl.cpp | 27 ++++++++++++++++++--------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 9611d09f5..a0c6c9591 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -182,7 +182,7 @@ class CodeGen_C::FindVars : public IRVisitor { virtual void visit(const Var *op) { if (varMap.count(op) == 0) { - varMap[op] = codeGen->genUniqueName(op->name); + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); } } diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 5eb57c7ad..d19cac605 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -240,7 +240,7 @@ class CodeGen_CUDA::FindVars : public IRVisitor { virtual void visit(const Var *op) { if (varMap.count(op) == 0 && !inBlock) { - varMap[op] = codeGen->genUniqueName(op->name); + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); } } diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index e338f9c4e..bf3254375 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1569,14 +1569,6 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { decl = VarDecl::make(values, ir::Literal::make(0)); } Stmt allocate = Allocate::make(values, size); - // If we are using acceleration of the dense workspace, we do not need to initialize the values array - // since the bit guard will take care of setting the value array when necessary - Stmt zeroInitLoop = Stmt(); - if(!accelerateDense) { - Expr p = Var::make("p" + temporary.getName(), Int()); - Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); - } /// Make a struct object that lowerAssignment and lowerAccess can read /// temporary value arrays from. @@ -1585,7 +1577,7 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { this->temporaryArrays.insert({temporary, arrays}); freeTemporary = Block::make(freeTemporary, Free::make(values)); - initializeTemporary = Block::make(decl, initializeTemporary, allocate, zeroInitLoop); + initializeTemporary = Block::make(decl, initializeTemporary, allocate); } } return {initializeTemporary, freeTemporary}; @@ -1629,6 +1621,23 @@ Stmt LowererImpl::lowerWhere(Where where) { consumer = Block::make(sortCall, consumer); } + // Now that temporary allocations are hoisted, we always need to emit an initialization loop before entering the + // producer. + if(generateComputeCode() && !isScalar(temporary.getType())) { + // TODO: We only actually need to do this if: + // 1) We use the temporary multiple times + // 2) The PRODUCER RHS is sparse(not full). 
(Guarantees that old values are overwritten before consuming) + + Expr p = Var::make("p" + temporary.getName(), Int()); + Expr values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false); + Expr size = getTemporarySize(temporary); + Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + initializeTemporary = Block::make(initializeTemporary, loopInit); + } + whereConsumers.push_back(consumer); whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; From 9b1450cd29bb28cb8f9829ceae809c804fcb8f96 Mon Sep 17 00:00:00 2001 From: Rawn Date: Wed, 23 Dec 2020 14:45:57 -0800 Subject: [PATCH 08/14] Removes initialization loop from before producer when accelerating a dense workspace --- src/lower/lowerer_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index bf3254375..92d0e61ce 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1622,8 +1622,8 @@ Stmt LowererImpl::lowerWhere(Where where) { } // Now that temporary allocations are hoisted, we always need to emit an initialization loop before entering the - // producer. - if(generateComputeCode() && !isScalar(temporary.getType())) { + // producer but only if there is no dense acceleration + if(generateComputeCode() && !isScalar(temporary.getType()) && !accelarateDenseWorkSpace) { // TODO: We only actually need to do this if: // 1) We use the temporary multiple times // 2) The PRODUCER RHS is sparse(not full). (Guarantees that old values are overwritten before consuming) From dd795fcdae75d736601f581722fcff43426eabb0 Mon Sep 17 00:00:00 2001 From: Rawn Date: Wed, 23 Dec 2020 18:48:37 -0800 Subject: [PATCH 09/14] Places index list size above the producer loop when accelerating a dense workspace. 
This should make the transition to multithreading easier and fixes a bug in the original code --- src/lower/lowerer_impl.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 92d0e61ce..e85a20d5c 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1453,7 +1453,6 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { Stmt alreadySetDecl = Stmt(); Stmt indexListDecl = Stmt(); const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); - const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); Stmt freeTemps = Block::make(Free::make(indexListArr), Free::make(alreadySetArr)); if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { alreadySetDecl = VarDecl::make(alreadySetArr, ir::Literal::make(0)); @@ -1471,13 +1470,13 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); - Stmt inits = Block::make(indexListSizeDecl, alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); + Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; } else { Expr sizeOfElt = Sizeof::make(bitGuardType); Expr callocAlreadySet = ir::Call::make("calloc", {bitGuardSize, sizeOfElt}, Int()); Stmt allocateAlreadySet = VarDecl::make(alreadySetArr, callocAlreadySet); - Stmt inits = Block::make(indexListSizeDecl, indexListDecl, allocateIndexList, allocateAlreadySet); + Stmt inits = Block::make(indexListDecl, allocateIndexList, allocateAlreadySet); return {inits, freeTemps}; } @@ -1650,6 +1649,11 @@ Stmt LowererImpl::lowerWhere(Where where) { } Stmt producer = lower(where.getProducer()); + if(accelarateDenseWorkSpace) { + const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); + const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); + initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); + } if (restoreAtomicDepth) { markAssignsAtomicDepth++; From ff84784ec225134179c59b27b08cad9d09b1eb02 Mon Sep 17 00:00:00 2001 From: Rawn Date: Wed, 23 Dec 2020 21:09:00 -0800 Subject: [PATCH 10/14] Fixes workspace reset --- src/lower/lowerer_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index e85a20d5c..10478c55b 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -939,7 +939,7 @@ Stmt LowererImpl::lowerForallDimension(Forall forall, Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); - Stmt resetGuard = ir::Store::make(bitGuard, loopVar, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); + Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); body = Block::make(declareVar, body, resetGuard); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { From d5721d70b0d8f2311d5c1e9c501936accfaf4909 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 
24 Dec 2020 00:12:04 -0800 Subject: [PATCH 11/14] If underived variables are used to index a workspace, we allocate space for the workspace based on the sizes of the input tensors --- include/taco/lower/lowerer_impl.h | 4 ++-- src/lower/lowerer_impl.cpp | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index 71783c6ad..39675d450 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -350,8 +350,8 @@ class LowererImpl : public util::Uncopyable { /// Initializes a temporary workspace std::vector codeToInitializeTemporary(Where where); - /// Gets the size of a temporary tensorVar - ir::Expr getTemporarySize(TensorVar var); + /// Gets the size of a temporary tensorVar in the where statement + ir::Expr getTemporarySize(Where where); /// Initializes helper arrays to give dense workspaces sparse acceleration std::vector codeToInitializeDenseAcceleratorArrays(Where where); diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 10478c55b..2be37d0f4 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1412,8 +1412,22 @@ Stmt LowererImpl::lowerForallBody(Expr coordinate, IndexStmt stmt, appendCoords); } -Expr LowererImpl::getTemporarySize(TensorVar temporary) { +Expr LowererImpl::getTemporarySize(Where where) { + TensorVar temporary = where.getTemporary(); Dimension temporarySize = temporary.getType().getShape().getDimension(0); + Access temporaryAccess = getResultAccesses(where.getProducer()).first[0]; + std::vector indexVars = temporaryAccess.getIndexVars(); + + if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) { + // If all index vars are underived, use tensor properties to get the tensor size + taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0]; + ir::Expr size = dimensions.at(indexVars[0]); + for(size_t i = 1; i < indexVars.size(); ++i) { + taco_iassert(util::contains(dimensions, indexVars[i])) << "Missing " << indexVars[i]; + size = ir::Mul::make(size, dimensions.at(indexVars[i])); + } + return size; + } if (temporarySize.isFixed()) { return ir::Literal::make(temporarySize.getSize()); @@ -1436,7 +1450,7 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { // TODO: emit as uint64 and manually emit bit pack code const Datatype bitGuardType = taco::Bool; const std::string bitGuardName = temporary.getName() + "_already_set"; - const Expr bitGuardSize = getTemporarySize(temporary); + const Expr bitGuardSize = getTemporarySize(where); const Expr alreadySetArr = ir::Var::make(bitGuardName, bitGuardType, true, false); @@ -1560,7 +1574,7 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { true, false); taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " << temporary.getType().getOrder(); // TODO - Expr size = getTemporarySize(temporary); + Expr size = getTemporarySize(where); // no decl needed for shared memory Stmt decl = Stmt(); @@ -1631,7 +1645,7 @@ Stmt LowererImpl::lowerWhere(Where where) { Expr values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - Expr size = getTemporarySize(temporary); + Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary =
Block::make(initializeTemporary, loopInit); From 2eb298e355e39ffe013b3aab19c08c9770efb8b5 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 24 Dec 2020 16:29:44 -0800 Subject: [PATCH 12/14] Relaxes requirements for spmm transformation --- src/index_notation/transformations.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 955687f46..1bebe8c6d 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1115,16 +1115,15 @@ static IndexStmt optimizeSpMM(IndexStmt stmt) { } TensorVar B = Baccess.getTensorVar(); - if (B.getFormat().getModeFormats()[0].getName() != "dense" || - B.getFormat().getModeFormats()[1].getName() != "compressed" || - B.getFormat().getModeOrdering()[0] != 0 || + if (B.getFormat().getModeOrdering()[0] != 0 || B.getFormat().getModeOrdering()[1] != 1) { return stmt; } + // We need random access into the first mode or this tensor in order to perform a linear combination of rows + // algorithm. (I think?) TensorVar C = Caccess.getTensorVar(); - if (C.getFormat().getModeFormats()[0].getName() != "dense" || - C.getFormat().getModeFormats()[1].getName() != "compressed" || + if (C.getFormat().getModeFormats()[0].getName() == "compressed" || C.getFormat().getModeOrdering()[0] != 0 || C.getFormat().getModeOrdering()[1] != 1) { return stmt; From 46aed1307438a691a80c37598c1a58124a6ddd91 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 24 Dec 2020 16:39:16 -0800 Subject: [PATCH 13/14] Checks if first mode of last tensor has locate for spmm transform --- src/index_notation/transformations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 1bebe8c6d..f0728ef2b 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1123,7 +1123,7 @@ static IndexStmt optimizeSpMM(IndexStmt stmt) { // We need random access into the first mode or this tensor in order to perform a linear combination of rows // algorithm. (I think?) TensorVar C = Caccess.getTensorVar(); - if (C.getFormat().getModeFormats()[0].getName() == "compressed" || + if (!C.getFormat().getModeFormats()[0].hasLocate() || C.getFormat().getModeOrdering()[0] != 0 || C.getFormat().getModeOrdering()[1] != 1) { return stmt; From 8471869aa3678c1bc777af073a212dbaaa4fd856 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 24 Dec 2020 17:01:12 -0800 Subject: [PATCH 14/14] Changes SPMM transform requirement. Unsure about this --- src/index_notation/transformations.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index f0728ef2b..5310455f6 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1114,16 +1114,19 @@ static IndexStmt optimizeSpMM(IndexStmt stmt) { return stmt; } + // I think we can do a linear combination of rows as long as there are no permutations in the format and the + // level formats are ordered. The i -> k -> j loops should iterate over the data structures without issue.
TensorVar B = Baccess.getTensorVar(); - if (B.getFormat().getModeOrdering()[0] != 0 || + if (!B.getFormat().getModeFormats()[0].isOrdered() || + !B.getFormat().getModeFormats()[1].isOrdered() || + B.getFormat().getModeOrdering()[0] != 0 || B.getFormat().getModeOrdering()[1] != 1) { return stmt; } - // We need random access into the first mode or this tensor in order to perform a linear combination of rows - // algorithm. (I think?) TensorVar C = Caccess.getTensorVar(); - if (!C.getFormat().getModeFormats()[0].hasLocate() || + if (!C.getFormat().getModeFormats()[0].isOrdered() || + !C.getFormat().getModeFormats()[1].isOrdered() || C.getFormat().getModeOrdering()[0] != 0 || C.getFormat().getModeOrdering()[1] != 1) { return stmt;
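
Reviewer note (illustration only, not part of any patch above): to make the combined effect of PATCH 08-11 easier to see, here is a minimal hand-written C++ sketch of the kind of dense-workspace SpMM row kernel this lowering is driving toward, for A(i,j) = B(i,k) * C(k,j) with B and C stored as CSR. All identifiers in the sketch (w, w_already_set, w_index_list, the *_pos/*_crd arrays) and the simplified output assembly are assumptions made for this example; the code TACO actually emits differs (for instance, it may sort the index list before the consumer so that A's coordinates come out ordered).

// Hand-written sketch under the assumptions stated above; not TACO-generated code.
void spmmRow(int i,
             const int* B2_pos, const int* B2_crd, const double* B_vals,
             const int* C2_pos, const int* C2_crd, const double* C_vals,
             double* w,            // dense workspace, one entry per column of C
             bool* w_already_set,  // bit guard that replaces the upfront zero-init loop (PATCH 08)
             int* w_index_list,    // coordinates touched while computing this row
             int* A2_pos, int* A2_crd, double* A_vals) {
  int w_index_list_size = 0;       // declared right before the producer loop (PATCH 09)

  // Producer: row i of A is a linear combination of the rows of C selected by row i of B.
  for (int pB = B2_pos[i]; pB < B2_pos[i + 1]; pB++) {
    int k = B2_crd[pB];
    for (int pC = C2_pos[k]; pC < C2_pos[k + 1]; pC++) {
      int j = C2_crd[pC];
      if (!w_already_set[j]) {
        w[j] = B_vals[pB] * C_vals[pC];   // first touch overwrites, so no zero-init loop is needed
        w_already_set[j] = true;
        w_index_list[w_index_list_size++] = j;
      } else {
        w[j] += B_vals[pB] * C_vals[pC];
      }
    }
  }

  // Consumer: scatter the workspace into row i of A and reset the guard for the next row.
  for (int p = 0; p < w_index_list_size; p++) {
    int j = w_index_list[p];
    A2_crd[A2_pos[i] + p] = j;     // coordinates are appended unsorted in this sketch
    A_vals[A2_pos[i] + p] = w[j];
    w_already_set[j] = false;      // reset indexed by the coordinate j, not the loop variable p (PATCH 10)
  }
  A2_pos[i + 1] = A2_pos[i] + w_index_list_size;  // simplified assembly: rows are filled in order
}

Sizing w and w_already_set by the number of columns of C mirrors what PATCH 11 appears to do when the workspace is indexed by underived variables: the allocation is derived from the input tensors' dimensions rather than from a bounded or split index variable.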