diff --git a/.gitignore b/.gitignore index 16389f34e..52b3eea75 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,6 @@ lib/ *cmake_install.cmake CMakeCache.txt doc - +.idea/ apps/tensor_times_vector/tensor_times_vector +cmake-build-debug/ diff --git a/include/taco/lower/lowerer_impl_c.h b/include/taco/lower/lowerer_impl_c.h new file mode 100644 index 000000000..1ca1396f3 --- /dev/null +++ b/include/taco/lower/lowerer_impl_c.h @@ -0,0 +1,42 @@ +// +// Created by 张 on 2022/3/10. +// + +#ifndef TACO_LOWERER_IMPL_C_H +#define TACO_LOWERER_IMPL_C_H + +#include +#include "taco/lower/lowerer_impl_imperative.h" +namespace taco { + class LowererImplC: public LowererImplImperative { + public: + LowererImplC(); + virtual ~LowererImplC() = default; + + protected: + std::vector codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false); + std::vector codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit); + std::vector codeToInitializeTemporary(Where where); + std::pair canAccelerateDenseTemp(Where where); + std::vector codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit); + /** + * Generate code to initialize values array in range + * [begin * size, (begin + 1) * size) with the fill value. + */ + ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size); + ir::Stmt lowerWhere(Where where); + ir::Stmt lowerForall(Forall forall); + + /// Lower a forall that needs to be cloned so that one copy does not have guards + /// used for vectorized and unrolled loops + ir::Stmt lowerForallCloned(Forall forall); + + private: + class Visitor; + friend class Visitor; + std::shared_ptr visitor; + }; +} + + +#endif //TACO_LOWERER_IMPL_C_H diff --git a/include/taco/lower/lowerer_impl_cuda.h b/include/taco/lower/lowerer_impl_cuda.h new file mode 100644 index 000000000..9b7151199 --- /dev/null +++ b/include/taco/lower/lowerer_impl_cuda.h @@ -0,0 +1,44 @@ +// +// Created by 张 on 2022/3/12. +// + +#ifndef TACO_LOWERER_IMPL_CUDA_H +#define TACO_LOWERER_IMPL_CUDA_H + +#include +#include "taco/lower/lowerer_impl_imperative.h" + + + +namespace taco { + class LowererImplCUDA: public LowererImplImperative { + public: + LowererImplCUDA(); + virtual ~LowererImplCUDA() = default; + + protected: + std::vector codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false); + std::vector codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit); + std::vector codeToInitializeTemporary(Where where); + std::pair canAccelerateDenseTemp(Where where); + std::vector codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit); + /** + * Generate code to initialize values array in range + * [begin * size, (begin + 1) * size) with the fill value. 
+ */ + ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size); + ir::Stmt lowerWhere(Where where); + ir::Stmt lowerForall(Forall forall); + + /// Lower a forall that needs to be cloned so that one copy does not have guards + /// used for vectorized and unrolled loops + ir::Stmt lowerForallCloned(Forall forall); + + private: + class Visitor; + friend class Visitor; + std::shared_ptr visitor; + }; +} + +#endif //TACO_LOWERER_IMPL_CUDA_H diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index fa97e3cd9..ecc5dfebe 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -340,7 +340,7 @@ class LowererImplImperative : public LowererImpl { * Generate code to initialize values array in range * [begin * size, (begin + 1) * size) with the fill value. */ - ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size); + virtual ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size); /// Declare position variables and initialize them with a locate. ir::Stmt declLocatePosVars(std::vector iterators); @@ -367,17 +367,17 @@ class LowererImplImperative : public LowererImpl { /// Returns true iff the temporary used in the where statement is dense and sparse iteration over that /// temporary can be automaticallty supported by the compiler. - std::pair canAccelerateDenseTemp(Where where); + virtual std::pair canAccelerateDenseTemp(Where where); /// Initializes a temporary workspace - std::vector codeToInitializeTemporary(Where where); - std::vector codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit); - std::vector codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit); + virtual std::vector codeToInitializeTemporary(Where where); + virtual std::vector codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit); + virtual std::vector codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit); /// Gets the size of a temporary tensorVar in the where statement ir::Expr getTemporarySize(Where where); /// Initializes helper arrays to give dense workspaces sparse acceleration - std::vector codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false); + virtual std::vector codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false); /// Recovers a derived indexvar from an underived variable. ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl); @@ -498,12 +498,38 @@ class LowererImplImperative : public LowererImpl { /// loop iterator variable should be incremented when the guard is fired. ir::Stmt strideBoundsGuard(Iterator iterator, ir::Expr access, bool incrementPosVar); -private: + util::ScopedSet accessibleIterators; + int inParallelLoopDepth = 0; + /// Map used to hoist parallel temporary workspaces. 
Maps workspace shared by all threads to where statement + std::map whereToTemporaryVar; + std::map whereToIndexListAll; + std::map whereToIndexListSizeAll; + std::map whereToBitGuardAll; + /// Map form temporary to indexList var if accelerating dense workspace + std::map tempToIndexList; + + /// Map form temporary to indexListSize if accelerating dense workspace + std::map tempToIndexListSize; + + /// Map form temporary to bitGuard var if accelerating dense workspace + std::map tempToBitGuard; + + std::set needCompute; + + struct TemporaryArrays { + ir::Expr values; + }; + std::map temporaryArrays; + std::set guardedTemps; + ProvenanceGraph provGraph; + bool emitUnderivedGuards = true; + + bool assemble; bool compute; bool loopOrderAllowsShortCircuit = false; - std::set needCompute; + // std::set needCompute; int markAssignsAtomicDepth = 0; ParallelUnit atomicParallelUnit; @@ -515,15 +541,17 @@ class LowererImplImperative : public LowererImpl { /// Map used to hoist temporary workspace initialization std::map temporaryInitialization; + /* /// Map used to hoist parallel temporary workspaces. Maps workspace shared by all threads to where statement std::map whereToTemporaryVar; std::map whereToIndexListAll; std::map whereToIndexListSizeAll; std::map whereToBitGuardAll; + */ /// Map from tensor variables in index notation to variables in the IR std::map tensorVars; - +/* struct TemporaryArrays { ir::Expr values; }; @@ -539,7 +567,7 @@ class LowererImplImperative : public LowererImpl { std::map tempToBitGuard; std::set guardedTemps; - +*/ /// Map from result tensors to variables tracking values array capacity. std::map capacityVars; @@ -556,7 +584,7 @@ class LowererImplImperative : public LowererImpl { Iterators iterators; /// Keep track of relations between IndexVars - ProvenanceGraph provGraph; + // ProvenanceGraph provGraph; bool ignoreVectorize = false; // already being taken into account @@ -573,9 +601,9 @@ class LowererImplImperative : public LowererImpl { bool captureNextLocatePos = false; ir::Stmt capturedLocatePos; // used for whereConsumer when want to replicate same locating - bool emitUnderivedGuards = true; + // bool emitUnderivedGuards = true; - int inParallelLoopDepth = 0; + // int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; @@ -588,7 +616,7 @@ class LowererImplImperative : public LowererImpl { std::map reducedValueVars; /// Set of locate-capable iterators that can be legally accessed. - util::ScopedSet accessibleIterators; + /// Visitor methods can add code to emit it to the function header. std::vector header; @@ -596,6 +624,9 @@ class LowererImplImperative : public LowererImpl { /// Visitor methods can add code to emit it to the function footer. 
std::vector footer; + +private: + class Visitor; friend class Visitor; std::shared_ptr visitor; diff --git a/src/lower/lower.cpp b/src/lower/lower.cpp index 511dcb442..7b1c68a31 100644 --- a/src/lower/lower.cpp +++ b/src/lower/lower.cpp @@ -16,6 +16,7 @@ #include "taco/lower/lowerer_impl.h" #include "taco/lower/lowerer_impl_imperative.h" +#include "taco/lower/lowerer_impl_c.h" #include "taco/lower/iterator.h" #include "mode_access.h" @@ -36,7 +37,8 @@ namespace taco { // class Lowerer Lowerer::Lowerer() : impl(new LowererImplImperative()) { } - +//Lowerer::Lowerer() :impl(new LowererImplC()) { +//} Lowerer::Lowerer(LowererImpl* impl) : impl(impl) { } diff --git a/src/lower/lowerer_impl_c.cpp b/src/lower/lowerer_impl_c.cpp new file mode 100644 index 000000000..af6157e8d --- /dev/null +++ b/src/lower/lowerer_impl_c.cpp @@ -0,0 +1,955 @@ +// +// Created by 张 on 2022/3/10. +// + +#include "taco/lower/lowerer_impl_c.h" +#include +#include "taco/lower/lowerer_impl_imperative.h" + + +#include "taco/index_notation/index_notation.h" + +#include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_visitor.h" +#include "taco/index_notation/index_notation_rewriter.h" + +#include "taco/ir/ir.h" +#include "taco/ir/ir_generators.h" +#include "taco/ir/ir_visitor.h" +#include "taco/ir/simplify.h" +#include "taco/lower/iterator.h" +#include "taco/lower/merge_lattice.h" +#include "mode_access.h" +#include "taco/util/collections.h" + +#include "taco/ir/workspace_rewriter.h" + +using namespace std; +using namespace taco::ir; +using taco::util::combine; + +using namespace std; +using namespace taco::ir; + +namespace taco { + class LowererImplC::Visitor : public IndexNotationVisitorStrict { + public: + Visitor(LowererImplC *impl) : impl(impl) {} + + Stmt lower(IndexStmt stmt) { + this->stmt = Stmt(); + impl->accessibleIterators.scope(); + IndexStmtVisitorStrict::visit(stmt); + impl->accessibleIterators.unscope(); + return this->stmt; + } + + Expr lower(IndexExpr expr) { + this->expr = Expr(); + IndexExprVisitorStrict::visit(expr); + return this->expr; + } + + private: + LowererImplC *impl; + Expr expr; + Stmt stmt; + using IndexNotationVisitorStrict::visit; + + void visit(const AssignmentNode *node) { stmt = impl->lowerAssignment(node); } + + void visit(const YieldNode *node) { stmt = impl->lowerYield(node); } + + void visit(const ForallNode *node) { stmt = impl->lowerForall(node); } + + void visit(const WhereNode *node) { stmt = impl->lowerWhere(node); } + + void visit(const MultiNode *node) { stmt = impl->lowerMulti(node); } + + void visit(const SuchThatNode *node) { stmt = impl->lowerSuchThat(node); } + + void visit(const SequenceNode *node) { stmt = impl->lowerSequence(node); } + + void visit(const AssembleNode *node) { stmt = impl->lowerAssemble(node); } + + void visit(const AccessNode *node) { expr = impl->lowerAccess(node); } + + void visit(const LiteralNode *node) { expr = impl->lowerLiteral(node); } + + void visit(const NegNode *node) { expr = impl->lowerNeg(node); } + + void visit(const AddNode *node) { expr = impl->lowerAdd(node); } + + void visit(const SubNode *node) { expr = impl->lowerSub(node); } + + void visit(const MulNode *node) { expr = impl->lowerMul(node); } + + void visit(const DivNode *node) { expr = impl->lowerDiv(node); } + + void visit(const SqrtNode *node) { expr = impl->lowerSqrt(node); } + + void visit(const CastNode *node) { expr = impl->lowerCast(node); } + + void visit(const CallIntrinsicNode *node) { expr = impl->lowerCallIntrinsic(node); } + 
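+        // Like the handlers above, the remaining expression handlers simply forward to the
+        // corresponding lower* methods on LowererImplC; ReductionNode is rejected because
+        // concrete index notation contains no reduction nodes.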
+ void visit(const CallNode *node) { expr = impl->lowerTensorOp(node); } + + void visit(const ReductionNode *node) { + taco_ierror << "Reduction nodes not supported in concrete index notation"; + } + + void visit(const IndexVarNode *node) { expr = impl->lowerIndexVar(node); } + }; + + LowererImplC::LowererImplC() : visitor(new Visitor(this)) {} + + + static bool returnsTrue(IndexExpr expr) { + struct ReturnsTrue : public IndexExprRewriterStrict { + void visit(const AccessNode* op) { + if (op->isAccessingStructure) { + expr = op; + } + } + + void visit(const LiteralNode* op) { + if (op->getDataType() == Bool && op->getVal()) { + expr = op; + } + } + + void visit(const NegNode* op) { + expr = rewrite(op->a); + } + + void visit(const AddNode* op) { + if (rewrite(op->a).defined() || rewrite(op->b).defined()) { + expr = op; + } + } + + void visit(const MulNode* op) { + if (rewrite(op->a).defined() && rewrite(op->b).defined()) { + expr = op; + } + } + + void visit(const CastNode* op) { + expr = rewrite(op->a); + } + + void visit(const CallNode* op) { + const auto annihilator = findProperty(op->properties); + + if (!annihilator.defined() || !annihilator.positions().empty()) { + return; + } + + if (equals(annihilator.annihilator(), Literal(false))) { + for (const auto& arg : op->args) { + if (!rewrite(arg).defined()) { + return; + } + } + expr = op; + } else { + for (const auto& arg : op->args) { + if (rewrite(arg).defined()) { + expr = op; + return; + } + } + } + } + + void visit(const SqrtNode* op) {} + void visit(const SubNode* op) {} + void visit(const DivNode* op) {} + void visit(const CallIntrinsicNode* op) {} + void visit(const ReductionNode* op) {} + void visit(const IndexVarNode* op) {} + }; + return ReturnsTrue().rewrite(expr).defined(); + } + + static bool needComputeValues(IndexStmt stmt, TensorVar tensor) { + if (tensor.getType().getDataType() != Bool) { + return true; + } + + bool needComputeValue = false; + match(stmt, + function([&]( + const AssignmentNode* n, Matcher* m) { + if (n->lhs.getTensorVar() == tensor && !returnsTrue(n->rhs)) { + needComputeValue = true; + } + }) + ); + + return needComputeValue; + } + + vector LowererImplC::codeToInitializeDenseAcceleratorArrays(Where where, bool parallel) { + // if parallel == true, need to initialize dense accelerator arrays as size*numThreads + // and rename all dense accelerator arrays to name + '_all' + + TensorVar temporary = where.getTemporary(); + + // TODO: emit as uint64 and manually emit bit pack code + const Datatype bitGuardType = taco::Bool; + std::string bitGuardSuffix; + if (parallel) + bitGuardSuffix = "_already_set_all"; + else + bitGuardSuffix = "_already_set"; + const std::string bitGuardName = temporary.getName() + bitGuardSuffix; + + Expr bitGuardSize = getTemporarySize(where); + Expr maxThreads = ir::Call::make("omp_get_max_threads", {}, bitGuardSize.type()); + if (parallel) + bitGuardSize = ir::Mul::make(bitGuardSize, maxThreads); + + const Expr alreadySetArr = ir::Var::make(bitGuardName, + bitGuardType, + true, false); + + // TODO: TACO should probably keep state on if it can use int32 or if it should switch to + // using int64 for indices. This assumption is made in other places of taco. 
+ const Datatype indexListType = taco::Int32; + std::string indexListSuffix; + if (parallel) + indexListSuffix = "_index_list_all"; + else + indexListSuffix = "_index_list"; + + const std::string indexListName = temporary.getName() + indexListSuffix; + const Expr indexListArr = ir::Var::make(indexListName, + indexListType, + true, false); + + // no decl for shared memory + Stmt alreadySetDecl = Stmt(); + Stmt indexListDecl = Stmt(); + Stmt freeTemps = Block::make(Free::make(indexListArr), Free::make(alreadySetArr)); + alreadySetDecl = VarDecl::make(alreadySetArr, ir::Literal::make(0)); + indexListDecl = VarDecl::make(indexListArr, ir::Literal::make(0)); + + if (parallel) { + whereToIndexListAll[where] = indexListArr; + whereToBitGuardAll[where] = alreadySetArr; + } else { + const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); + tempToIndexList[temporary] = indexListArr; + tempToIndexListSize[temporary] = indexListSizeExpr; + tempToBitGuard[temporary] = alreadySetArr; + } + + Stmt allocateIndexList = Allocate::make(indexListArr, bitGuardSize); + Expr sizeOfElt = Sizeof::make(bitGuardType); + Expr callocAlreadySet = ir::Call::make("calloc", {bitGuardSize, sizeOfElt}, Int()); + Stmt allocateAlreadySet = VarDecl::make(alreadySetArr, callocAlreadySet); + Stmt inits = Block::make(indexListDecl, allocateIndexList, allocateAlreadySet); + return {inits, freeTemps}; + + } + + // Code to initialize a temporary workspace that is SHARED across ALL parallel units. + // New temporaries are denoted by temporary.getName() + '_all' + // Currently only supports CPUThreads + vector LowererImplC::codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit) { + TensorVar temporary = where.getTemporary(); + // For the parallel case, need to hoist up a workspace shared by all threads + TensorVar temporaryAll = TensorVar(temporary.getName() + "_all", temporary.getType(), temporary.getFormat()); + this->whereToTemporaryVar[where] = temporaryAll; + + bool accelerateDense = canAccelerateDenseTemp(where).first; + + Stmt freeTemporary = Stmt(); + Stmt initializeTemporary = Stmt(); + + // When emitting code to accelerate dense workspaces with sparse iteration, we need the following arrays + // to construct the result indices + if(accelerateDense) { + vector initAndFree = codeToInitializeDenseAcceleratorArrays(where, true); + initializeTemporary = initAndFree[0]; + freeTemporary = initAndFree[1]; + } + + Expr values; + if (util::contains(needCompute, temporary) && + needComputeValues(where, temporary)) { + values = ir::Var::make(temporaryAll.getName(), + temporaryAll.getType().getDataType(), + true, false); + Expr size = getTemporarySize(where); + Expr sizeAll = ir::Mul::make(size, ir::Call::make("omp_get_max_threads", {}, size.type())); + + // no decl needed for shared memory + Stmt decl = Stmt(); + decl = VarDecl::make(values, ir::Literal::make(0)); + Stmt allocate = Allocate::make(values, sizeAll); + + freeTemporary = Block::make(freeTemporary, Free::make(values)); + initializeTemporary = Block::make(decl, initializeTemporary, allocate); + } + /// Make a struct object that lowerAssignment and lowerAccess can read + /// temporary value arrays from. 
+ TemporaryArrays arrays; + arrays.values = values; + this->temporaryArrays.insert({temporaryAll, arrays}); + + return {initializeTemporary, freeTemporary}; + } + vector LowererImplC::codeToInitializeTemporary(Where where) { + TensorVar temporary = where.getTemporary(); + + const bool accelerateDense = canAccelerateDenseTemp(where).first; + + Stmt freeTemporary = Stmt(); + Stmt initializeTemporary = Stmt(); + if (isScalar(temporary.getType())) { + initializeTemporary = defineScalarVariable(temporary, true); + Expr tempSet = ir::Var::make(temporary.getName() + "_set", Datatype::Bool); + Stmt initTempSet = VarDecl::make(tempSet, false); + initializeTemporary = Block::make(initializeTemporary, initTempSet); + tempToBitGuard[temporary] = tempSet; + } else { + // TODO: Need to support keeping track of initialized elements for + // temporaries that don't have sparse accelerator + taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense); + + // When emitting code to accelerate dense workspaces with sparse iteration, we need the following arrays + // to construct the result indices + if(accelerateDense) { + vector initAndFree = codeToInitializeDenseAcceleratorArrays(where); + initializeTemporary = initAndFree[0]; + freeTemporary = initAndFree[1]; + } + + Expr values; + if (util::contains(needCompute, temporary) && + needComputeValues(where, temporary)) { + values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), true, false); + + Expr size = getTemporarySize(where); + + // no decl needed for shared memory + Stmt decl = Stmt(); + decl = VarDecl::make(values, ir::Literal::make(0)); + Stmt allocate = Allocate::make(values, size); + + freeTemporary = Block::make(freeTemporary, Free::make(values)); + initializeTemporary = Block::make(decl, initializeTemporary, allocate); + } + + /// Make a struct object that lowerAssignment and lowerAccess can read + /// temporary value arrays from. + TemporaryArrays arrays; + arrays.values = values; + this->temporaryArrays.insert({temporary, arrays}); + } + return {initializeTemporary, freeTemporary}; + } + + // Returns true if the following conditions are met: +// 1) The temporary is a dense vector +// 2) There is only one value on the right hand side of the consumer +// -- We would need to handle sparse acceleration in the merge lattices for +// multiple operands on the RHS +// 3) The left hand side of the where consumer is sparse, if the consumer is an +// assignment +// 4) CPU Code is being generated (TEMPORARY - This should be removed) +// -- The sorting calls and calloc call in lower where are CPU specific. We +// could map calloc to a cudaMalloc and use a library like CUB to emit +// the sort. CUB support is built into CUDA 11 but not prior versions of +// CUDA so in that case, we'd probably need to include the CUB headers in +// the generated code. + std::pair LowererImplC::canAccelerateDenseTemp(Where where) { + // TODO: TEMPORARY -- Needs to be removed + + + TensorVar temporary = where.getTemporary(); + // (1) Temporary is dense vector + if(!isDense(temporary.getFormat()) || temporary.getOrder() != 1) { + return std::make_pair(false, false); + } + + // (2) Multiple operands in inputs (need lattice to reason about iteration) + const auto inputAccesses = getArgumentAccesses(where.getConsumer()); + if(inputAccesses.size() > 1 || inputAccesses.empty()) { + return std::make_pair(false, false); + } + + // No or multiple results? 
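+        // The check below bails out unless the consumer has exactly one result access.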
+ const auto resultAccesses = getResultAccesses(where.getConsumer()).first; + if(resultAccesses.size() > 1 || resultAccesses.empty()) { + return std::make_pair(false, false); + } + + // No check for size of tempVar since we enforced the temporary is a vector + // and if there is only one RHS value, it must (should?) be the temporary + std::vector tempVar = inputAccesses[0].getIndexVars(); + + // Get index vars in result. + std::vector resultVars = resultAccesses[0].getIndexVars(); + auto it = std::find_if(resultVars.begin(), resultVars.end(), + [&](const auto& resultVar) { + return resultVar == tempVar[0] || + provGraph.isDerivedFrom(tempVar[0], resultVar); + }); + + if (it == resultVars.end()) { + return std::make_pair(true, false); + } + + int index = (int)(it - resultVars.begin()); + TensorVar resultTensor = resultAccesses[0].getTensorVar(); + int modeIndex = resultTensor.getFormat().getModeOrdering()[index]; + ModeFormat varFmt = resultTensor.getFormat().getModeFormats()[modeIndex]; + // (3) Level of result is sparse + if(varFmt.isFull()) { + return std::make_pair(false, false); + } + + // Only need to sort the workspace if the result needs to be ordered + return std::make_pair(true, varFmt.isOrdered()); + } + + // Code to initialize the local temporary workspace from the shared workspace +// in codeToInitializeTemporaryParallel for a SINGLE parallel unit +// (e.g.) the local workspace that each thread uses + vector LowererImplC::codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit) { + TensorVar temporary = where.getTemporary(); + vector decls; + + Expr tempSize = getTemporarySize(where); + Expr threadNum = ir::Call::make("omp_get_thread_num", {}, tempSize.type()); + tempSize = ir::Mul::make(tempSize, threadNum); + + bool accelerateDense = canAccelerateDenseTemp(where).first; + + Expr values; + if (util::contains(needCompute, temporary) && + needComputeValues(where, temporary)) { + // Declare local temporary workspace array + values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false); + Expr values_all = this->temporaryArrays[this->whereToTemporaryVar[where]].values; + Expr tempRhs = ir::Add::make(values_all, tempSize); + Stmt tempDecl = ir::VarDecl::make(values, tempRhs); + decls.push_back(tempDecl); + } + /// Make a struct object that lowerAssignment and lowerAccess can read + /// temporary value arrays from. + TemporaryArrays arrays; + arrays.values = values; + this->temporaryArrays.insert({temporary, arrays}); + + if (accelerateDense) { + // Declare local index list array + // TODO: TACO should probably keep state on if it can use int32 or if it should switch to + // using int64 for indices. This assumption is made in other places of taco. 
+ const Datatype indexListType = taco::Int32; + const std::string indexListName = temporary.getName() + "_index_list"; + const Expr indexListArr = ir::Var::make(indexListName, + indexListType, + true, false); + + Expr indexList_all = this->whereToIndexListAll[where]; + Expr indexListRhs = ir::Add::make(indexList_all, tempSize); + Stmt indexListDecl = ir::VarDecl::make(indexListArr, indexListRhs); + decls.push_back(indexListDecl); + + // Declare local indexList size variable + const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); + + // Declare local already set array (bit guard) + // TODO: emit as uint64 and manually emit bit pack code + const Datatype bitGuardType = taco::Bool; + const std::string bitGuardName = temporary.getName() + "_already_set"; + const Expr alreadySetArr = ir::Var::make(bitGuardName, + bitGuardType, + true, false); + Expr bitGuard_all = this->whereToBitGuardAll[where]; + Expr bitGuardRhs = ir::Add::make(bitGuard_all, tempSize); + Stmt bitGuardDecl = ir::VarDecl::make(alreadySetArr, bitGuardRhs); + decls.push_back(bitGuardDecl); + + tempToIndexList[temporary] = indexListArr; + tempToIndexListSize[temporary] = indexListSizeExpr; + tempToBitGuard[temporary] = alreadySetArr; + } + return decls; + } + + Stmt LowererImplC::initValues(Expr tensor, Expr initVal, Expr begin, Expr size) { + Expr lower = simplify(ir::Mul::make(begin, size)); + Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); + Expr p = Var::make("p" + util::toString(tensor), Int()); + Expr values = GetProperty::make(tensor, TensorProperty::Values); + Stmt zeroInit = Store::make(values, p, initVal); + LoopKind parallel = (isa(size) && + to(size)->getIntValue() < (1 << 10)) + ? LoopKind::Serial : LoopKind::Static_Chunked; + return For::make(p, lower, upper, 1, zeroInit, parallel); + } + + + Stmt LowererImplC::lowerForall(Forall forall) + { + bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); + bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + if (!ignoreVectorize && forallNeedsUnderivedGuards && + (forall.getParallelUnit() == ParallelUnit::CPUVector || + forall.getUnrollFactor() > 0)) { + return lowerForallCloned(forall); + } + + if (forall.getParallelUnit() != ParallelUnit::NotParallel) { + inParallelLoopDepth++; + } + + // Recover any available parents that were not recoverable previously + vector recoverySteps; + for (const IndexVar& varToRecover : provGraph.newlyRecoverableParents(forall.getIndexVar(), definedIndexVars)) { + // place pos guard + if (forallNeedsUnderivedGuards && provGraph.isCoordVariable(varToRecover) && + provGraph.getChildren(varToRecover).size() == 1 && + provGraph.isPosVariable(provGraph.getChildren(varToRecover)[0])) { + IndexVar posVar = provGraph.getChildren(varToRecover)[0]; + std::vector iterBounds = provGraph.deriveIterBounds(posVar, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + + Expr minGuard = Lt::make(indexVarToExprMap[posVar], iterBounds[0]); + Expr maxGuard = Gte::make(indexVarToExprMap[posVar], iterBounds[1]); + Expr guardCondition = Or::make(minGuard, maxGuard); + if (isa(ir::simplify(iterBounds[0])) && ir::simplify(iterBounds[0]).as()->equalsScalar(0)) { + guardCondition = maxGuard; + } + ir::Stmt guard = ir::IfThenElse::make(guardCondition, ir::Continue::make()); + recoverySteps.push_back(guard); + } + + Expr recoveredValue = provGraph.recoverVariable(varToRecover, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + 
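+      // Declare the recovered index variable and bind it to the value recovered above,
+      // before any guards or locator accessors that read it are emitted.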
taco_iassert(indexVarToExprMap.count(varToRecover)); + recoverySteps.push_back(VarDecl::make(indexVarToExprMap[varToRecover], recoveredValue)); + + // After we've recovered this index variable, some iterators are now + // accessible for use when declaring locator access variables. So, generate + // the accessors for those locator variables as part of the recovery process. + // This is necessary after a fuse transformation, for example: If we fuse + // two index variables (i, j) into f, then after we've generated the loop for + // f, all locate accessors for i and j are now available for use. + std::vector itersForVar; + for (auto& iters : iterators.levelIterators()) { + // Collect all level iterators that have locate and iterate over + // the recovered index variable. + if (iters.second.getIndexVar() == varToRecover && iters.second.hasLocate()) { + itersForVar.push_back(iters.second); + } + } + // Finally, declare all of the collected iterators' position access variables. + recoverySteps.push_back(this->declLocatePosVars(itersForVar)); + + // place underived guard + std::vector iterBounds = provGraph.deriveIterBounds(varToRecover, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + if (forallNeedsUnderivedGuards && underivedBounds.count(varToRecover) && + !provGraph.hasPosDescendant(varToRecover)) { + + // FIXME: [Olivia] Check this with someone + // Removed underived guard if indexVar is bounded is divisible by its split child indexVar + vector children = provGraph.getChildren(varToRecover); + bool hasDirectDivBound = false; + std::vector iterBoundsInner = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + for (auto& c: children) { + if (provGraph.hasExactBound(c) && + provGraph.derivationPath(varToRecover, c).size() == 2) { + const auto iterBoundsUnderivedChild = + provGraph.deriveIterBounds(c, definedIndexVarsOrdered, + underivedBounds, indexVarToExprMap, + iterators); + if (iterBoundsUnderivedChild[1].as()->getValue() % + iterBoundsInner[1].as()->getValue() == 0) { + hasDirectDivBound = true; + break; + } + } + } + if (!hasDirectDivBound) { + Stmt guard = IfThenElse::make(Gte::make(indexVarToExprMap[varToRecover], + underivedBounds[varToRecover][1]), + Continue::make()); + recoverySteps.push_back(guard); + } + } + + // If this index variable was divided into multiple equal chunks, then we + // must add an extra guard to make sure that further scheduling operations + // on descendent index variables exceed the bounds of each equal portion of + // the loop. For a concrete example, consider a loop of size 10 that is divided + // into two equal components -- 5 and 5. If the loop is then transformed + // with .split(..., 3), each inner chunk of 5 will be split into chunks of + // 3. Without an extra guard, the second chunk of 3 in the first group of 5 + // may attempt to perform an iteration for the second group of 5, which is + // incorrect. + if (this->provGraph.isDivided(varToRecover)) { + // Collect the children iteration variables. + auto children = this->provGraph.getChildren(varToRecover); + auto outer = children[0]; + auto inner = children[1]; + // Find the iteration bounds of the inner variable -- that is the size + // that the outer loop was broken into. + auto bounds = this->provGraph.deriveIterBounds(inner, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + // Use the difference between the bounds to find the size of the loop. 
+ auto dimLen = ir::Sub::make(bounds[1], bounds[0]); + // For a variable f divided into into f1 and f2, the guard ensures that + // for iteration f, f should be within f1 * dimLen and (f1 + 1) * dimLen. + auto guard = ir::Gte::make(this->indexVarToExprMap[varToRecover], ir::Mul::make(ir::Add::make(this->indexVarToExprMap[outer], 1), dimLen)); + recoverySteps.push_back(IfThenElse::make(guard, ir::Continue::make())); + } + } + Stmt recoveryStmt = Block::make(recoverySteps); + + taco_iassert(!definedIndexVars.count(forall.getIndexVar())); + definedIndexVars.insert(forall.getIndexVar()); + definedIndexVarsOrdered.push_back(forall.getIndexVar()); + + if (forall.getParallelUnit() != ParallelUnit::NotParallel) { + taco_iassert(!parallelUnitSizes.count(forall.getParallelUnit())); + taco_iassert(!parallelUnitIndexVars.count(forall.getParallelUnit())); + parallelUnitIndexVars[forall.getParallelUnit()] = forall.getIndexVar(); + vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + parallelUnitSizes[forall.getParallelUnit()] = ir::Sub::make(bounds[1], bounds[0]); + } + + MergeLattice caseLattice = MergeLattice::make(forall, iterators, provGraph, definedIndexVars, whereTempsToResult); + vector resultAccesses; + set reducedAccesses; + std::tie(resultAccesses, reducedAccesses) = getResultAccesses(forall); + + // Pre-allocate/initialize memory of value arrays that are full below this + // loops index variable + Stmt preInitValues = initResultArrays(forall.getIndexVar(), resultAccesses, + reducedAccesses); + + // Emit temporary initialization if forall is sequential or parallelized by + // cpu threads and leads to a where statement + // This is for workspace hoisting by 1-level + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + auto temp = temporaryInitialization.find(forall); + if (temp != temporaryInitialization.end() && forall.getParallelUnit() == + ParallelUnit::NotParallel && !isScalar(temp->second.getTemporary().getType())) + temporaryValuesInitFree = codeToInitializeTemporary(temp->second); + else if (temp != temporaryInitialization.end() && forall.getParallelUnit() == + ParallelUnit::CPUThread && !isScalar(temp->second.getTemporary().getType())) { + temporaryValuesInitFree = codeToInitializeTemporaryParallel(temp->second, forall.getParallelUnit()); + } + + Stmt loops; + // Emit a loop that iterates over over a single iterator (optimization) + if (caseLattice.iterators().size() == 1 && caseLattice.iterators()[0].isUnique()) { + MergeLattice loopLattice = caseLattice.getLoopLattice(); + + MergePoint point = loopLattice.points()[0]; + Iterator iterator = loopLattice.iterators()[0]; + + vector locators = point.locators(); + vector appenders; + vector inserters; + tie(appenders, inserters) = splitAppenderAndInserters(point.results()); + + std::vector underivedAncestors = provGraph.getUnderivedAncestors(iterator.getIndexVar()); + IndexVar posDescendant; + bool hasPosDescendant = false; + if (!underivedAncestors.empty()) { + hasPosDescendant = provGraph.getPosIteratorFullyDerivedDescendant(underivedAncestors[0], &posDescendant); + } + + bool isWhereProducer = false; + vector results = point.results(); + for (Iterator result : results) { + for (auto it = tensorVars.begin(); it != tensorVars.end(); it++) { + if (it->second == result.getTensor()) { + if (whereTempsToResult.count(it->first)) { + isWhereProducer = true; + break; + } + } + } + } + + // For now, this only works when consuming a single workspace. 
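+      // Sparse iteration over the dense workspace is only used when iterating a fully
+      // derived dimension variable with a single locator, and (checked below) when an
+      // index list was generated for that locator's temporary.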
+ bool canAccelWithSparseIteration = + provGraph.isFullyDerived(iterator.getIndexVar()) && + iterator.isDimensionIterator() && locators.size() == 1; + if (canAccelWithSparseIteration) { + bool indexListsExist = false; + // We are iterating over a dimension and locating into a temporary with a tracker to keep indices. Instead, we + // can just iterate over the indices and locate into the dense workspace. + for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { + if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { + indexListsExist = true; + break; + } + } + canAccelWithSparseIteration &= indexListsExist; + } + + if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, caseLattice, + reducedAccesses, recoveryStmt); + } + else if (canAccelWithSparseIteration) { + loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, caseLattice, reducedAccesses, recoveryStmt); + } + // Emit dimension coordinate iteration loop + else if (iterator.isDimensionIterator()) { + loops = lowerForallDimension(forall, point.locators(), inserters, appenders, caseLattice, + reducedAccesses, recoveryStmt); + } + // Emit position iteration loop + else if (iterator.hasPosIter()) { + loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, caseLattice, + reducedAccesses, recoveryStmt); + } + // Emit coordinate iteration loop + else { + taco_iassert(iterator.hasCoordIter()); +// taco_not_supported_yet + loops = Stmt(); + } + } + // Emit general loops to merge multiple iterators + else { + std::vector underivedAncestors = provGraph.getUnderivedAncestors(forall.getIndexVar()); + taco_iassert(underivedAncestors.size() == 1); // TODO: add support for fused coordinate of pos loop + loops = lowerMergeLattice(caseLattice, underivedAncestors[0], + forall.getStmt(), reducedAccesses); + } +// taco_iassert(loops.defined()); + + if (!generateComputeCode() && !hasStores(loops)) { + // If assembly loop does not modify output arrays, then it can be safely + // omitted. 
+ loops = Stmt(); + } + definedIndexVars.erase(forall.getIndexVar()); + definedIndexVarsOrdered.pop_back(); + if (forall.getParallelUnit() != ParallelUnit::NotParallel) { + inParallelLoopDepth--; + taco_iassert(parallelUnitSizes.count(forall.getParallelUnit())); + taco_iassert(parallelUnitIndexVars.count(forall.getParallelUnit())); + parallelUnitIndexVars.erase(forall.getParallelUnit()); + parallelUnitSizes.erase(forall.getParallelUnit()); + } + return Block::blanks(preInitValues, + temporaryValuesInitFree[0], + loops, + temporaryValuesInitFree[1]); + } + + Stmt LowererImplC::lowerForallCloned(Forall forall) { + // want to emit guards outside of loop to prevent unstructured loop exits + + // construct guard + // underived or pos variables that have a descendant that has not been defined yet + vector varsWithGuard; + for (auto var : provGraph.getAllIndexVars()) { + if (provGraph.isRecoverable(var, definedIndexVars)) { + continue; // already recovered + } + if (provGraph.isUnderived(var) && !provGraph.hasPosDescendant(var)) { // if there is pos descendant then will be guarded already + varsWithGuard.push_back(var); + } + else if (provGraph.isPosVariable(var)) { + // if parent is coord then this is variable that will be guarded when indexing into coord array + if(provGraph.getParents(var).size() == 1 && provGraph.isCoordVariable(provGraph.getParents(var)[0])) { + varsWithGuard.push_back(var); + } + } + } + + // determine min and max values for vars given already defined variables. + // we do a recovery where we fill in undefined variables with either 0's or the max of their iteration + std::map minVarValues; + std::map maxVarValues; + set definedForGuard = definedIndexVars; + vector guardRecoverySteps; + Expr maxOffset = 0; + bool setMaxOffset = false; + + for (auto var : varsWithGuard) { + std::vector currentDefinedVarOrder = definedIndexVarsOrdered; // TODO: get defined vars at time of this recovery + + std::map minChildValues = indexVarToExprMap; + std::map maxChildValues = indexVarToExprMap; + + for (auto child : provGraph.getFullyDerivedDescendants(var)) { + if (!definedIndexVars.count(child)) { + std::vector childBounds = provGraph.deriveIterBounds(child, currentDefinedVarOrder, underivedBounds, indexVarToExprMap, iterators); + + minChildValues[child] = childBounds[0]; + maxChildValues[child] = childBounds[1]; + + // recover new parents + for (const IndexVar& varToRecover : provGraph.newlyRecoverableParents(child, definedForGuard)) { + Expr recoveredValue = provGraph.recoverVariable(varToRecover, definedIndexVarsOrdered, underivedBounds, + minChildValues, iterators); + Expr maxRecoveredValue = provGraph.recoverVariable(varToRecover, definedIndexVarsOrdered, underivedBounds, + maxChildValues, iterators); + if (!setMaxOffset) { // TODO: work on simplifying this + maxOffset = ir::Add::make(maxOffset, ir::Sub::make(maxRecoveredValue, recoveredValue)); + setMaxOffset = true; + } + taco_iassert(indexVarToExprMap.count(varToRecover)); + + guardRecoverySteps.push_back(VarDecl::make(indexVarToExprMap[varToRecover], recoveredValue)); + definedForGuard.insert(varToRecover); + } + definedForGuard.insert(child); + } + } + + minVarValues[var] = provGraph.recoverVariable(var, currentDefinedVarOrder, underivedBounds, minChildValues, iterators); + maxVarValues[var] = provGraph.recoverVariable(var, currentDefinedVarOrder, underivedBounds, maxChildValues, iterators); + } + + // Build guards + Expr guardCondition; + for (auto var : varsWithGuard) { + std::vector iterBounds = provGraph.deriveIterBounds(var, 
definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + + Expr minGuard = Lt::make(minVarValues[var], iterBounds[0]); + Expr maxGuard = Gte::make(ir::Add::make(maxVarValues[var], ir::simplify(maxOffset)), iterBounds[1]); + Expr guardConditionCurrent = Or::make(minGuard, maxGuard); + + if (isa(ir::simplify(iterBounds[0])) && ir::simplify(iterBounds[0]).as()->equalsScalar(0)) { + guardConditionCurrent = maxGuard; + } + + if (guardCondition.defined()) { + guardCondition = Or::make(guardConditionCurrent, guardCondition); + } + else { + guardCondition = guardConditionCurrent; + } + } + + Stmt unvectorizedLoop; + + taco_uassert(guardCondition.defined()) + << "Unable to vectorize or unroll loop over unbound variable " << forall.getIndexVar(); + + // build loop with guards (not vectorized) + if (!varsWithGuard.empty()) { + ignoreVectorize = true; + unvectorizedLoop = lowerForall(forall); + ignoreVectorize = false; + } + + // build loop without guards + emitUnderivedGuards = false; + Stmt vectorizedLoop = lowerForall(forall); + emitUnderivedGuards = true; + + // return guarded loops + return Block::make(Block::make(guardRecoverySteps), IfThenElse::make(guardCondition, unvectorizedLoop, vectorizedLoop)); + } + + Stmt LowererImplC::lowerWhere(Where where) { + TensorVar temporary = where.getTemporary(); + bool accelerateDenseWorkSpace, sortAccelerator; + std::tie(accelerateDenseWorkSpace, sortAccelerator) = + canAccelerateDenseTemp(where); + + // Declare and initialize the where statement's temporary + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + bool temporaryHoisted = false; + for (auto it = temporaryInitialization.begin(); it != temporaryInitialization.end(); ++it) { + if (it->second == where && it->first.getParallelUnit() == + ParallelUnit::NotParallel && !isScalar(temporary.getType())) { + temporaryHoisted = true; + } else if (it->second == where && it->first.getParallelUnit() == + ParallelUnit::CPUThread && !isScalar(temporary.getType())) { + temporaryHoisted = true; + auto decls = codeToInitializeLocalTemporaryParallel(where, it->first.getParallelUnit()); + + temporaryValuesInitFree[0] = ir::Block::make(decls); + } + } + + if (!temporaryHoisted) { + temporaryValuesInitFree = codeToInitializeTemporary(where); + } + + Stmt initializeTemporary = temporaryValuesInitFree[0]; + Stmt freeTemporary = temporaryValuesInitFree[1]; + + match(where.getConsumer(), + std::function([&](const AssignmentNode* op) { + if (op->lhs.getTensorVar().getOrder() > 0) { + whereTempsToResult[where.getTemporary()] = (const AccessNode *) op->lhs.ptr; + } + }) + ); + + Stmt consumer = lower(where.getConsumer()); + if (accelerateDenseWorkSpace && sortAccelerator) { + // We need to sort the indices array + Expr listOfIndices = tempToIndexList.at(temporary); + Expr listOfIndicesSize = tempToIndexListSize.at(temporary); + Expr sizeOfElt = ir::Sizeof::make(listOfIndices.type()); + Stmt sortCall = ir::Sort::make({listOfIndices, listOfIndicesSize, sizeOfElt}); + consumer = Block::make(sortCall, consumer); + } + + // Now that temporary allocations are hoisted, we always need to emit an initialization loop before entering the + // producer but only if there is no dense acceleration + if (util::contains(needCompute, temporary) && !isScalar(temporary.getType()) && !accelerateDenseWorkSpace) { + // TODO: We only actually need to do this if: + // 1) We use the temporary multiple times + // 2) The PRODUCER RHS is sparse(not full). 
(Guarantees that old values are overwritten before consuming) + + Expr p = Var::make("p" + temporary.getName(), Int()); + Expr values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false); + Expr size = getTemporarySize(where); + Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + initializeTemporary = Block::make(initializeTemporary, loopInit); + } + + whereConsumers.push_back(consumer); + whereTemps.push_back(where.getTemporary()); + captureNextLocatePos = true; + + // don't apply atomics to producer TODO: mark specific assignments as atomic + bool restoreAtomicDepth = false; + if (markAssignsAtomicDepth > 0) { + markAssignsAtomicDepth--; + restoreAtomicDepth = true; + } + + Stmt producer = lower(where.getProducer()); + if (accelerateDenseWorkSpace) { + const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); + const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); + initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); + } + + if (restoreAtomicDepth) { + markAssignsAtomicDepth++; + } + + whereConsumers.pop_back(); + whereTemps.pop_back(); + whereTempsToResult.erase(where.getTemporary()); + return Block::make(initializeTemporary, producer, markAssignsAtomicDepth > 0 ? capturedLocatePos : ir::Stmt(), consumer, freeTemporary); + } + + + +} \ No newline at end of file diff --git a/src/lower/lowerer_impl_cuda.cpp b/src/lower/lowerer_impl_cuda.cpp new file mode 100644 index 000000000..32545f8d3 --- /dev/null +++ b/src/lower/lowerer_impl_cuda.cpp @@ -0,0 +1,885 @@ +// +// Created by 张 on 2022/3/10. +// + +#include "taco/lower/lowerer_impl_cuda.h" +#include +#include "taco/lower/lowerer_impl_imperative.h" + + +#include "taco/index_notation/index_notation.h" + +#include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_visitor.h" +#include "taco/index_notation/index_notation_rewriter.h" + +#include "taco/ir/ir.h" +#include "taco/ir/ir_generators.h" +#include "taco/ir/ir_visitor.h" +#include "taco/ir/simplify.h" +#include "taco/lower/iterator.h" +#include "taco/lower/merge_lattice.h" +#include "mode_access.h" +#include "taco/util/collections.h" + +#include "taco/ir/workspace_rewriter.h" + +using namespace std; +using namespace taco::ir; + +namespace taco { + class LowererImplCUDA::Visitor : public IndexNotationVisitorStrict { + public: + Visitor(LowererImplCUDA* impl):impl(impl) {} + Stmt lower(IndexStmt stmt) { + this->stmt = Stmt(); + impl->accessibleIterators.scope(); + IndexStmtVisitorStrict::visit(stmt); + impl->accessibleIterators.unscope(); + return this->stmt; + } + Expr lower(IndexExpr expr) { + this->expr = Expr(); + IndexExprVisitorStrict::visit(expr); + return this->expr; + } + + private: + LowererImplCUDA* impl; + Expr expr; + Stmt stmt; + using IndexNotationVisitorStrict::visit; + void visit(const AssignmentNode* node) { stmt = impl->lowerAssignment(node); } + void visit(const YieldNode* node) { stmt = impl->lowerYield(node); } + void visit(const ForallNode* node) { stmt = impl->lowerForall(node); } + void visit(const WhereNode* node) { stmt = impl->lowerWhere(node); } + void visit(const MultiNode* node) { stmt = impl->lowerMulti(node); } + void visit(const SuchThatNode* node) { stmt = impl->lowerSuchThat(node); } + void visit(const SequenceNode* node) { stmt = impl->lowerSequence(node); } + void visit(const AssembleNode* node) { 
stmt = impl->lowerAssemble(node); } + void visit(const AccessNode* node) { expr = impl->lowerAccess(node); } + void visit(const LiteralNode* node) { expr = impl->lowerLiteral(node); } + void visit(const NegNode* node) { expr = impl->lowerNeg(node); } + void visit(const AddNode* node) { expr = impl->lowerAdd(node); } + void visit(const SubNode* node) { expr = impl->lowerSub(node); } + void visit(const MulNode* node) { expr = impl->lowerMul(node); } + void visit(const DivNode* node) { expr = impl->lowerDiv(node); } + void visit(const SqrtNode* node) { expr = impl->lowerSqrt(node); } + void visit(const CastNode* node) { expr = impl->lowerCast(node); } + void visit(const CallIntrinsicNode* node) { expr = impl->lowerCallIntrinsic(node); } + void visit(const CallNode* node) { expr = impl->lowerTensorOp(node); } + void visit(const ReductionNode* node) { + taco_ierror << "Reduction nodes not supported in concrete index notation"; + } + void visit(const IndexVarNode* node) { expr = impl->lowerIndexVar(node); } + }; + LowererImplCUDA::LowererImplCUDA(): visitor(new Visitor(this)) {} + + static bool returnsTrue(IndexExpr expr) { + struct ReturnsTrue : public IndexExprRewriterStrict { + void visit(const AccessNode* op) { + if (op->isAccessingStructure) { + expr = op; + } + } + + void visit(const LiteralNode* op) { + if (op->getDataType() == Bool && op->getVal()) { + expr = op; + } + } + + void visit(const NegNode* op) { + expr = rewrite(op->a); + } + + void visit(const AddNode* op) { + if (rewrite(op->a).defined() || rewrite(op->b).defined()) { + expr = op; + } + } + + void visit(const MulNode* op) { + if (rewrite(op->a).defined() && rewrite(op->b).defined()) { + expr = op; + } + } + + void visit(const CastNode* op) { + expr = rewrite(op->a); + } + + void visit(const CallNode* op) { + const auto annihilator = findProperty(op->properties); + + if (!annihilator.defined() || !annihilator.positions().empty()) { + return; + } + + if (equals(annihilator.annihilator(), Literal(false))) { + for (const auto& arg : op->args) { + if (!rewrite(arg).defined()) { + return; + } + } + expr = op; + } else { + for (const auto& arg : op->args) { + if (rewrite(arg).defined()) { + expr = op; + return; + } + } + } + } + + void visit(const SqrtNode* op) {} + void visit(const SubNode* op) {} + void visit(const DivNode* op) {} + void visit(const CallIntrinsicNode* op) {} + void visit(const ReductionNode* op) {} + void visit(const IndexVarNode* op) {} + }; + return ReturnsTrue().rewrite(expr).defined(); + } + + static bool needComputeValues(IndexStmt stmt, TensorVar tensor) { + if (tensor.getType().getDataType() != Bool) { + return true; + } + + bool needComputeValue = false; + match(stmt, + function([&]( + const AssignmentNode* n, Matcher* m) { + if (n->lhs.getTensorVar() == tensor && !returnsTrue(n->rhs)) { + needComputeValue = true; + } + }) + ); + + return needComputeValue; + } + vector LowererImplCUDA::codeToInitializeDenseAcceleratorArrays(Where where, bool parallel) { + // if parallel == true, need to initialize dense accelerator arrays as size*numThreads + // and rename all dense accelerator arrays to name + '_all' + + TensorVar temporary = where.getTemporary(); + + // TODO: emit as uint64 and manually emit bit pack code + const Datatype bitGuardType = taco::Bool; + std::string bitGuardSuffix; + if (parallel) + bitGuardSuffix = "_already_set_all"; + else + bitGuardSuffix = "_already_set"; + const std::string bitGuardName = temporary.getName() + bitGuardSuffix; + + Expr bitGuardSize = getTemporarySize(where); + Expr 
maxThreads = ir::Call::make("omp_get_max_threads", {}, bitGuardSize.type()); + if (parallel) + bitGuardSize = ir::Mul::make(bitGuardSize, maxThreads); + + const Expr alreadySetArr = ir::Var::make(bitGuardName, + bitGuardType, + true, false); + + // TODO: TACO should probably keep state on if it can use int32 or if it should switch to + // using int64 for indices. This assumption is made in other places of taco. + const Datatype indexListType = taco::Int32; + std::string indexListSuffix; + if (parallel) + indexListSuffix = "_index_list_all"; + else + indexListSuffix = "_index_list"; + + const std::string indexListName = temporary.getName() + indexListSuffix; + const Expr indexListArr = ir::Var::make(indexListName, + indexListType, + true, false); + + // no decl for shared memory + Stmt alreadySetDecl = Stmt(); + Stmt indexListDecl = Stmt(); + Stmt freeTemps = Block::make(Free::make(indexListArr), Free::make(alreadySetArr)); + if ((isa(where.getProducer()) && inParallelLoopDepth == 0)) { + alreadySetDecl = VarDecl::make(alreadySetArr, ir::Literal::make(0)); + indexListDecl = VarDecl::make(indexListArr, ir::Literal::make(0)); + } + + if (parallel) { + whereToIndexListAll[where] = indexListArr; + whereToBitGuardAll[where] = alreadySetArr; + } else { + const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); + tempToIndexList[temporary] = indexListArr; + tempToIndexListSize[temporary] = indexListSizeExpr; + tempToBitGuard[temporary] = alreadySetArr; + } + + Stmt allocateIndexList = Allocate::make(indexListArr, bitGuardSize); + Stmt allocateAlreadySet = Allocate::make(alreadySetArr, bitGuardSize); + Expr p = Var::make("p" + temporary.getName(), Int()); + Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + + Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); + Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); + return {inits, freeTemps}; + + + } + + +// Returns true if the following conditions are met: +// 1) The temporary is a dense vector +// 2) There is only one value on the right hand side of the consumer +// -- We would need to handle sparse acceleration in the merge lattices for +// multiple operands on the RHS +// 3) The left hand side of the where consumer is sparse, if the consumer is an +// assignment +// 4) CPU Code is being generated (TEMPORARY - This should be removed) +// -- The sorting calls and calloc call in lower where are CPU specific. We +// could map calloc to a cudaMalloc and use a library like CUB to emit +// the sort. CUB support is built into CUDA 11 but not prior versions of +// CUDA so in that case, we'd probably need to include the CUB headers in +// the generated code. + std::pair LowererImplCUDA::canAccelerateDenseTemp(Where where) { + return std::make_pair(false, false); + } + +// Code to initialize the local temporary workspace from the shared workspace +// in codeToInitializeTemporaryParallel for a SINGLE parallel unit +// (e.g.) 
the local workspace that each thread uses + vector LowererImplCUDA::codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit) { + TensorVar temporary = where.getTemporary(); + vector decls; + + Expr tempSize = getTemporarySize(where); + Expr threadNum = ir::Call::make("omp_get_thread_num", {}, tempSize.type()); + tempSize = ir::Mul::make(tempSize, threadNum); + + bool accelerateDense = canAccelerateDenseTemp(where).first; + + Expr values; + if (util::contains(needCompute, temporary) && + needComputeValues(where, temporary)) { + // Declare local temporary workspace array + values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false); + Expr values_all = this->temporaryArrays[this->whereToTemporaryVar[where]].values; + Expr tempRhs = ir::Add::make(values_all, tempSize); + Stmt tempDecl = ir::VarDecl::make(values, tempRhs); + decls.push_back(tempDecl); + } + /// Make a struct object that lowerAssignment and lowerAccess can read + /// temporary value arrays from. + TemporaryArrays arrays; + arrays.values = values; + this->temporaryArrays.insert({temporary, arrays}); + + if (accelerateDense) { + // Declare local index list array + // TODO: TACO should probably keep state on if it can use int32 or if it should switch to + // using int64 for indices. This assumption is made in other places of taco. + const Datatype indexListType = taco::Int32; + const std::string indexListName = temporary.getName() + "_index_list"; + const Expr indexListArr = ir::Var::make(indexListName, + indexListType, + true, false); + + Expr indexList_all = this->whereToIndexListAll[where]; + Expr indexListRhs = ir::Add::make(indexList_all, tempSize); + Stmt indexListDecl = ir::VarDecl::make(indexListArr, indexListRhs); + decls.push_back(indexListDecl); + + // Declare local indexList size variable + const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); + + // Declare local already set array (bit guard) + // TODO: emit as uint64 and manually emit bit pack code + const Datatype bitGuardType = taco::Bool; + const std::string bitGuardName = temporary.getName() + "_already_set"; + const Expr alreadySetArr = ir::Var::make(bitGuardName, + bitGuardType, + true, false); + Expr bitGuard_all = this->whereToBitGuardAll[where]; + Expr bitGuardRhs = ir::Add::make(bitGuard_all, tempSize); + Stmt bitGuardDecl = ir::VarDecl::make(alreadySetArr, bitGuardRhs); + decls.push_back(bitGuardDecl); + + tempToIndexList[temporary] = indexListArr; + tempToIndexListSize[temporary] = indexListSizeExpr; + tempToBitGuard[temporary] = alreadySetArr; + } + return decls; + } + +// Code to initialize a temporary workspace that is SHARED across ALL parallel units. 
+// New temporaries are denoted by temporary.getName() + '_all' +// Currently only supports CPUThreads + vector LowererImplCUDA::codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit) { + TensorVar temporary = where.getTemporary(); + // For the parallel case, need to hoist up a workspace shared by all threads + TensorVar temporaryAll = TensorVar(temporary.getName() + "_all", temporary.getType(), temporary.getFormat()); + this->whereToTemporaryVar[where] = temporaryAll; + + bool accelerateDense = canAccelerateDenseTemp(where).first; + + Stmt freeTemporary = Stmt(); + Stmt initializeTemporary = Stmt(); + + // When emitting code to accelerate dense workspaces with sparse iteration, we need the following arrays + // to construct the result indices + if(accelerateDense) { + vector initAndFree = codeToInitializeDenseAcceleratorArrays(where, true); + initializeTemporary = initAndFree[0]; + freeTemporary = initAndFree[1]; + } + + Expr values; + if (util::contains(needCompute, temporary) && + needComputeValues(where, temporary)) { + values = ir::Var::make(temporaryAll.getName(), + temporaryAll.getType().getDataType(), + true, false); + Expr size = getTemporarySize(where); + Expr sizeAll = ir::Mul::make(size, ir::Call::make("omp_get_max_threads", {}, size.type())); + + // no decl needed for shared memory + Stmt decl = Stmt(); + if ((isa(where.getProducer()) && inParallelLoopDepth == 0)) { + decl = VarDecl::make(values, ir::Literal::make(0)); + } + Stmt allocate = Allocate::make(values, sizeAll); + + freeTemporary = Block::make(freeTemporary, Free::make(values)); + initializeTemporary = Block::make(decl, initializeTemporary, allocate); + } + /// Make a struct object that lowerAssignment and lowerAccess can read + /// temporary value arrays from. 
+ TemporaryArrays arrays; + arrays.values = values; + this->temporaryArrays.insert({temporaryAll, arrays}); + + return {initializeTemporary, freeTemporary}; + } + + vector LowererImplCUDA::codeToInitializeTemporary(Where where) { + TensorVar temporary = where.getTemporary(); + + const bool accelerateDense = canAccelerateDenseTemp(where).first; + + Stmt freeTemporary = Stmt(); + Stmt initializeTemporary = Stmt(); + if (isScalar(temporary.getType())) { + initializeTemporary = defineScalarVariable(temporary, true); + Expr tempSet = ir::Var::make(temporary.getName() + "_set", Datatype::Bool); + Stmt initTempSet = VarDecl::make(tempSet, false); + initializeTemporary = Block::make(initializeTemporary, initTempSet); + tempToBitGuard[temporary] = tempSet; + } else { + // TODO: Need to support keeping track of initialized elements for + // temporaries that don't have sparse accelerator + taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense); + + // When emitting code to accelerate dense workspaces with sparse iteration, we need the following arrays + // to construct the result indices + if(accelerateDense) { + vector initAndFree = codeToInitializeDenseAcceleratorArrays(where); + initializeTemporary = initAndFree[0]; + freeTemporary = initAndFree[1]; + } + + Expr values; + if (util::contains(needCompute, temporary) && + needComputeValues(where, temporary)) { + values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), true, false); + + Expr size = getTemporarySize(where); + + // no decl needed for shared memory + Stmt decl = Stmt(); + if ((isa(where.getProducer()) && inParallelLoopDepth == 0)) { + decl = VarDecl::make(values, ir::Literal::make(0)); + } + Stmt allocate = Allocate::make(values, size); + + freeTemporary = Block::make(freeTemporary, Free::make(values)); + initializeTemporary = Block::make(decl, initializeTemporary, allocate); + } + + /// Make a struct object that lowerAssignment and lowerAccess can read + /// temporary value arrays from. + TemporaryArrays arrays; + arrays.values = values; + this->temporaryArrays.insert({temporary, arrays}); + } + return {initializeTemporary, freeTemporary}; + } + Stmt LowererImplCUDA::initValues(Expr tensor, Expr initVal, Expr begin, Expr size) { + Expr lower = simplify(ir::Mul::make(begin, size)); + Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); + Expr p = Var::make("p" + util::toString(tensor), Int()); + Expr values = GetProperty::make(tensor, TensorProperty::Values); + Stmt zeroInit = Store::make(values, p, initVal); + LoopKind parallel = (isa(size) && + to(size)->getIntValue() < (1 << 10)) + ? 
LoopKind::Serial : LoopKind::Static_Chunked; + if (util::contains(parallelUnitSizes, ParallelUnit::GPUBlock)) { + return ir::VarDecl::make(ir::Var::make("status", Int()), + ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), + ir::Mul::make(ir::Sub::make(upper, lower), + ir::Literal::make(values.type().getNumBytes()))}, Int())); + } + return For::make(p, lower, upper, 1, zeroInit, parallel); + } + Stmt LowererImplCUDA::lowerForall(Forall forall) + { + bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); + bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + if (!ignoreVectorize && forallNeedsUnderivedGuards && + (forall.getParallelUnit() == ParallelUnit::CPUVector || + forall.getUnrollFactor() > 0)) { + return lowerForallCloned(forall); + } + + if (forall.getParallelUnit() != ParallelUnit::NotParallel) { + inParallelLoopDepth++; + } + + // Recover any available parents that were not recoverable previously + vector recoverySteps; + for (const IndexVar& varToRecover : provGraph.newlyRecoverableParents(forall.getIndexVar(), definedIndexVars)) { + // place pos guard + if (forallNeedsUnderivedGuards && provGraph.isCoordVariable(varToRecover) && + provGraph.getChildren(varToRecover).size() == 1 && + provGraph.isPosVariable(provGraph.getChildren(varToRecover)[0])) { + IndexVar posVar = provGraph.getChildren(varToRecover)[0]; + std::vector iterBounds = provGraph.deriveIterBounds(posVar, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + + Expr minGuard = Lt::make(indexVarToExprMap[posVar], iterBounds[0]); + Expr maxGuard = Gte::make(indexVarToExprMap[posVar], iterBounds[1]); + Expr guardCondition = Or::make(minGuard, maxGuard); + if (isa(ir::simplify(iterBounds[0])) && ir::simplify(iterBounds[0]).as()->equalsScalar(0)) { + guardCondition = maxGuard; + } + ir::Stmt guard = ir::IfThenElse::make(guardCondition, ir::Continue::make()); + recoverySteps.push_back(guard); + } + + Expr recoveredValue = provGraph.recoverVariable(varToRecover, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + taco_iassert(indexVarToExprMap.count(varToRecover)); + recoverySteps.push_back(VarDecl::make(indexVarToExprMap[varToRecover], recoveredValue)); + + // After we've recovered this index variable, some iterators are now + // accessible for use when declaring locator access variables. So, generate + // the accessors for those locator variables as part of the recovery process. + // This is necessary after a fuse transformation, for example: If we fuse + // two index variables (i, j) into f, then after we've generated the loop for + // f, all locate accessors for i and j are now available for use. + std::vector itersForVar; + for (auto& iters : iterators.levelIterators()) { + // Collect all level iterators that have locate and iterate over + // the recovered index variable. + if (iters.second.getIndexVar() == varToRecover && iters.second.hasLocate()) { + itersForVar.push_back(iters.second); + } + } + // Finally, declare all of the collected iterators' position access variables. 
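Stepping back to the initValues override a few lines above: in generated code it takes one of two shapes. The host-side sketch below uses a hypothetical double-typed values array a_vals and is only an illustration of the emitted calls, not the lowerer's literal output.

#include <cuda_runtime.h>
#include <cstddef>

void init_values_sketch(double* a_vals, int begin, int size) {
  int lower = begin * size;
  int upper = (begin + 1) * size;
  // With a GPUBlock parallel unit in scope, a single cudaMemset replaces the
  // element-wise loop (only valid for an all-zero fill value).
  int status = cudaMemset(a_vals, 0, (size_t) (upper - lower) * sizeof(double));
  (void) status;
  // Otherwise a serial or statically chunked loop over the range is emitted:
  // for (int p = lower; p < upper; p++) a_vals[p] = 0.0;
}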
+ recoverySteps.push_back(this->declLocatePosVars(itersForVar)); + + // place underived guard + std::vector iterBounds = provGraph.deriveIterBounds(varToRecover, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + if (forallNeedsUnderivedGuards && underivedBounds.count(varToRecover) && + !provGraph.hasPosDescendant(varToRecover)) { + + // FIXME: [Olivia] Check this with someone + // Removed underived guard if indexVar is bounded is divisible by its split child indexVar + vector children = provGraph.getChildren(varToRecover); + bool hasDirectDivBound = false; + std::vector iterBoundsInner = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + for (auto& c: children) { + if (provGraph.hasExactBound(c) && + provGraph.derivationPath(varToRecover, c).size() == 2) { + const auto iterBoundsUnderivedChild = + provGraph.deriveIterBounds(c, definedIndexVarsOrdered, + underivedBounds, indexVarToExprMap, + iterators); + if (iterBoundsUnderivedChild[1].as()->getValue() % + iterBoundsInner[1].as()->getValue() == 0) { + hasDirectDivBound = true; + break; + } + } + } + if (!hasDirectDivBound) { + Stmt guard = IfThenElse::make(Gte::make(indexVarToExprMap[varToRecover], + underivedBounds[varToRecover][1]), + Continue::make()); + recoverySteps.push_back(guard); + } + } + + // If this index variable was divided into multiple equal chunks, then we + // must add an extra guard to make sure that further scheduling operations + // on descendent index variables exceed the bounds of each equal portion of + // the loop. For a concrete example, consider a loop of size 10 that is divided + // into two equal components -- 5 and 5. If the loop is then transformed + // with .split(..., 3), each inner chunk of 5 will be split into chunks of + // 3. Without an extra guard, the second chunk of 3 in the first group of 5 + // may attempt to perform an iteration for the second group of 5, which is + // incorrect. + if (this->provGraph.isDivided(varToRecover)) { + // Collect the children iteration variables. + auto children = this->provGraph.getChildren(varToRecover); + auto outer = children[0]; + auto inner = children[1]; + // Find the iteration bounds of the inner variable -- that is the size + // that the outer loop was broken into. + auto bounds = this->provGraph.deriveIterBounds(inner, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + // Use the difference between the bounds to find the size of the loop. + auto dimLen = ir::Sub::make(bounds[1], bounds[0]); + // For a variable f divided into into f1 and f2, the guard ensures that + // for iteration f, f should be within f1 * dimLen and (f1 + 1) * dimLen. 
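Written out as plain C, the guard built by the statement immediately below looks roughly like this; f, f1, dimLen, and total_len are illustrative names, not the lowerer's actual identifiers.

#include <cstdint>

void divided_loop_guard_sketch(int32_t f1, int32_t dimLen, int32_t total_len) {
  for (int32_t f = f1 * dimLen; f < total_len; f++) {
    // Guard for a divided index variable: f must stay inside
    // [f1 * dimLen, (f1 + 1) * dimLen); anything past that belongs to the
    // next equal portion of the original loop and is skipped here.
    if (f >= (f1 + 1) * dimLen) {
      continue;
    }
    // ... body of the divided loop ...
  }
}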
+ auto guard = ir::Gte::make(this->indexVarToExprMap[varToRecover], ir::Mul::make(ir::Add::make(this->indexVarToExprMap[outer], 1), dimLen)); + recoverySteps.push_back(IfThenElse::make(guard, ir::Continue::make())); + } + } + Stmt recoveryStmt = Block::make(recoverySteps); + + taco_iassert(!definedIndexVars.count(forall.getIndexVar())); + definedIndexVars.insert(forall.getIndexVar()); + definedIndexVarsOrdered.push_back(forall.getIndexVar()); + + if (forall.getParallelUnit() != ParallelUnit::NotParallel) { + taco_iassert(!parallelUnitSizes.count(forall.getParallelUnit())); + taco_iassert(!parallelUnitIndexVars.count(forall.getParallelUnit())); + parallelUnitIndexVars[forall.getParallelUnit()] = forall.getIndexVar(); + vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + parallelUnitSizes[forall.getParallelUnit()] = ir::Sub::make(bounds[1], bounds[0]); + } + + MergeLattice caseLattice = MergeLattice::make(forall, iterators, provGraph, definedIndexVars, whereTempsToResult); + vector resultAccesses; + set reducedAccesses; + std::tie(resultAccesses, reducedAccesses) = getResultAccesses(forall); + + // Pre-allocate/initialize memory of value arrays that are full below this + // loops index variable + Stmt preInitValues = initResultArrays(forall.getIndexVar(), resultAccesses, + reducedAccesses); + + // Emit temporary initialization if forall is sequential or parallelized by + // cpu threads and leads to a where statement + // This is for workspace hoisting by 1-level + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + auto temp = temporaryInitialization.find(forall); + if (temp != temporaryInitialization.end() && forall.getParallelUnit() == + ParallelUnit::NotParallel && !isScalar(temp->second.getTemporary().getType())) + temporaryValuesInitFree = codeToInitializeTemporary(temp->second); + else if (temp != temporaryInitialization.end() && forall.getParallelUnit() == + ParallelUnit::CPUThread && !isScalar(temp->second.getTemporary().getType())) { + temporaryValuesInitFree = codeToInitializeTemporaryParallel(temp->second, forall.getParallelUnit()); + } + + Stmt loops; + // Emit a loop that iterates over over a single iterator (optimization) + if (caseLattice.iterators().size() == 1 && caseLattice.iterators()[0].isUnique()) { + MergeLattice loopLattice = caseLattice.getLoopLattice(); + + MergePoint point = loopLattice.points()[0]; + Iterator iterator = loopLattice.iterators()[0]; + + vector locators = point.locators(); + vector appenders; + vector inserters; + tie(appenders, inserters) = splitAppenderAndInserters(point.results()); + + std::vector underivedAncestors = provGraph.getUnderivedAncestors(iterator.getIndexVar()); + IndexVar posDescendant; + bool hasPosDescendant = false; + if (!underivedAncestors.empty()) { + hasPosDescendant = provGraph.getPosIteratorFullyDerivedDescendant(underivedAncestors[0], &posDescendant); + } + + bool isWhereProducer = false; + vector results = point.results(); + for (Iterator result : results) { + for (auto it = tensorVars.begin(); it != tensorVars.end(); it++) { + if (it->second == result.getTensor()) { + if (whereTempsToResult.count(it->first)) { + isWhereProducer = true; + break; + } + } + } + } + + // For now, this only works when consuming a single workspace. 
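This is the case the index list and bit-guard arrays from codeToInitializeDenseAcceleratorArrays are built for. The self-contained sketch below uses hypothetical names (w, w_already_set, w_index_list) and a plain qsort standing in for the emitted sort call: the producer records each coordinate the first time it is written, and the consumer then walks only those coordinates instead of the full dense dimension.

#include <cstdint>
#include <cstdlib>

static int cmp_int32(const void* a, const void* b) {
  int32_t x = *(const int32_t*) a;
  int32_t y = *(const int32_t*) b;
  return (x > y) - (x < y);
}

void dense_workspace_accel_sketch(int nnz, const int32_t* coords, const double* vals,
                                  double* w, bool* w_already_set,
                                  int32_t* w_index_list, double* y) {
  int32_t w_index_list_size = 0;
  // Producer: scatter into the dense workspace, recording each coordinate the
  // first time it is touched (the bit guard prevents duplicate entries).
  for (int p = 0; p < nnz; p++) {
    int32_t j = coords[p];
    if (!w_already_set[j]) {
      w_index_list[w_index_list_size++] = j;
      w_already_set[j] = true;
    }
    w[j] += vals[p];
  }
  // lowerWhere emits a sort of the index list when ordered output is required.
  qsort(w_index_list, (size_t) w_index_list_size, sizeof(int32_t), cmp_int32);
  // Consumer: gather only the coordinates that were actually written.
  for (int32_t p = 0; p < w_index_list_size; p++) {
    y[w_index_list[p]] = w[w_index_list[p]];
  }
}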
+ bool canAccelWithSparseIteration = + provGraph.isFullyDerived(iterator.getIndexVar()) && + iterator.isDimensionIterator() && locators.size() == 1; + if (canAccelWithSparseIteration) { + bool indexListsExist = false; + // We are iterating over a dimension and locating into a temporary with a tracker to keep indices. Instead, we + // can just iterate over the indices and locate into the dense workspace. + for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { + if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { + indexListsExist = true; + break; + } + } + canAccelWithSparseIteration &= indexListsExist; + } + + if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, caseLattice, + reducedAccesses, recoveryStmt); + } + else if (canAccelWithSparseIteration) { + loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, caseLattice, reducedAccesses, recoveryStmt); + } + // Emit dimension coordinate iteration loop + else if (iterator.isDimensionIterator()) { + loops = lowerForallDimension(forall, point.locators(), inserters, appenders, caseLattice, + reducedAccesses, recoveryStmt); + } + // Emit position iteration loop + else if (iterator.hasPosIter()) { + loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, caseLattice, + reducedAccesses, recoveryStmt); + } + // Emit coordinate iteration loop + else { + taco_iassert(iterator.hasCoordIter()); +// taco_not_supported_yet + loops = Stmt(); + } + } + // Emit general loops to merge multiple iterators + else { + std::vector underivedAncestors = provGraph.getUnderivedAncestors(forall.getIndexVar()); + taco_iassert(underivedAncestors.size() == 1); // TODO: add support for fused coordinate of pos loop + loops = lowerMergeLattice(caseLattice, underivedAncestors[0], + forall.getStmt(), reducedAccesses); + } +// taco_iassert(loops.defined()); + + if (!generateComputeCode() && !hasStores(loops)) { + // If assembly loop does not modify output arrays, then it can be safely + // omitted. 
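For context on the dispatch above, the two most common loop shapes it chooses between look as follows in generated C: a dimension (coordinate) loop walks every coordinate of a dense level, while a position loop walks only the stored entries of a compressed level. The A2_pos/A2_crd/A_vals names follow TACO's usual generated naming but are shown here only as an assumed example.

#include <cstdint>

double csr_row_sum_sketch(int32_t i, const int32_t* A2_pos,
                          const int32_t* A2_crd, const double* A_vals) {
  double sum = 0.0;
  // Position iteration (lowerForallPosition): loop over the stored entries of row i.
  for (int32_t jA = A2_pos[i]; jA < A2_pos[i + 1]; jA++) {
    int32_t j = A2_crd[jA];  // coordinate recovered from the crd array
    (void) j;
    sum += A_vals[jA];
  }
  // A dimension iteration (lowerForallDimension) would instead be:
  //   for (int32_t j = 0; j < A2_dimension; j++) { ... }
  return sum;
}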
+ loops = Stmt(); + } + definedIndexVars.erase(forall.getIndexVar()); + definedIndexVarsOrdered.pop_back(); + if (forall.getParallelUnit() != ParallelUnit::NotParallel) { + inParallelLoopDepth--; + taco_iassert(parallelUnitSizes.count(forall.getParallelUnit())); + taco_iassert(parallelUnitIndexVars.count(forall.getParallelUnit())); + parallelUnitIndexVars.erase(forall.getParallelUnit()); + parallelUnitSizes.erase(forall.getParallelUnit()); + } + return Block::blanks(preInitValues, + temporaryValuesInitFree[0], + loops, + temporaryValuesInitFree[1]); + } + + Stmt LowererImplCUDA::lowerForallCloned(Forall forall) { + // want to emit guards outside of loop to prevent unstructured loop exits + + // construct guard + // underived or pos variables that have a descendant that has not been defined yet + vector varsWithGuard; + for (auto var : provGraph.getAllIndexVars()) { + if (provGraph.isRecoverable(var, definedIndexVars)) { + continue; // already recovered + } + if (provGraph.isUnderived(var) && !provGraph.hasPosDescendant(var)) { // if there is pos descendant then will be guarded already + varsWithGuard.push_back(var); + } + else if (provGraph.isPosVariable(var)) { + // if parent is coord then this is variable that will be guarded when indexing into coord array + if(provGraph.getParents(var).size() == 1 && provGraph.isCoordVariable(provGraph.getParents(var)[0])) { + varsWithGuard.push_back(var); + } + } + } + + // determine min and max values for vars given already defined variables. + // we do a recovery where we fill in undefined variables with either 0's or the max of their iteration + std::map minVarValues; + std::map maxVarValues; + set definedForGuard = definedIndexVars; + vector guardRecoverySteps; + Expr maxOffset = 0; + bool setMaxOffset = false; + + for (auto var : varsWithGuard) { + std::vector currentDefinedVarOrder = definedIndexVarsOrdered; // TODO: get defined vars at time of this recovery + + std::map minChildValues = indexVarToExprMap; + std::map maxChildValues = indexVarToExprMap; + + for (auto child : provGraph.getFullyDerivedDescendants(var)) { + if (!definedIndexVars.count(child)) { + std::vector childBounds = provGraph.deriveIterBounds(child, currentDefinedVarOrder, underivedBounds, indexVarToExprMap, iterators); + + minChildValues[child] = childBounds[0]; + maxChildValues[child] = childBounds[1]; + + // recover new parents + for (const IndexVar& varToRecover : provGraph.newlyRecoverableParents(child, definedForGuard)) { + Expr recoveredValue = provGraph.recoverVariable(varToRecover, definedIndexVarsOrdered, underivedBounds, + minChildValues, iterators); + Expr maxRecoveredValue = provGraph.recoverVariable(varToRecover, definedIndexVarsOrdered, underivedBounds, + maxChildValues, iterators); + if (!setMaxOffset) { // TODO: work on simplifying this + maxOffset = ir::Add::make(maxOffset, ir::Sub::make(maxRecoveredValue, recoveredValue)); + setMaxOffset = true; + } + taco_iassert(indexVarToExprMap.count(varToRecover)); + + guardRecoverySteps.push_back(VarDecl::make(indexVarToExprMap[varToRecover], recoveredValue)); + definedForGuard.insert(varToRecover); + } + definedForGuard.insert(child); + } + } + + minVarValues[var] = provGraph.recoverVariable(var, currentDefinedVarOrder, underivedBounds, minChildValues, iterators); + maxVarValues[var] = provGraph.recoverVariable(var, currentDefinedVarOrder, underivedBounds, maxChildValues, iterators); + } + + // Build guards + Expr guardCondition; + for (auto var : varsWithGuard) { + std::vector iterBounds = 
provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + + Expr minGuard = Lt::make(minVarValues[var], iterBounds[0]); + Expr maxGuard = Gte::make(ir::Add::make(maxVarValues[var], ir::simplify(maxOffset)), iterBounds[1]); + Expr guardConditionCurrent = Or::make(minGuard, maxGuard); + + if (isa(ir::simplify(iterBounds[0])) && ir::simplify(iterBounds[0]).as()->equalsScalar(0)) { + guardConditionCurrent = maxGuard; + } + + if (guardCondition.defined()) { + guardCondition = Or::make(guardConditionCurrent, guardCondition); + } + else { + guardCondition = guardConditionCurrent; + } + } + + Stmt unvectorizedLoop; + + taco_uassert(guardCondition.defined()) + << "Unable to vectorize or unroll loop over unbound variable " << forall.getIndexVar(); + + // build loop with guards (not vectorized) + if (!varsWithGuard.empty()) { + ignoreVectorize = true; + unvectorizedLoop = lowerForall(forall); + ignoreVectorize = false; + } + + // build loop without guards + emitUnderivedGuards = false; + Stmt vectorizedLoop = lowerForall(forall); + emitUnderivedGuards = true; + + // return guarded loops + return Block::make(Block::make(guardRecoverySteps), IfThenElse::make(guardCondition, unvectorizedLoop, vectorizedLoop)); + } + Stmt LowererImplCUDA::lowerWhere(Where where) { + TensorVar temporary = where.getTemporary(); + bool accelerateDenseWorkSpace, sortAccelerator; + std::tie(accelerateDenseWorkSpace, sortAccelerator) = + canAccelerateDenseTemp(where); + + // Declare and initialize the where statement's temporary + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + bool temporaryHoisted = false; + for (auto it = temporaryInitialization.begin(); it != temporaryInitialization.end(); ++it) { + if (it->second == where && it->first.getParallelUnit() == + ParallelUnit::NotParallel && !isScalar(temporary.getType())) { + temporaryHoisted = true; + } + } + + if (!temporaryHoisted) { + temporaryValuesInitFree = codeToInitializeTemporary(where); + } + + Stmt initializeTemporary = temporaryValuesInitFree[0]; + Stmt freeTemporary = temporaryValuesInitFree[1]; + + match(where.getConsumer(), + std::function([&](const AssignmentNode* op) { + if (op->lhs.getTensorVar().getOrder() > 0) { + whereTempsToResult[where.getTemporary()] = (const AccessNode *) op->lhs.ptr; + } + }) + ); + + Stmt consumer = lower(where.getConsumer()); + if (accelerateDenseWorkSpace && sortAccelerator) { + // We need to sort the indices array + Expr listOfIndices = tempToIndexList.at(temporary); + Expr listOfIndicesSize = tempToIndexListSize.at(temporary); + Expr sizeOfElt = ir::Sizeof::make(listOfIndices.type()); + Stmt sortCall = ir::Sort::make({listOfIndices, listOfIndicesSize, sizeOfElt}); + consumer = Block::make(sortCall, consumer); + } + + // Now that temporary allocations are hoisted, we always need to emit an initialization loop before entering the + // producer but only if there is no dense acceleration + if (util::contains(needCompute, temporary) && !isScalar(temporary.getType()) && !accelerateDenseWorkSpace) { + // TODO: We only actually need to do this if: + // 1) We use the temporary multiple times + // 2) The PRODUCER RHS is sparse(not full). 
(Guarantees that old values are overwritten before consuming) + + Expr p = Var::make("p" + temporary.getName(), Int()); + Expr values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false); + Expr size = getTemporarySize(where); + Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + initializeTemporary = Block::make(initializeTemporary, loopInit); + } + + whereConsumers.push_back(consumer); + whereTemps.push_back(where.getTemporary()); + captureNextLocatePos = true; + + // don't apply atomics to producer TODO: mark specific assignments as atomic + bool restoreAtomicDepth = false; + if (markAssignsAtomicDepth > 0) { + markAssignsAtomicDepth--; + restoreAtomicDepth = true; + } + + Stmt producer = lower(where.getProducer()); + if (accelerateDenseWorkSpace) { + const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); + const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); + initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); + } + + if (restoreAtomicDepth) { + markAssignsAtomicDepth++; + } + + whereConsumers.pop_back(); + whereTemps.pop_back(); + whereTempsToResult.erase(where.getTemporary()); + return Block::make(initializeTemporary, producer, markAssignsAtomicDepth > 0 ? capturedLocatePos : ir::Stmt(), consumer, freeTemporary); + } + + + +} \ No newline at end of file diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 8109dc20f..d03ca2e50 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1828,20 +1828,13 @@ Stmt LowererImplImperative::lowerMergeCases(ir::Expr coordinate, IndexVar coordi vector inserters; tie(appenders, inserters) = splitAppenderAndInserters(loopLattice.results()); - if (loopLattice.iterators().size() == 1) { - // Just one iterator, so no conditionals needed + // If loo + if (loopLattice.iterators().size() == 1 || (loopLattice.exact() && + isa(stmt) && returnsTrue(stmt.as().getRhs()))) { + // Just one iterator so no conditional taco_iassert(!loopLattice.points()[0].isOmitter()); - Stmt body = lowerForallBody(coordinate, stmt, {}, inserters, appenders, - loopLattice, reducedAccesses, mergeStrategy); - result.push_back(body); - } - else if (loopLattice.exact() && isa(stmt) && - returnsTrue(stmt.as().getRhs())) { - // All cases require the same computation, so no conditionals needed - taco_iassert(!loopLattice.points()[0].isOmitter()); - Stmt body = lowerForallBody(coordinate, stmt, {}, inserters, appenders, - MergeLattice({loopLattice.points()[0]}), - reducedAccesses, mergeStrategy); + Stmt body = lowerForallBody(coordinate, stmt, {}, inserters, + appenders, loopLattice, reducedAccesses, mergeStrategy); result.push_back(body); } else if (!loopLattice.points().empty()) { @@ -2035,8 +2028,10 @@ Stmt LowererImplImperative::lowerMergeCasesWithExplicitZeroChecks(ir::Expr coord MergeStrategy mergeStrategy) { vector result; - if (lattice.points().size() == 1 && lattice.iterators().size() == 1) { - // Just one iterator, so no conditional needed + if (lattice.points().size() == 1 && lattice.iterators().size() == 1 + || (lattice.exact() && + isa(stmt) && returnsTrue(stmt.as().getRhs()))) { + // Just one iterator so no conditional vector appenders; vector inserters; tie(appenders, inserters) = splitAppenderAndInserters(lattice.results()); @@ -2044,17 +2039,6 @@ Stmt 
LowererImplImperative::lowerMergeCasesWithExplicitZeroChecks(ir::Expr coord Stmt body = lowerForallBody(coordinate, stmt, {}, inserters, appenders, lattice, reducedAccesses, mergeStrategy); result.push_back(body); - } else if (lattice.exact() && isa(stmt) && - returnsTrue(stmt.as().getRhs())) { - // All cases require the same computation, so no conditionals needed - vector appenders; - vector inserters; - tie(appenders, inserters) = splitAppenderAndInserters(lattice.results()); - taco_iassert(!lattice.points()[0].isOmitter()); - Stmt body = lowerForallBody(coordinate, stmt, {}, inserters, appenders, - MergeLattice({lattice.points()[0]}), - reducedAccesses, mergeStrategy); - result.push_back(body); } else if (!lattice.points().empty()) { map iteratorToConditionMap; @@ -2318,7 +2302,9 @@ std::pair LowererImplImperative::canAccelerateDenseTemp(Where where) return resultVar == tempVar[0] || provGraph.isDerivedFrom(tempVar[0], resultVar); }); - + if (resultVars.size() == 0){ + return std::make_pair(false, false); + } if (it == resultVars.end()) { return std::make_pair(true, false); } @@ -3062,7 +3048,7 @@ Stmt LowererImplImperative::initResultArrays(vector writes, taco_iassert(!iterators.empty()); Expr tensor = getTensorVar(write.getTensorVar()); - Expr fill = lower(write.getTensorVar().getFill()); + Expr fill = GetProperty::make(tensor, TensorProperty::FillValue); Expr valuesArr = GetProperty::make(tensor, TensorProperty::Values); bool clearValuesAllocation = false; @@ -3230,7 +3216,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector writes vector result; for (auto& write : writes) { Expr tensor = getTensorVar(write.getTensorVar()); - Expr fill = lower(write.getTensorVar().getFill()); + Expr fill = GetProperty::make(tensor, TensorProperty::FillValue); Expr values = GetProperty::make(tensor, TensorProperty::Values); vector iterators = getIteratorsFrom(var, getIterators(write)); diff --git a/src/storage/file_io_mtx.cpp b/src/storage/file_io_mtx.cpp index 33bd85258..bc0b4c686 100644 --- a/src/storage/file_io_mtx.cpp +++ b/src/storage/file_io_mtx.cpp @@ -125,6 +125,9 @@ TensorBase dispatchReadSparse(std::istream& stream, const T& format, values.push_back(val); } + + + // Create matrix TensorBase tensor(type(), dimensions, format); if (symm) diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index a16ab11f9..6b7866e4a 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -10,14 +10,30 @@ #include "codegen/codegen.h" #include "taco/lower/lower.h" #include "op_factory.h" +#include "taco/lower/lowerer_impl_c.h" +#include "taco/lower/lowerer_impl_cuda.h" +#include "taco/lower/lowerer_impl_imperative.h" using namespace taco; const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); int WARP_SIZE = 32; -void printToCout(IndexStmt stmt) { +enum Platform { + c, + cuda +}; + +void printToCout(IndexStmt stmt, Platform platform=c) { std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); + ir::Stmt compute; + switch (platform) { + case c: + compute = lower(stmt, "compute", false, true, false, false, Lowerer(new LowererImplC())); + break; + case cuda: + compute = lower(stmt, "compute", false, true, false, false, Lowerer(new LowererImplCUDA())); + break; + } codegen->compile(compute, true); } @@ -395,9 +411,7 @@ IndexStmt exampleScheduleSPMVPosIteration(IndexStmt stmt, Tensor A) { } TEST(scheduling_eval, test_spmvCPU_temp) { - if 
(should_use_CUDA_codegen()) { - return; - } + int NUM_I = 1021/10; int NUM_J = 1039/10; float SPARSITY = .3; @@ -428,7 +442,8 @@ TEST(scheduling_eval, test_spmvCPU_temp) { IndexStmt stmt = y.getAssignment().concretize(); stmt = stmt.parallelize(i, ParallelUnit::CPUThread, OutputRaceStrategy::Atomics); - //printToFile("test_spmvCPU_temp", stmt); + //set_CUDA_codegen_enabled(1); + printToCout(stmt, Platform(c)); y.compile(stmt); y.assemble(); @@ -442,6 +457,41 @@ TEST(scheduling_eval, test_spmvCPU_temp) { ASSERT_TENSOR_EQ(expected, y); } +TEST(scheduling_eval, spmvGPU_temp) { + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .01; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(94353); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVGPU(stmt, A, precomputed); + set_CUDA_codegen_enabled(1); + printToCout(stmt, Platform(cuda)); + +} + TEST(scheduling_eval, test_sptvCPU_temp) { if (should_use_CUDA_codegen()) { return; diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp index 80fd5f3f5..b23bb7089 100644 --- a/test/tests-workspaces.cpp +++ b/test/tests-workspaces.cpp @@ -7,121 +7,145 @@ #include "taco/index_notation/index_notation.h" #include "codegen/codegen.h" #include "taco/lower/lower.h" - +#include "fstream" using namespace taco; -TEST(workspaces, tile_vecElemMul_NoTail) { - - Tensor A("A", {16}, Format{Dense}); - Tensor B("B", {16}, Format{Dense}); - Tensor C("C", {16}, Format{Dense}); - - for (int i = 0; i < 16; i++) { - A.insert({i}, (double) i); - B.insert({i}, (double) i); - } - - A.pack(); - B.pack(); - - IndexVar i("i"); - IndexVar i_bounded("i_bounded"); - IndexVar i0("i0"), i1("i1"); - IndexExpr precomputedExpr = B(i) * C(i); - A(i) = precomputedExpr; - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); - stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) - .split(i_bounded, i0, i1, 4) - .precompute(precomputedExpr, i1, i1, precomputed); - - A.compile(stmt); - A.assemble(); - A.compute(); - - Tensor expected("expected", {16}, Format{Dense}); - expected(i) = B(i) * C(i); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - -TEST(workspaces, tile_vecElemMul_Tail1) { - - Tensor A("A", {16}, Format{Dense}); - Tensor B("B", {16}, Format{Dense}); - Tensor C("C", {16}, Format{Dense}); - - for (int i = 0; i < 16; i++) { - A.insert({i}, (double) i); - B.insert({i}, (double) i); - } - - A.pack(); - B.pack(); - - IndexVar i("i"); - IndexVar i_bounded("i_bounded"); - IndexVar i0("i0"), i1("i1"); - IndexExpr precomputedExpr = B(i) * C(i); - A(i) = precomputedExpr; - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); - stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) - .split(i_bounded, i0, i1, 5) - .precompute(precomputedExpr, i1, i1, precomputed); - - 
A.compile(stmt.concretize()); - A.assemble(); - A.compute(); - - Tensor expected("expected", {16}, Format{Dense}); - expected(i) = B(i) * C(i); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - -TEST(workspaces, tile_vecElemMul_Tail2) { - - Tensor A("A", {17}, Format{Dense}); - Tensor B("B", {17}, Format{Dense}); - Tensor C("C", {17}, Format{Dense}); - - for (int i = 0; i < 17; i++) { - A.insert({i}, (double) i); - B.insert({i}, (double) i); - } - - A.pack(); - B.pack(); - - IndexVar i("i"); - IndexVar i_bounded("i_bounded"); - IndexVar i0("i0"), i1("i1"); - IndexExpr precomputedExpr = B(i) * C(i); - A(i) = precomputedExpr; - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); - stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) - .split(i_bounded, i0, i1, 4) - .precompute(precomputedExpr, i1, i1, precomputed); - - A.compile(stmt.concretize()); - A.assemble(); - A.compute(); - - Tensor expected("expected", {17}, Format{Dense}); - expected(i) = B(i) * C(i); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); +namespace Temptest { + void _printIRtoFile(const string& filename, const IndexStmt& stmt) { + stringstream source; + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + + ofstream source_file; + string file_ending=".txt"; + source_file.open(file_path + filename + file_ending); + ir::IRPrinter irp = ir::IRPrinter(source_file); + source_file< codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + ir::IRPrinter irp = ir::IRPrinter(cout); + irp.print(compute); + } + + TEST(workspaces, tile_vecElemMul_NoTail) { + + Tensor A("A", {16}, Format{Dense}); + Tensor B("B", {16}, Format{Dense}); + Tensor C("C", {16}, Format{Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt); + A.assemble(); + A.compute(); + _printToCout(stmt); + Tensor expected("expected", {16}, Format{Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + } + + TEST(workspaces, tile_vecElemMul_Tail1) { + + Tensor A("A", {16}, Format{Dense}); + Tensor B("B", {16}, Format{Dense}); + Tensor C("C", {16}, Format{Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, 
BoundType::MaxExact) + .split(i_bounded, i0, i1, 5) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, Format{Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + } + + TEST(workspaces, tile_vecElemMul_Tail2) { + + Tensor A("A", {17}, Format{Dense}); + Tensor B("B", {17}, Format{Dense}); + Tensor C("C", {17}, Format{Dense}); + + for (int i = 0; i < 17; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {17}, Format{Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); // ir::IRPrinter irp = ir::IRPrinter(cout); // @@ -133,45 +157,45 @@ TEST(workspaces, tile_vecElemMul_Tail2) { // irp.print(compute); // cout << endl; // codegen->compile(compute, false); -} - -TEST(workspaces, tile_denseMatMul) { - - Tensor A("A", {16}, Format{Dense}); - Tensor B("B", {16}, Format{Dense}); - Tensor C("C", {16}, Format{Dense}); - - for (int i = 0; i < 16; i++) { - B.insert({i}, (double) i); - C.insert({i}, (double) i); - } - - A.pack(); - B.pack(); - - IndexVar i("i"); - IndexVar i_bounded("i_bounded"); - IndexVar i0("i0"), i1("i1"); - IndexExpr precomputedExpr = B(i) * C(i); - A(i) = precomputedExpr; - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); - stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) - .split(i_bounded, i0, i1, 4); - - stmt = stmt.precompute(precomputedExpr, i1, i1, precomputed); - - A.compile(stmt.concretize()); - A.assemble(); - A.compute(); - - Tensor expected("expected", {16}, Format{Dense}); - expected(i) = B(i) * C(i); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + } + + TEST(workspaces, tile_denseMatMul) { + + Tensor A("A", {16}, Format{Dense}); + Tensor B("B", {16}, Format{Dense}); + Tensor C("C", {16}, Format{Dense}); + + for (int i = 0; i < 16; i++) { + B.insert({i}, (double) i); + C.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4); + + stmt = stmt.precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, Format{Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); // ir::IRPrinter irp = ir::IRPrinter(cout); // @@ -183,426 +207,427 @@ 
TEST(workspaces, tile_denseMatMul) { // irp.print(compute); // cout << endl; // codegen->compile(compute, false); - -} - -TEST(workspaces, precompute2D_add) { - int N = 16; - Tensor A("A", {N, N}, Format{Dense, Dense}); - Tensor B("B", {N, N}, Format{Dense, Dense}); - Tensor C("C", {N, N}, Format{Dense, Dense}); - Tensor D("D", {N, N}, Format{Dense, Dense}); - - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - B.insert({i, j}, (double) i); - C.insert({i, j}, (double) j); - D.insert({i, j}, (double) i*j); + } - } - - IndexVar i("i"), j("j"); - IndexExpr precomputedExpr = B(i, j) + C(i, j); - A(i, j) = precomputedExpr + D(i, j); - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar ws("ws", Type(Float64, {(size_t)N, (size_t)N}), Format{Dense, Dense}); - stmt = stmt.precompute(precomputedExpr, {i, j}, {i, j}, ws); - - A.compile(stmt.concretize()); - A.assemble(); - A.compute(); - - Tensor expected("expected", {N, N}, Format{Dense, Dense}); - expected(i, j) = B(i, j) + C(i, j) + D(i, j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); - -} - -TEST(workspaces, precompute4D_add) { - int N = 16; - Tensor A("A", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor C("C", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor D("D", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { - for (int l = 0; l < N; l++) { - B.insert({i, j, k, l}, (double) i + j); - C.insert({i, j, k, l}, (double) j * k); - D.insert({i, j, k, l}, (double) k * l); + + TEST(workspaces, precompute2D_add) { + int N = 16; + Tensor A("A", {N, N}, Format{Dense, Dense}); + Tensor B("B", {N, N}, Format{Dense, Dense}); + Tensor C("C", {N, N}, Format{Dense, Dense}); + Tensor D("D", {N, N}, Format{Dense, Dense}); + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + B.insert({i, j}, (double) i); + C.insert({i, j}, (double) j); + D.insert({i, j}, (double) i * j); + } } - } + + IndexVar i("i"), j("j"); + IndexExpr precomputedExpr = B(i, j) + C(i, j); + A(i, j) = precomputedExpr + D(i, j); + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar ws("ws", Type(Float64, {(size_t) N, (size_t) N}), Format{Dense, Dense}); + stmt = stmt.precompute(precomputedExpr, {i, j}, {i, j}, ws); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {N, N}, Format{Dense, Dense}); + expected(i, j) = B(i, j) + C(i, j) + D(i, j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + } - } - - IndexVar i("i"), j("j"), k("k"), l("l"); - IndexExpr precomputedExpr = B(i, j, k, l) + C(i, j, k, l); - A(i, j, k, l) = precomputedExpr + D(i, j, k, l); - - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar ws1("ws1", Type(Float64, {(size_t)N, (size_t)N, (size_t)N, (size_t)N}), - Format{Dense, Dense, Dense, Dense}); - TensorVar ws2("ws2", Type(Float64, {(size_t)N, (size_t)N, (size_t)N, (size_t)N}), - Format{Dense, Dense, Dense, Dense}); - stmt = stmt.precompute(precomputedExpr, {i, j, k, l}, {i, j, k, l}, ws1) - .precompute(ws1(i, j, k, l) + D(i, j, k, l), {i, j, k, l}, {i, j, k ,l}, ws2); - - A.compile(stmt.concretize()); - A.assemble(); - A.compute(); - - Tensor expected("expected", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - expected(i, j, k, l) = B(i, j, k, l) + C(i, j, k, l) + D(i, 
j, k, l); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - -TEST(workspaces, precompute4D_multireduce) { - int N = 16; - Tensor A("A", {N, N}, Format{Dense, Dense}); - Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor C("C", {N, N, N}, Format{Dense, Dense, Dense}); - Tensor D("D", {N, N}, Format{Dense, Dense}); - - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { - for (int l = 0; l < N; l++) { - B.insert({i, j, k, l}, (double) k*l); - C.insert({i, j, k}, (double) j * k); - D.insert({i, j}, (double) i+j); + + TEST(workspaces, precompute4D_add) { + int N = 16; + Tensor A("A", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + Tensor C("C", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + Tensor D("D", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + B.insert({i, j, k, l}, (double) i + j); + C.insert({i, j, k, l}, (double) j * k); + D.insert({i, j, k, l}, (double) k * l); + } + } + } } - } + + IndexVar i("i"), j("j"), k("k"), l("l"); + IndexExpr precomputedExpr = B(i, j, k, l) + C(i, j, k, l); + A(i, j, k, l) = precomputedExpr + D(i, j, k, l); + + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar ws1("ws1", Type(Float64, {(size_t) N, (size_t) N, (size_t) N, (size_t) N}), + Format{Dense, Dense, Dense, Dense}); + TensorVar ws2("ws2", Type(Float64, {(size_t) N, (size_t) N, (size_t) N, (size_t) N}), + Format{Dense, Dense, Dense, Dense}); + stmt = stmt.precompute(precomputedExpr, {i, j, k, l}, {i, j, k, l}, ws1) + .precompute(ws1(i, j, k, l) + D(i, j, k, l), {i, j, k, l}, {i, j, k, l}, ws2); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + expected(i, j, k, l) = B(i, j, k, l) + C(i, j, k, l) + D(i, j, k, l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); } - } - - IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); - IndexExpr precomputedExpr = B(i, j, k, l) * C(k, l, m); - A(i, j) = precomputedExpr * D(m, n); - - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar ws1("ws1", Type(Float64, {(size_t)N, (size_t)N, (size_t)N}), Format{Dense, Dense, Dense}); - TensorVar ws2("ws2", Type(Float64, {(size_t)N, (size_t)N}), Format{Dense, Dense}); - stmt = stmt.precompute(precomputedExpr, {i, j, m}, {i, j, m}, ws1) - .precompute(ws1(i, j, m) * D(m, n), {i, j}, {i, j}, ws2); - - A.compile(stmt.concretize()); - A.assemble(); - A.compute(); - - Tensor expected("expected", {N, N}, Format{Dense, Dense}); - expected(i, j) = B(i, j, k, l) * C(k, l, m) * D(m, n); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - -TEST(workspaces, precompute3D_TspV) { - int N = 16; - Tensor A("A", {N, N}, Format{Dense, Dense}); - Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor c("c", {N}, Format{Sparse}); - - for (int i = 0; i < N; i++) { - c.insert({i}, (double) i); - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { - for (int l = 0; l < N; l++) { - B.insert({i, j, k, l}, (double) i + j); + + TEST(workspaces, precompute4D_multireduce) { + int N = 16; + Tensor A("A", {N, N}, Format{Dense, Dense}); + Tensor B("B", {N, N, N, N}, 
Format{Dense, Dense, Dense, Dense}); + Tensor C("C", {N, N, N}, Format{Dense, Dense, Dense}); + Tensor D("D", {N, N}, Format{Dense, Dense}); + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + B.insert({i, j, k, l}, (double) k * l); + C.insert({i, j, k}, (double) j * k); + D.insert({i, j}, (double) i + j); + } + } + } } - } + + IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); + IndexExpr precomputedExpr = B(i, j, k, l) * C(k, l, m); + A(i, j) = precomputedExpr * D(m, n); + + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar ws1("ws1", Type(Float64, {(size_t) N, (size_t) N, (size_t) N}), Format{Dense, Dense, Dense}); + TensorVar ws2("ws2", Type(Float64, {(size_t) N, (size_t) N}), Format{Dense, Dense}); + stmt = stmt.precompute(precomputedExpr, {i, j, m}, {i, j, m}, ws1) + .precompute(ws1(i, j, m) * D(m, n), {i, j}, {i, j}, ws2); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {N, N}, Format{Dense, Dense}); + expected(i, j) = B(i, j, k, l) * C(k, l, m) * D(m, n); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); } - } - - IndexVar i("i"), j("j"), k("k"), l("l"); - IndexExpr precomputedExpr = B(i, j, k, l) * c(l); - A(i, j) = precomputedExpr * c(k); - - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar ws("ws", Type(Float64, {(size_t)N, (size_t)N, (size_t)N}), Format{Dense, Dense, Dense}); - stmt = stmt.precompute(precomputedExpr, {i, j, k}, {i, j, k}, ws); - stmt = stmt.concretize(); - - A.compile(stmt); - A.assemble(); - A.compute(); - - Tensor expected("expected", {N, N}, Format{Dense, Dense}); - expected(i, j) = (B(i, j, k, l) * c(l)) * c(k); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); - -} - -TEST(workspaces, precompute3D_multipleWS) { - int N = 16; - Tensor A("A", {N, N}, Format{Dense, Dense}); - Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor c("c", {N}, Format{Sparse}); - - for (int i = 0; i < N; i++) { - c.insert({i}, (double) i); - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { - for (int l = 0; l < N; l++) { - B.insert({i, j, k, l}, (double) i + j); + + TEST(workspaces, precompute3D_TspV) { + int N = 16; + Tensor A("A", {N, N}, Format{Dense, Dense}); + Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + Tensor c("c", {N}, Format{Sparse}); + + for (int i = 0; i < N; i++) { + c.insert({i}, (double) i); + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + B.insert({i, j, k, l}, (double) i + j); + } + } + } } - } + + IndexVar i("i"), j("j"), k("k"), l("l"); + IndexExpr precomputedExpr = B(i, j, k, l) * c(l); + A(i, j) = precomputedExpr * c(k); + + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar ws("ws", Type(Float64, {(size_t) N, (size_t) N, (size_t) N}), Format{Dense, Dense, Dense}); + stmt = stmt.precompute(precomputedExpr, {i, j, k}, {i, j, k}, ws); + stmt = stmt.concretize(); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {N, N}, Format{Dense, Dense}); + expected(i, j) = (B(i, j, k, l) * c(l)) * c(k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + } - } - - IndexVar i("i"), j("j"), k("k"), l("l"); - IndexExpr precomputedExpr = B(i, j, k, l) * c(l); - IndexExpr precomputedExpr2 = precomputedExpr * c(k); - A(i, j) = 
precomputedExpr2; - - - IndexStmt stmt = A.getAssignment().concretize(); - TensorVar ws("ws", Type(Float64, {(size_t)N, (size_t)N, (size_t)N}), Format{Dense, Dense, Dense}); - TensorVar t("t", Type(Float64, {(size_t) N, (size_t)N}), Format{Dense, Dense}); - stmt = stmt.precompute(precomputedExpr, {i, j, k}, {i, j, k}, ws); - - stmt = stmt.precompute(ws(i, j, k) * c(k), {i, j}, {i, j}, t); - stmt = stmt.concretize(); - - A.compile(stmt); - A.assemble(); - A.compute(); - - Tensor expected("expected", {N, N}, Format{Dense, Dense}); - expected(i, j) = (B(i, j, k, l) * c(l)) * c(k); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); - -} - -TEST(workspaces, precompute3D_renamedIVars_TspV) { - int N = 16; - Tensor A("A", {N, N}, Format{Dense, Dense}); - Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); - Tensor c("c", {N}, Format{Sparse}); - - for (int i = 0; i < N; i++) { - c.insert({i}, (double) i); - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { - for (int l = 0; l < N; l++) { - B.insert({i, j, k, l}, (double) i + j); + + TEST(workspaces, precompute3D_multipleWS) { + int N = 16; + Tensor A("A", {N, N}, Format{Dense, Dense}); + Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + Tensor c("c", {N}, Format{Sparse}); + + for (int i = 0; i < N; i++) { + c.insert({i}, (double) i); + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + B.insert({i, j, k, l}, (double) i + j); + } + } + } } - } + + IndexVar i("i"), j("j"), k("k"), l("l"); + IndexExpr precomputedExpr = B(i, j, k, l) * c(l); + IndexExpr precomputedExpr2 = precomputedExpr * c(k); + A(i, j) = precomputedExpr2; + + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar ws("ws", Type(Float64, {(size_t) N, (size_t) N, (size_t) N}), Format{Dense, Dense, Dense}); + TensorVar t("t", Type(Float64, {(size_t) N, (size_t) N}), Format{Dense, Dense}); + stmt = stmt.precompute(precomputedExpr, {i, j, k}, {i, j, k}, ws); + + stmt = stmt.precompute(ws(i, j, k) * c(k), {i, j}, {i, j}, t); + stmt = stmt.concretize(); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {N, N}, Format{Dense, Dense}); + expected(i, j) = (B(i, j, k, l) * c(l)) * c(k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + } + + TEST(workspaces, precompute3D_renamedIVars_TspV) { + int N = 16; + Tensor A("A", {N, N}, Format{Dense, Dense}); + Tensor B("B", {N, N, N, N}, Format{Dense, Dense, Dense, Dense}); + Tensor c("c", {N}, Format{Sparse}); + + for (int i = 0; i < N; i++) { + c.insert({i}, (double) i); + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + B.insert({i, j, k, l}, (double) i + j); + } + } + } + } + + IndexVar i("i"), j("j"), k("k"), l("l"); + IndexExpr precomputedExpr = B(i, j, k, l) * c(l); + A(i, j) = precomputedExpr * c(k); + + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar ws("ws", Type(Float64, {(size_t) N, (size_t) N, (size_t) N}), + Format{Dense, Dense, Dense}); + + IndexVar iw("iw"), jw("jw"), kw("kw"); + stmt = stmt.precompute(precomputedExpr, {i, j, k}, {iw, jw, kw}, ws); + stmt = stmt.concretize(); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {N, N}, Format{Dense, Dense}); + expected(i, j) = (B(i, j, k, l) * c(l)) * c(k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + } + + 
+    TEST(workspaces, DISABLED_tile_dotProduct_1) {
+        // FIXME: Disabled because currently the precompute algorithm does not appropriately
+        // find the correct forall substmt to nest the WhereNode in after i has been
+        // split into i0 and i1. As an example, the first precompute below is incorrect
+        // since it should transform
+        // forall(i0, forall(i1, A() += B(i) * C(i))) -->
+        // forall(i0, where(forall(i1, A() += ws(i1)), forall(i1, ws(i1) += B(i) * C(i))))
+        //
+        // But currently the algorithm does
+        // forall(i0, forall(i1, A() += B(i) * C(i))) -->
+        // where(forall(i1, A() += ws(i1)), forall(i0, forall(i1, ws(i1) += B(i) * C(i))))
+
+        int N = 1024;
+        Tensor<double> A("A");
+        Tensor<double> B("B", {N}, Format({Dense}));
+        Tensor<double> C("C", {N}, Format({Dense}));
+
+        for (int i = 0; i < N; i++) {
+            B.insert({i}, (double) i);
+            C.insert({i}, (double) i);
+        }
+
+        B.pack();
+        C.pack();
+
+        IndexVar i("i");
+        IndexVar i_bounded("i_bounded");
+        IndexVar i0("i0"), i1("i1");
+        IndexExpr BExpr = B(i);
+        IndexExpr CExpr = C(i);
+        IndexExpr precomputedExpr = (BExpr) * (CExpr);
+        A() = precomputedExpr;
+
+        IndexStmt stmt = A.getAssignment().concretize();
+        TensorVar B_new("B_new", Type(Float64, {(size_t) N}), taco::dense);
+        TensorVar C_new("C_new", Type(Float64, {(size_t) N}), taco::dense);
+        TensorVar precomputed("precomputed", Type(Float64, {(size_t) N}), taco::dense);
+
+        stmt = stmt.bound(i, i_bounded, (size_t) N, BoundType::MaxExact)
+                   .split(i_bounded, i0, i1, 32);
+        stmt = stmt.precompute(precomputedExpr, i1, i1, precomputed);
+        stmt = stmt.precompute(BExpr, i1, i1, B_new)
+                   .precompute(CExpr, i1, i1, C_new);
+
+        stmt = stmt.concretize();
+
+        A.compile(stmt);
+        A.assemble();
+        A.compute();
+
+        ir::IRPrinter irp = ir::IRPrinter(cout);
+
+        cout << stmt << endl;
+
+        std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
+        ir::Stmt compute = lower(stmt, "compute", false, true);
+
+        irp.print(compute);
+        cout << endl;
+        codegen->compile(compute, false);
+
+        Tensor<double> expected("expected");
+        expected() = B(i) * C(i);
+        expected.compile();
+        expected.assemble();
+        expected.compute();
+        ASSERT_TENSOR_EQ(expected, A);
+    }
+
+    TEST(workspaces, DISABLED_tile_dotProduct_2) {
+        // FIXME: This is also currently disabled since split(...) scheduling commands
+        // only split on the FIRST INSTANCE of an indexVar (assumes only one).
+        // This is wrong if the indexVar is not renamed across iw_vars since an indexVar can
+        // then occur on BOTH the consumer and producer side and should be split across both.
+
+        int N = 1024;
+        Tensor<double> A("A");
+        Tensor<double> B("B", {N}, Format({Dense}));
+        Tensor<double> C("C", {N}, Format({Dense}));
+
+        for (int i = 0; i < N; i++) {
+            B.insert({i}, (double) i);
+            C.insert({i}, (double) i);
+        }
+
+        B.pack();
+        C.pack();
+
+        IndexVar i("i");
+        IndexVar i_bounded("i_bounded");
+        IndexVar i0("i0"), i1("i1");
+        IndexExpr BExpr = B(i);
+        IndexExpr CExpr = C(i);
+        IndexExpr precomputedExpr = (BExpr) * (CExpr);
+        A() = precomputedExpr;
+
+        IndexStmt stmt = A.getAssignment().concretize();
+        TensorVar B_new("B_new", Type(Float64, {(size_t) N}), taco::dense);
+        TensorVar C_new("C_new", Type(Float64, {(size_t) N}), taco::dense);
+        TensorVar precomputed("precomputed", Type(Float64, {(size_t) N}), taco::dense);
+
+        stmt = stmt.precompute(precomputedExpr, i, i, precomputed);
+
+        stmt = stmt.precompute(BExpr, i, i, B_new)
+                   .precompute(CExpr, i, i, C_new);
+
+        stmt = stmt.bound(i, i_bounded, (size_t) N, BoundType::MaxExact)
+                   .split(i_bounded, i0, i1, 32);
+
+        stmt = stmt.concretize();
+
+        A.compile(stmt);
+        A.assemble();
+        A.compute();
+
+        Tensor<double> expected("expected");
+        expected() = B(i) * C(i);
+        expected.compile();
+        expected.assemble();
+        expected.compute();
+        ASSERT_TENSOR_EQ(expected, A);
+    }
+
+    TEST(workspaces, tile_dotProduct_3) {
+        int N = 1024;
+        Tensor<double> A("A");
+        Tensor<double> B("B", {N}, Format({Dense}));
+        Tensor<double> C("C", {N}, Format({Dense}));
+
+        for (int i = 0; i < N; i++) {
+            B.insert({i}, (double) i);
+            C.insert({i}, (double) i);
+        }
+
+        B.pack();
+        C.pack();
+
+        IndexVar i("i");
+        IndexVar i_bounded("i_bounded");
+        IndexVar i0("i0"), i1("i1");
+        IndexExpr BExpr = B(i);
+        IndexExpr CExpr = C(i);
+        IndexExpr precomputedExpr = (BExpr) * (CExpr);
+        A() = precomputedExpr;
+
+        IndexStmt stmt = A.getAssignment().concretize();
+        TensorVar B_new("B_new", Type(Float64, {(size_t) N}), taco::dense);
+        TensorVar C_new("C_new", Type(Float64, {(size_t) N}), taco::dense);
+        TensorVar precomputed("precomputed", Type(Float64, {(size_t) N}), taco::dense);
+
+        stmt = stmt.bound(i, i_bounded, (size_t) N, BoundType::MaxExact)
+                   .split(i_bounded, i0, i1, 32);
+        stmt = stmt.precompute(precomputedExpr, i0, i0, precomputed);
+
+        stmt = stmt.precompute(BExpr, i1, i1, B_new)
+                   .precompute(CExpr, i1, i1, C_new);
+
+
+        stmt = stmt.concretize();
+
+        A.compile(stmt);
+        A.assemble();
+        A.compute();
+
+        Tensor<double> expected("expected");
+        expected() = B(i) * C(i);
+        expected.compile();
+        expected.assemble();
+        expected.compute();
+        ASSERT_TENSOR_EQ(expected, A);
     }
-  }
-
-  IndexVar i("i"), j("j"), k("k"), l("l");
-  IndexExpr precomputedExpr = B(i, j, k, l) * c(l);
-  A(i, j) = precomputedExpr * c(k);
-
-
-  IndexStmt stmt = A.getAssignment().concretize();
-  TensorVar ws("ws", Type(Float64, {(size_t)N, (size_t)N, (size_t)N}),
-               Format{Dense, Dense, Dense});
-
-  IndexVar iw("iw"), jw("jw"), kw("kw");
-  stmt = stmt.precompute(precomputedExpr, {i, j, k}, {iw, jw, kw}, ws);
-  stmt = stmt.concretize();
-
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
-
-  Tensor<double> expected("expected", {N, N}, Format{Dense, Dense});
-  expected(i, j) = (B(i, j, k, l) * c(l)) * c(k);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
-
-}
-
-TEST(workspaces, DISABLED_tile_dotProduct_1) {
-  // FIXME: Disabled because currently the precompute algorithm does not appropriately
-  // find the correct forall substmt to nest the WhereNode in after i has been
-  // split into i0 and i1. As an example, the first precompute below is incorrect
-  // since it should transform
-  // forall(i0, forall(i1, A() += B(i) * C(i))) -->
-  // forall(i0, where(forall(i1, A() += ws(i1)), forall(i1, ws(i1) += B(i) * C(i))))
-  //
-  // But currently the algorithm does
-  // forall(i0, forall(i1, A() += B(i) * C(i))) -->
-  // where(forall(i1, A() += ws(i1)), forall(i0, forall(i1, ws(i1) += B(i) * C(i))))
-
-  int N = 1024;
-  Tensor<double> A("A");
-  Tensor<double> B("B", {N}, Format({Dense}));
-  Tensor<double> C("C", {N}, Format({Dense}));
-
-  for (int i = 0; i < N; i++) {
-    B.insert({i}, (double) i);
-    C.insert({i}, (double) i);
-  }
-
-  B.pack();
-  C.pack();
-
-  IndexVar i("i");
-  IndexVar i_bounded("i_bounded");
-  IndexVar i0("i0"), i1("i1");
-  IndexExpr BExpr = B(i);
-  IndexExpr CExpr = C(i);
-  IndexExpr precomputedExpr = (BExpr) * (CExpr);
-  A() = precomputedExpr;
-
-  IndexStmt stmt = A.getAssignment().concretize();
-  TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
-  TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
-  TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);
-
-  stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
-             .split(i_bounded, i0, i1, 32);
-  stmt = stmt.precompute(precomputedExpr, i1, i1, precomputed);
-  stmt = stmt.precompute(BExpr, i1, i1, B_new)
-             .precompute(CExpr, i1, i1, C_new);
-
-  stmt = stmt.concretize();
-
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
-
-  ir::IRPrinter irp = ir::IRPrinter(cout);
-
-  cout << stmt << endl;
-
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute", false, true);
-
-  irp.print(compute);
-  cout << endl;
-  codegen->compile(compute, false);
-
-  Tensor<double> expected("expected");
-  expected() = B(i) * C(i);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
-}
-
-TEST(workspaces, DISABLED_tile_dotProduct_2) {
-  // FIXME: This is also currently disabled since split(...) scheduling commands
-  // only split on the FIRST INSTANCE of an indexVar (assumes only one).
-  // This is wrong if the indexVar is not renamed across iw_vars since an indexVar can
-  // then occur on BOTH the consumer and producer side and should be split across both.
-
-  int N = 1024;
-  Tensor<double> A("A");
-  Tensor<double> B("B", {N}, Format({Dense}));
-  Tensor<double> C("C", {N}, Format({Dense}));
-
-  for (int i = 0; i < N; i++) {
-    B.insert({i}, (double) i);
-    C.insert({i}, (double) i);
-  }
-
-  B.pack();
-  C.pack();
-
-  IndexVar i("i");
-  IndexVar i_bounded("i_bounded");
-  IndexVar i0("i0"), i1("i1");
-  IndexExpr BExpr = B(i);
-  IndexExpr CExpr = C(i);
-  IndexExpr precomputedExpr = (BExpr) * (CExpr);
-  A() = precomputedExpr;
-
-  IndexStmt stmt = A.getAssignment().concretize();
-  TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
-  TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
-  TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);
-
-  stmt = stmt.precompute(precomputedExpr, i, i, precomputed);
-
-  stmt = stmt.precompute(BExpr, i, i, B_new)
-             .precompute(CExpr, i, i, C_new);
-
-  stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
-             .split(i_bounded, i0, i1, 32);
-
-  stmt = stmt.concretize();
-
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
-
-  Tensor<double> expected("expected");
-  expected() = B(i) * C(i);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
-}
-
-TEST(workspaces, tile_dotProduct_3) {
-  int N = 1024;
-  Tensor<double> A("A");
-  Tensor<double> B("B", {N}, Format({Dense}));
-  Tensor<double> C("C", {N}, Format({Dense}));
-
-  for (int i = 0; i < N; i++) {
-    B.insert({i}, (double) i);
-    C.insert({i}, (double) i);
-  }
-
-  B.pack();
-  C.pack();
-
-  IndexVar i("i");
-  IndexVar i_bounded("i_bounded");
-  IndexVar i0("i0"), i1("i1");
-  IndexExpr BExpr = B(i);
-  IndexExpr CExpr = C(i);
-  IndexExpr precomputedExpr = (BExpr) * (CExpr);
-  A() = precomputedExpr;
-
-  IndexStmt stmt = A.getAssignment().concretize();
-  TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
-  TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
-  TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);
-
-  stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
-             .split(i_bounded, i0, i1, 32);
-  stmt = stmt.precompute(precomputedExpr, i0, i0, precomputed);
-
-  stmt = stmt.precompute(BExpr, i1, i1, B_new)
-             .precompute(CExpr, i1, i1, C_new);
-
-
-  stmt = stmt.concretize();
-
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
-
-  Tensor<double> expected("expected");
-  expected() = B(i) * C(i);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
-}
+}
\ No newline at end of file
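// Illustrative sketch, not part of the change: the tiling recipe that tile_dotProduct_3
// above relies on, written as a standalone program. The reduction variable is bounded
// and split first, and precompute is then applied to the resulting loop variables so
// each workspace is attached to a single loop. The taco.h include and the names mirror
// the tests above and are placeholder choices only.
#include "taco.h"
using namespace taco;

void tiledDotProduct() {
  int N = 1024;
  Tensor<double> A("A");                          // scalar result
  Tensor<double> B("B", {N}, Format({Dense}));
  Tensor<double> C("C", {N}, Format({Dense}));
  for (int x = 0; x < N; x++) {
    B.insert({x}, (double) x);
    C.insert({x}, (double) x);
  }
  B.pack();
  C.pack();

  IndexVar i("i"), i_bounded("i_bounded"), i0("i0"), i1("i1");
  IndexExpr BExpr = B(i);
  IndexExpr CExpr = C(i);
  IndexExpr prod  = BExpr * CExpr;
  A() = prod;

  TensorVar B_new("B_new", Type(Float64, {(size_t) N}), taco::dense);
  TensorVar C_new("C_new", Type(Float64, {(size_t) N}), taco::dense);
  TensorVar precomputed("precomputed", Type(Float64, {(size_t) N}), taco::dense);

  IndexStmt stmt = A.getAssignment().concretize();
  // Bound and split the reduction loop into 32-wide tiles before precomputing.
  stmt = stmt.bound(i, i_bounded, (size_t) N, BoundType::MaxExact)
             .split(i_bounded, i0, i1, 32);
  // Stage one partial result per outer tile index i0, and stage the B and C
  // operands over the inner loop i1, mirroring tile_dotProduct_3.
  stmt = stmt.precompute(prod, i0, i0, precomputed);
  stmt = stmt.precompute(BExpr, i1, i1, B_new)
             .precompute(CExpr, i1, i1, C_new);
  stmt = stmt.concretize();

  A.compile(stmt);
  A.assemble();
  A.compute();
}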