From bd36277648ddd6a0c8ee4fb371f5419baa14abc1 Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Wed, 14 Oct 2020 14:51:00 -0700 Subject: [PATCH 01/14] Add in hoisted workspace reuse and remove guard for divisible bound and split --- include/taco/index_notation/index_notation.h | 4 + include/taco/lower/lowerer_impl.h | 9 +- src/index_notation/index_notation.cpp | 16 ++ src/lower/lower.cpp | 1 + src/lower/lowerer_impl.cpp | 85 +++++++-- test/tests-scheduling.cpp | 15 +- test/tests-workspaces.cpp | 186 +++++++++++++++++++ 7 files changed, 291 insertions(+), 25 deletions(-) create mode 100644 test/tests-workspaces.cpp diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h index ebc710e28..7d408f027 100644 --- a/include/taco/index_notation/index_notation.h +++ b/include/taco/index_notation/index_notation.h @@ -949,6 +949,10 @@ std::vector getArguments(IndexStmt stmt); /// Returns the temporaries in the index statement, in the order they appear. std::vector getTemporaries(IndexStmt stmt); +// [Olivia] +/// Returns the temporaries in the index statement, in the order they appear. +std::map getTemporaryLocations(IndexStmt stmt); + /// Returns the tensors in the index statement. std::vector getTensorVars(IndexStmt stmt); diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index 62ac9e5fe..a3ffd6173 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -333,17 +333,19 @@ class LowererImpl : public util::Uncopyable { ir::Stmt codeToInitializeIteratorVars(std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coord, IndexVar coordinateVar); ir::Stmt codeToInitializeIteratorVar(Iterator iterator, std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coordinate, IndexVar coordinateVar); + /// Initializes a temporary workspace + std::vector codeToInitializeTemporary(Where where); /// Recovers a derived indexvar from an underived variable. ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl); - /// Conditionally increment iterator position variables. + /// Conditionally increment iterator position variables. ir::Stmt codeToIncIteratorVars(ir::Expr coordinate, IndexVar coordinateVar, std::vector iterators, std::vector mergers); ir::Stmt codeToLoadCoordinatesFromPosIterators(std::vector iterators, bool declVars); - /// Create statements to append coordinate to result modes. + /// Create statements to append coordinate to result modes. ir::Stmt appendCoordinate(std::vector appenders, ir::Expr coord); /// Create statements to append positions to result modes. 
@@ -363,6 +365,9 @@ class LowererImpl : public util::Uncopyable { int markAssignsAtomicDepth = 0; ParallelUnit atomicParallelUnit; + /// Map used to hoist temporary workspace initialization + std::map temporaryInitialization; + /// Map from tensor variables in index notation to variables in the IR std::map tensorVars; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 1f857a5fc..e9fa934a4 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2332,6 +2332,22 @@ vector getArguments(IndexStmt stmt) { return result; } +std::map getTemporaryLocations(IndexStmt stmt) { + map temporaryLocs; + Forall f = Forall(); + match(stmt, + function([&](const ForallNode* op, Matcher* ctx) { + f = op; + ctx->match(op->stmt); + }), + function([&](const WhereNode* w, Matcher* ctx) { + if (!(f == IndexStmt())) + temporaryLocs.insert({f, Where(w)}); + }) + ); + return temporaryLocs; +} + std::vector getTemporaries(IndexStmt stmt) { vector temporaries; bool firstAssignment = true; diff --git a/src/lower/lower.cpp b/src/lower/lower.cpp index 86389ac1d..e24406543 100644 --- a/src/lower/lower.cpp +++ b/src/lower/lower.cpp @@ -12,6 +12,7 @@ #include "taco/ir/ir.h" #include "taco/ir/simplify.h" #include "ir/ir_generators.h" +#include "taco/ir/ir_printer.h" #include "taco/lower/lowerer_impl.h" #include "taco/lower/iterator.h" diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 7a0af13b3..2297f73c3 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -118,6 +118,9 @@ LowererImpl::lower(IndexStmt stmt, string name, vector arguments = getArguments(stmt); vector temporaries = getTemporaries(stmt); + // Create datastructure needed for temporary workspace hoisting/reuse + temporaryInitialization = getTemporaryLocations(stmt); + // Convert tensor results and arguments IR variables map resultVars; vector resultsIR = createVars(results, &resultVars, unpack); @@ -382,11 +385,29 @@ Stmt LowererImpl::lowerForall(Forall forall) taco_iassert(indexVarToExprMap.count(varToRecover)); recoverySteps.push_back(VarDecl::make(indexVarToExprMap[varToRecover], recoveredValue)); // place underived guard + std::vector iterBounds = provGraph.deriveIterBounds(varToRecover, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); if (forallNeedsUnderivedGuards && underivedBounds.count(varToRecover) && !provGraph.hasPosDescendant(varToRecover)) { - Stmt guard = IfThenElse::make(Gte::make(indexVarToExprMap[varToRecover], underivedBounds[varToRecover][1]), - Break::make()); - recoverySteps.push_back(guard); + + // FIXME: [Olivia] Check this with someone + // Removed underived guard if indexVar is bounded is divisible by its split child indexVar + vector children = provGraph.getChildren(varToRecover); + bool hasDirectDivBound = false; + std::vector iterBoundsInner = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + + for (auto& c: children) { + if (provGraph.hasExactBound(c) && provGraph.derivationPath(varToRecover, c).size() == 2) { + std::vector iterBoundsUnderivedChild = provGraph.deriveIterBounds(c, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + if (iterBoundsUnderivedChild[1].as()->getValue() % iterBoundsInner[1].as()->getValue() == 0) + hasDirectDivBound = true; + break; + } + } + if (!hasDirectDivBound) { + Stmt guard = IfThenElse::make(Gte::make(indexVarToExprMap[varToRecover], 
underivedBounds[varToRecover][1]), + Break::make()); + recoverySteps.push_back(guard); + } } } Stmt recoveryStmt = Block::make(recoverySteps); @@ -414,6 +435,12 @@ Stmt LowererImpl::lowerForall(Forall forall) getArgumentAccesses(forall), reducedAccesses); + // Emit temporary initialization if forall is sequential and leads to a where statement + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + auto temp = temporaryInitialization.find(forall); + if (temp != temporaryInitialization.end() && forall.getParallelUnit() == ParallelUnit::NotParallel && !isScalar(temp->second.getTemporary().getType())) + temporaryValuesInitFree = codeToInitializeTemporary(temp->second); + Stmt loops; // Emit a loop that iterates over over a single iterator (optimization) if (lattice.iterators().size() == 1 && lattice.iterators()[0].isUnique()) { @@ -491,7 +518,9 @@ Stmt LowererImpl::lowerForall(Forall forall) parallelUnitSizes.erase(forall.getParallelUnit()); } return Block::blanks(preInitValues, - loops); + temporaryValuesInitFree[0], + loops, + temporaryValuesInitFree[1]); } Stmt LowererImpl::lowerForallCloned(Forall forall) { @@ -1272,39 +1301,36 @@ Stmt LowererImpl::lowerForallBody(Expr coordinate, IndexStmt stmt, appendCoords); } - -Stmt LowererImpl::lowerWhere(Where where) { +vector LowererImpl::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); - // Declare and initialize the where statement's temporary - Stmt initializeTemporary = Stmt(); Stmt freeTemporary = Stmt(); + Stmt initializeTemporary = Stmt(); if (isScalar(temporary.getType())) { initializeTemporary = defineScalarVariable(temporary, true); - } - else { + } else { if (generateComputeCode()) { Expr values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " << temporary.getType().getOrder(); // TODO + taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " + << temporary.getType().getOrder(); // TODO Dimension temporarySize = temporary.getType().getShape().getDimension(0); Expr size; if (temporarySize.isFixed()) { size = ir::Literal::make(temporarySize.getSize()); - } - else if (temporarySize.isIndexVarSized()) { + } else if (temporarySize.isIndexVarSized()) { IndexVar var = temporarySize.getIndexVarSize(); - vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); + vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, + indexVarToExprMap, iterators); size = ir::Sub::make(bounds[1], bounds[0]); - } - else { + } else { taco_ierror; // TODO } // no decl needed for shared memory Stmt decl = Stmt(); - if((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { + if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); } Stmt allocate = Allocate::make(values, size); @@ -1313,17 +1339,36 @@ Stmt LowererImpl::lowerWhere(Where where) { Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); - freeTemporary = Free::make(values); - /// Make a struct object that lowerAssignment and lowerAccess can read /// temporary value arrays from. 
TemporaryArrays arrays; arrays.values = values; this->temporaryArrays.insert({temporary, arrays}); + freeTemporary = Free::make(values); initializeTemporary = Block::make(decl, allocate, zeroInitLoop); } } + return {initializeTemporary, freeTemporary}; +} + +Stmt LowererImpl::lowerWhere(Where where) { + TensorVar temporary = where.getTemporary(); + + // Declare and initialize the where statement's temporary + vector temporaryValuesInitFree = {Stmt(), Stmt()}; + bool temporaryHoisted = false; + for (auto it = temporaryInitialization.begin(); it != temporaryInitialization.end(); ++it) { + if (it->second == where && it->first.getParallelUnit() == ParallelUnit::NotParallel && !isScalar(temporary.getType())) { + temporaryHoisted = true; + } + } + + if (!temporaryHoisted) + temporaryValuesInitFree = codeToInitializeTemporary(where); + + Stmt initializeTemporary = temporaryValuesInitFree[0]; + Stmt freeTemporary = temporaryValuesInitFree[1]; match(where.getConsumer(), std::function([&](const AssignmentNode* op) { @@ -1354,7 +1399,7 @@ Stmt LowererImpl::lowerWhere(Where where) { whereConsumers.pop_back(); whereTemps.pop_back(); whereTempsToResult.erase(where.getTemporary()); - return Block::make(initializeTemporary, producer, markAssignsAtomicDepth > 0 ? capturedLocatePos : ir::Stmt(), consumer, freeTemporary); + return Block::make(initializeTemporary, producer, markAssignsAtomicDepth > 0 ? capturedLocatePos : ir::Stmt(), consumer, freeTemporary); } diff --git a/test/tests-scheduling.cpp b/test/tests-scheduling.cpp index 7adc3ca88..b7ba88ecc 100644 --- a/test/tests-scheduling.cpp +++ b/test/tests-scheduling.cpp @@ -84,6 +84,7 @@ TEST(scheduling, lowerDenseMatrixMul) { } } + cout << "-------PACKING---------" << endl; A.pack(); B.pack(); @@ -96,21 +97,29 @@ TEST(scheduling, lowerDenseMatrixMul) { .split(j, j0, j1, 2) .split(k, k0, k1, 2) .reorder({i0, j0, k0, i1, j1, k1}); + cout << "-------COMPILING---------" << endl; C.compile(stmt); + cout << "-------ASSEMBLING---------" << endl; C.assemble(); + cout << "-------COMPUTING---------" << endl; C.compute(); Tensor expected("expected", {4, 4}, {Dense, Dense}); expected(i, j) = A(i, k) * B(k, j); + IndexStmt expected_stmt = C.getAssignment().concretize(); expected.compile(); expected.assemble(); expected.compute(); ASSERT_TENSOR_EQ(C, expected); - // std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - // ir::Stmt compute = lower(stmt, "compute", false, true); - // codegen->compile(compute, true); + cout << stmt << endl; + + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", true, true); + codegen->compile(compute, true); + ir::Stmt expected_compute = lower(expected_stmt, "compute", false, true); + //codegen->compile(expected_compute, true); } TEST(scheduling, lowerSparseCopy) { diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp new file mode 100644 index 000000000..d2b5157f5 --- /dev/null +++ b/test/tests-workspaces.cpp @@ -0,0 +1,186 @@ +#include +#include +#include +#include "test.h" +#include "test_tensors.h" +#include "taco/tensor.h" +#include "taco/index_notation/index_notation.h" +#include "codegen/codegen.h" +#include "taco/lower/lower.h" + +using namespace taco; + +TEST(workspaces, tile_vecElemMul_NoTail) { + + Tensor A("A", {16}, {Dense}); + Tensor B("B", {16}, {Dense}); + Tensor C("C", {16}, {Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + 
A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); +} + +TEST(workspaces, tile_vecElemMul_Tail1) { + + Tensor A("A", {16}, {Dense}); + Tensor B("B", {16}, {Dense}); + Tensor C("C", {16}, {Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) + .split(i_bounded, i0, i1, 5) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); +} + +TEST(workspaces, tile_vecElemMul_Tail2) { + + Tensor A("A", {17}, {Dense}); + Tensor B("B", {17}, {Dense}); + Tensor C("C", {17}, {Dense}); + + for (int i = 0; i < 17; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + .split(i_bounded, i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {17}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); + +// ir::IRPrinter irp = ir::IRPrinter(cout); +// +// cout << stmt << endl; +// +// std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); +// ir::Stmt compute = lower(stmt, "compute", false, true); +// +// irp.print(compute); +// cout << endl; +// codegen->compile(compute, false); +} + +TEST(workspaces, tile_denseMatMul) { + + Tensor A("A", {16}, {Dense}); + Tensor B("B", {16}, {Dense}); + Tensor C("C", {16}, {Dense}); + + for (int i = 0; i < 16; i++) { + A.insert({i}, (double) i); + B.insert({i}, (double) i); + } + + A.pack(); + B.pack(); + + IndexVar i("i"); + IndexVar i_bounded("i_bounded"); + IndexVar i0("i0"), i1("i1"); + IndexExpr precomputedExpr = B(i) * C(i); + A(i) = precomputedExpr; + + IndexStmt stmt = A.getAssignment().concretize(); + TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) + .split(i_bounded, 
i0, i1, 4) + .precompute(precomputedExpr, i1, i1, precomputed); + + A.compile(stmt.concretize()); + A.assemble(); + A.compute(); + + Tensor expected("expected", {16}, {Dense}); + expected(i) = B(i) * C(i); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(A, expected); + +// ir::IRPrinter irp = ir::IRPrinter(cout); +// +// cout << stmt << endl; +// +// std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); +// ir::Stmt compute = lower(stmt, "compute", false, true); +// +// irp.print(compute); +// cout << endl; +// codegen->compile(compute, false); + +} From e649a67550b718a3a89a15d0ee953e07a404e30c Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Wed, 14 Oct 2020 22:10:39 -0700 Subject: [PATCH 02/14] Fix some workspaces tests --- test/tests-workspaces.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp index d2b5157f5..bd035a8b1 100644 --- a/test/tests-workspaces.cpp +++ b/test/tests-workspaces.cpp @@ -32,11 +32,13 @@ TEST(workspaces, tile_vecElemMul_NoTail) { IndexStmt stmt = A.getAssignment().concretize(); TensorVar precomputed("precomputed", Type(Float64, {Dimension(i1)}), taco::dense); - stmt = stmt.bound(i, i_bounded, 17, BoundType::MaxExact) + stmt = stmt.bound(i, i_bounded, 16, BoundType::MaxExact) .split(i_bounded, i0, i1, 4) .precompute(precomputedExpr, i1, i1, precomputed); - A.compile(stmt.concretize()); + cout << stmt << endl; + + A.compile(stmt); A.assemble(); A.compute(); From 01166afa48c77e0b3565126aec4fe8b5db7ce850 Mon Sep 17 00:00:00 2001 From: Rawn Date: Sun, 29 Nov 2020 21:18:00 -0800 Subject: [PATCH 03/14] Prototypes automatically generating code to to have sparse iteration over a dense workspace --- include/taco/ir/ir.h | 10 +- include/taco/ir/ir_printer.h | 1 + include/taco/ir/ir_rewriter.h | 1 + include/taco/ir/ir_visitor.h | 3 + include/taco/lower/lowerer_impl.h | 29 ++ src/codegen/codegen_c.cpp | 1 + src/index_notation/index_notation_printer.cpp | 7 +- src/ir/ir.cpp | 9 + src/ir/ir_printer.cpp | 16 +- src/ir/ir_rewriter.cpp | 18 ++ src/ir/ir_visitor.cpp | 5 + src/lower/lowerer_impl.cpp | 297 ++++++++++++++++-- 12 files changed, 369 insertions(+), 28 deletions(-) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 1f7e17918..bbb36c12b 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -65,7 +65,8 @@ enum class IRNodeType { BlankLine, Print, GetProperty, - Break + Break, + Sort }; enum class TensorProperty { @@ -725,6 +726,13 @@ struct Break : public StmtNode { static const IRNodeType _type_info = IRNodeType::Break; }; +struct Sort : public StmtNode { + std::vector args; + static Stmt make(std::vector args); + + static const IRNodeType _type_info = IRNodeType::Sort; +}; + /** A print statement. * Takes in a printf-style format string and Exprs to pass * for the values. 
diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 759d21ad3..1c264b7f9 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -68,6 +68,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); virtual void visit(const Print*); virtual void visit(const GetProperty*); + virtual void visit(const Sort*); std::ostream &stream; int indent; diff --git a/include/taco/ir/ir_rewriter.h b/include/taco/ir/ir_rewriter.h index efb9eaf89..81ad43705 100644 --- a/include/taco/ir/ir_rewriter.h +++ b/include/taco/ir/ir_rewriter.h @@ -68,6 +68,7 @@ class IRRewriter : public IRVisitorStrict { virtual void visit(const Break* op); virtual void visit(const Print* op); virtual void visit(const GetProperty* op); + virtual void visit(const Sort *op); }; }} diff --git a/include/taco/ir/ir_visitor.h b/include/taco/ir/ir_visitor.h index f6331035b..810e4f758 100644 --- a/include/taco/ir/ir_visitor.h +++ b/include/taco/ir/ir_visitor.h @@ -48,6 +48,7 @@ struct BlankLine; struct Break; struct Print; struct GetProperty; +struct Sort; /// Extend this class to visit every node in the IR. class IRVisitorStrict { @@ -98,6 +99,7 @@ class IRVisitorStrict { virtual void visit(const Break*) = 0; virtual void visit(const Print*) = 0; virtual void visit(const GetProperty*) = 0; + virtual void visit(const Sort*) = 0; }; @@ -151,6 +153,7 @@ class IRVisitor : public IRVisitorStrict { virtual void visit(const Break* op); virtual void visit(const Print* op); virtual void visit(const GetProperty* op); + virtual void visit(const Sort* op); }; }} diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index a3ffd6173..71783c6ad 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -81,6 +81,16 @@ class LowererImpl : public util::Uncopyable { std::set reducedAccesses, ir::Stmt recoveryStmt); + /// Lower a forall that iterates over all the coordinates in the forall index + /// var's dimension, and locates tensor positions from the locate iterators. + virtual ir::Stmt lowerForallDenseAcceleration(Forall forall, + std::vector locaters, + std::vector inserters, + std::vector appenders, + std::set reducedAccesses, + ir::Stmt recoveryStmt); + + /// Lower a forall that iterates over the coordinates in the iterator, and /// locates tensor positions from the locate iterators. virtual ir::Stmt lowerForallCoordinate(Forall forall, Iterator iterator, @@ -333,9 +343,19 @@ class LowererImpl : public util::Uncopyable { ir::Stmt codeToInitializeIteratorVars(std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coord, IndexVar coordinateVar); ir::Stmt codeToInitializeIteratorVar(Iterator iterator, std::vector iterators, std::vector rangers, std::vector mergers, ir::Expr coordinate, IndexVar coordinateVar); + /// Returns true iff the temporary used in the where statement is dense and sparse iteration over that + /// temporary can be automaticallty supported by the compiler. + bool canAccelerateDenseTemp(Where where); + /// Initializes a temporary workspace std::vector codeToInitializeTemporary(Where where); + /// Gets the size of a temporary tensorVar + ir::Expr getTemporarySize(TensorVar var); + + /// Initializes helper arrays to give dense workspaces sparse acceleration + std::vector codeToInitializeDenseAcceleratorArrays(Where where); + /// Recovers a derived indexvar from an underived variable. 
ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl); @@ -376,6 +396,15 @@ class LowererImpl : public util::Uncopyable { }; std::map temporaryArrays; + /// Map from temporary to indexList var if accelerating dense workspace + std::map tempToIndexList; + + /// Map from temporary to indexListSize if accelerating dense workspace + std::map tempToIndexListSize; + + /// Map from temporary to bitGuard var if accelerating dense workspace + std::map tempToBitGuard; + /// Map from result tensors to variables tracking values array capacity. std::map capacityVars; diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 204aa1e2d..9611d09f5 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -190,6 +190,7 @@ class CodeGen_C::FindVars : public IRVisitor { if (!util::contains(localVars, op->var)) { localVars.push_back(op->var); } + op->var.accept(this); op->rhs.accept(this); } diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 58305077e..ba633731d 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -81,7 +81,12 @@ void IndexNotationPrinter::visit(const NegNode* op) { Precedence precedence = Precedence::NEG; bool parenthesize = precedence > parentPrecedence; parentPrecedence = precedence; - os << "-"; + if(op->getDataType().isBool()) { + os << "!"; + } else { + os << "-"; + } + if (parenthesize) { os << "("; } diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index e5225f502..a714dddfc 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -817,6 +817,13 @@ Expr GetProperty::make(Expr tensor, TensorProperty property, int mode, return gp; } +// Sort +Stmt Sort::make(std::vector args) { + Sort* sort = new Sort; + sort->args = args; + return sort; +} + // GetProperty Expr GetProperty::make(Expr tensor, TensorProperty property, int mode) { @@ -953,6 +960,8 @@ template<> void StmtNode::accept(IRVisitorStrict *v) const { v->visit((const Print*)this); } template<> void ExprNode::accept(IRVisitorStrict *v) const { v->visit((const GetProperty*)this); } +template<> void StmtNode::accept(IRVisitorStrict *v) + const { v->visit((const Sort*)this); } // printing methods std::ostream& operator<<(std::ostream& os, const Stmt& stmt) { diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index 0fca68786..be654e295 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -131,7 +131,11 @@ void IRPrinter::visit(const Var* op) { } void IRPrinter::visit(const Neg* op) { - stream << "-"; + if(op->type.isBool()) { + stream << "!"; + } else { + stream << "-"; + } parentPrecedence = Precedence::NEG; op->a.accept(this); } @@ -575,6 +579,16 @@ void IRPrinter::visit(const GetProperty* op) { stream << op->name; } +void IRPrinter::visit(const Sort* op) { + doIndent(); + stream << "qsort("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ");"; + stream << endl; +} + + void IRPrinter::resetNameCounters() { // seed the unique names with all C99 keywords // from: http://en.cppreference.com/w/c/keyword diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp index 1a1c91f23..fd1423a00 100644 --- a/src/ir/ir_rewriter.cpp +++ b/src/ir/ir_rewriter.cpp @@ -479,5 +479,23 @@ void IRRewriter::visit(const GetProperty* op) { } } +void IRRewriter::visit(const Sort* op) { + std::vector args; + bool rewritten = false; + for (auto& arg : op->args) { + Expr rewrittenArg = rewrite(arg); + 
args.push_back(rewrittenArg); + if (rewrittenArg != arg) { + rewritten = true; + } + } + if (rewritten) { + stmt = Sort::make(args); + } + else { + stmt = op; + } +} + }} diff --git a/src/ir/ir_visitor.cpp b/src/ir/ir_visitor.cpp index 8a1baf6cf..19fbfbfdf 100644 --- a/src/ir/ir_visitor.cpp +++ b/src/ir/ir_visitor.cpp @@ -236,5 +236,10 @@ void IRVisitor::visit(const Print* op) { e.accept(this); } +void IRVisitor::visit(const Sort* op) { + for (auto e: op->args) + e.accept(this); +} + } // namespace ir } // namespace taco diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 2297f73c3..a4f05a879 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -267,10 +267,11 @@ LowererImpl::lower(IndexStmt stmt, string name, Stmt LowererImpl::lowerAssignment(Assignment assignment) { TensorVar result = assignment.getLhs().getTensorVar(); + Stmt computeStmt; + Expr rhs = lower(assignment.getRhs()); if (generateComputeCode()) { Expr var = getTensorVar(result); - Expr rhs = lower(assignment.getRhs()); // Assignment to scalar variables. if (isScalar(result.getType())) { @@ -288,7 +289,6 @@ Stmt LowererImpl::lowerAssignment(Assignment assignment) Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); - Stmt computeStmt; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs, markAssignsAtomicDepth > 0, atomicParallelUnit); } @@ -296,13 +296,46 @@ Stmt LowererImpl::lowerAssignment(Assignment assignment) computeStmt = compoundStore(values, loc, rhs, markAssignsAtomicDepth > 0, atomicParallelUnit); } taco_iassert(computeStmt.defined()); - return computeStmt; } } - // We're only assembling so defer allocating value memory to the end when - // we'll know exactly how much we need. - else if (generateAssembleCode()) { - // TODO + // TODO: If only assembling so defer allocating value memory to the end when + // we'll know exactly how much we need. 
+ if (generateAssembleCode() || generateComputeCode()) { + + bool temporaryWithSparseAcceleration = util::contains(tempToIndexList, result); + if(generateComputeCode() && !temporaryWithSparseAcceleration) { + taco_iassert(computeStmt.defined()); + return computeStmt; + } + + if(temporaryWithSparseAcceleration) { + Expr values = getValuesArray(result); + Expr loc = generateValueLocExpr(assignment.getLhs()); + Stmt initialStorage = computeStmt; + if(assignment.getOperator().defined()) { + // computeStmt is a compund stmt so we need to emit an initial store into the temporary + initialStorage = Store::make(values, loc, rhs, markAssignsAtomicDepth > 0, atomicParallelUnit); + } + + Expr bitGuardArr = tempToBitGuard.at(result); + Expr indexList = tempToIndexList.at(result); + Expr indexListSize = tempToIndexListSize.at(result); + + Stmt markBitGuardAsTrue = Store::make(bitGuardArr, loc, ir::Literal::make(true), markAssignsAtomicDepth > 0, atomicParallelUnit); + Stmt trackIndex = Store::make(indexList, indexListSize, loc, markAssignsAtomicDepth > 0, atomicParallelUnit); + Expr incrementSize = ir::Add::make(indexListSize, ir::Literal::make(1)); + Stmt incrementStmt = Assign::make(indexListSize, incrementSize, markAssignsAtomicDepth > 0, atomicParallelUnit); + + Stmt firstWriteAtIndex = Block::make(initialStorage, trackIndex, markBitGuardAsTrue, incrementStmt); + if(!generateComputeCode()) { + firstWriteAtIndex = Block::make(trackIndex, markBitGuardAsTrue, incrementStmt); + } + + Expr readBitGuard = Load::make(bitGuardArr, loc); + Stmt finalStmt = IfThenElse::make(ir::Neg::make(readBitGuard), firstWriteAtIndex, computeStmt); + return finalStmt; + } + return Stmt(); } // We're neither assembling or computing so we emit nothing. @@ -473,10 +506,27 @@ Stmt LowererImpl::lowerForall(Forall forall) } } } + + // For now, this only works when consuming a single workspace. + bool canAccelWithSparseIteration = inParallelLoopDepth == 0 && provGraph.isFullyDerived(iterator.getIndexVar()); + if (canAccelWithSparseIteration && iterator.isDimensionIterator() && locators.size() == 1) { + // We are iterating over a dimension and locating into a temporary with a tracker to keep indices. Instead, we + // can just iterate over the indices and locate into the dense workspace. 
+ for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { + if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { + canAccelWithSparseIteration = true; + break; + } + } + } + if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } + else if (canAccelWithSparseIteration) { + loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); + } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) { loops = lowerForallDimension(forall, point.locators(), @@ -853,6 +903,64 @@ Stmt LowererImpl::lowerForallDimension(Forall forall, posAppend); } + Stmt LowererImpl::lowerForallDenseAcceleration(Forall forall, + vector locators, + vector inserters, + vector appenders, + set reducedAccesses, + ir::Stmt recoveryStmt) + { + taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; + taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; + taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; + + + TensorVar var; + for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { + if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { + var = it->first; + break; + } + } + + Expr indexList = tempToIndexList.at(var); + Expr indexListSize = tempToIndexListSize.at(var); + Expr bitGuard = tempToBitGuard.at(var); + Expr loopVar = ir::Var::make(var.getName() + "_index_locator", taco::Int32, false, false); + Expr coordinate = getCoordinateVar(forall.getIndexVar()); + + if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { + markAssignsAtomicDepth++; + atomicParallelUnit = forall.getParallelUnit(); + } + + Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); + Stmt resetGuard = ir::Store::make(bitGuard, loopVar, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); + body = Block::make(declareVar, body, resetGuard); + + if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { + markAssignsAtomicDepth--; + } + + body = Block::make({recoveryStmt, body}); + + Stmt posAppend = generateAppendPositions(appenders); + + LoopKind kind = LoopKind::Serial; + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Vectorized; + } + else if (forall.getParallelUnit() != ParallelUnit::NotParallel + && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { + kind = LoopKind::Runtime; + } + + return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, + ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), + ignoreVectorize ? 
0 : forall.getUnrollFactor()), + posAppend); + } Stmt LowererImpl::lowerForallCoordinate(Forall forall, Iterator iterator, vector locators, @@ -1301,32 +1409,156 @@ Stmt LowererImpl::lowerForallBody(Expr coordinate, IndexStmt stmt, appendCoords); } +Expr LowererImpl::getTemporarySize(TensorVar temporary) { + Dimension temporarySize = temporary.getType().getShape().getDimension(0); + + if (temporarySize.isFixed()) { + return ir::Literal::make(temporarySize.getSize()); + } + + if (temporarySize.isIndexVarSized()) { + IndexVar var = temporarySize.getIndexVarSize(); + vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, + indexVarToExprMap, iterators); + return ir::Sub::make(bounds[1], bounds[0]); + } + + taco_ierror; // TODO + return Expr(); +} + +vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { + TensorVar temporary = where.getTemporary(); + + // TODO: emit as uint64 and manually emit bit pack code + const Datatype bitGuardType = taco::Bool; + const std::string bitGuardName = temporary.getName() + "_already_set"; + const Expr bitGuardSize = getTemporarySize(temporary); + const Expr alreadySetArr = ir::Var::make(bitGuardName, + bitGuardType, + true, false); + + // TODO: TACO should probably keep state on if it can use int32 or if it should switch to + // using int64 for indices. This assumption is made in other places of taco. + const Datatype indexListType = taco::Int32; + const std::string indexListName = temporary.getName() + "_index_list"; + const Expr indexListArr = ir::Var::make(indexListName, + indexListType, + true, false); + + // no decl for shared memory + Stmt alreadySetDecl = Stmt(); + Stmt indexListDecl = Stmt(); + const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); + const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); + Stmt freeTemps = Block::make(Free::make(indexListArr), Free::make(alreadySetArr)); + if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { + alreadySetDecl = VarDecl::make(alreadySetArr, ir::Literal::make(0)); + indexListDecl = VarDecl::make(indexListArr, ir::Literal::make(0)); + } + + tempToIndexList[temporary] = indexListArr; + tempToIndexListSize[temporary] = indexListSizeExpr; + tempToBitGuard[temporary] = alreadySetArr; + + Stmt allocateIndexList = Allocate::make(indexListArr, bitGuardSize); + if(should_use_CUDA_codegen()) { + Stmt allocateAlreadySet = Allocate::make(alreadySetArr, bitGuardSize); + Expr p = Var::make("p" + temporary.getName(), Int()); + Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + + Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); + Stmt inits = Block::make(indexListSizeDecl, alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); + return {inits, freeTemps}; + } else { + Expr sizeOfElt = Sizeof::make(bitGuardType); + Expr callocAlreadySet = ir::Call::make("calloc", {bitGuardSize, sizeOfElt}, Int()); + Stmt allocateAlreadySet = VarDecl::make(alreadySetArr, callocAlreadySet); + Stmt inits = Block::make(indexListSizeDecl, indexListDecl, allocateIndexList, allocateAlreadySet); + return {inits, freeTemps}; + } + +} + +// Returns true if the following conditions are met: +// 1) The temporary is a dense vector +// 2) There is only one value on the right hand side of the consumer +// -- We would need to handle sparse acceleration in the merge lattices for multiple 
operands on the RHS +// 3) There are no reduced accesses +// 4) The left hand side of the where consumer is sparse +// 5) CPU Code is being generated (TEMPORARY - This should be removed) +// -- The sorting calls and calloc call in lower where are CPU specific. We could map calloc to a cudaMalloc +// and use a library like CUB to emit the sort. CUB support is built into CUDA 11 but not prior versions +// of CUDA so in that case, we'd probably need to include the CUB headers in the generated code. +bool LowererImpl::canAccelerateDenseTemp(Where where) { + TensorVar temporary = where.getTemporary(); + // (1) Temporary is dense vector + if(!isDense(temporary.getFormat()) || temporary.getOrder() == 1) return false; + + vector inputAccesses, resultAccesses; + set reducedAccesses; + + inputAccesses = getArgumentAccesses(where.getConsumer()); + // (2) Multiple operands in inputs (need lattice to reason about iteration) + if(inputAccesses.size() > 1 || inputAccesses.empty()) return false; + + std::tie(resultAccesses, reducedAccesses) = getResultAccesses(where.getConsumer()); + // (3) Contains reduced accesses + if(!reducedAccesses.empty()) return false; + + // no or multiple results? + if(resultAccesses.size() > 1 || resultAccesses.empty()) return false; + + // (4) Level of result is sparse + // No check for size of tempVar since we enforced the temporary is a vector and if there is only one RHS value, + // it must (should?) be the temporary + std::vector tempVar = inputAccesses[0].getIndexVars(); + + // Get vars in result. + std::vector resultVars = resultAccesses[0].getIndexVars(); + auto it = std::find(resultVars.begin(), resultVars.end(), tempVar[0]); + int index = it != resultVars.end()? (int)(it - resultVars.begin()) + 1: -1; + + // Var used in input is not in result? Probably would fail earlier but here just in case. + if(index == -1) return false; + + int modeIndex = resultAccesses[0].getTensorVar().getFormat().getModeOrdering()[index]; + ModeFormat varFmt = resultAccesses[0].getTensorVar().getFormat().getModeFormats()[modeIndex]; + + // Actual check for condition (4). 
If the current mode is full, no optimizations necessary + if(varFmt.isFull()) return false; + + // TODO: TEMPORARY -- Needs to be removed + if(should_use_CUDA_codegen()) return false; + + return true; +} + vector LowererImpl::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); + bool accelerateDense = canAccelerateDenseTemp(where); + Stmt freeTemporary = Stmt(); Stmt initializeTemporary = Stmt(); if (isScalar(temporary.getType())) { initializeTemporary = defineScalarVariable(temporary, true); } else { + // When emitting code to accelerate dense workspaces with sparse iteration, we need the following arrays + // to construct the result indices + if(accelerateDense) { + vector initAndFree = codeToInitializeDenseAcceleratorArrays(where); + initializeTemporary = initAndFree[0]; + freeTemporary = initAndFree[1]; + } + if (generateComputeCode()) { Expr values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " << temporary.getType().getOrder(); // TODO - Dimension temporarySize = temporary.getType().getShape().getDimension(0); - Expr size; - if (temporarySize.isFixed()) { - size = ir::Literal::make(temporarySize.getSize()); - } else if (temporarySize.isIndexVarSized()) { - IndexVar var = temporarySize.getIndexVarSize(); - vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, - indexVarToExprMap, iterators); - size = ir::Sub::make(bounds[1], bounds[0]); - } else { - taco_ierror; // TODO - } + Expr size = getTemporarySize(temporary); // no decl needed for shared memory Stmt decl = Stmt(); @@ -1334,10 +1566,14 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { decl = VarDecl::make(values, ir::Literal::make(0)); } Stmt allocate = Allocate::make(values, size); - - Expr p = Var::make("p" + temporary.getName(), Int()); - Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + // If we are using acceleration of the dense workspace, we do not need to initialize the values array + // since the bit guard will take care of setting the value array when necessary + Stmt zeroInitLoop = Stmt(); + if(!accelerateDense) { + Expr p = Var::make("p" + temporary.getName(), Int()); + Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + } /// Make a struct object that lowerAssignment and lowerAccess can read /// temporary value arrays from. 
@@ -1345,8 +1581,8 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { arrays.values = values; this->temporaryArrays.insert({temporary, arrays}); - freeTemporary = Free::make(values); - initializeTemporary = Block::make(decl, allocate, zeroInitLoop); + freeTemporary = Block::make(freeTemporary, Free::make(values)); + initializeTemporary = Block::make(decl, initializeTemporary, allocate, zeroInitLoop); } } return {initializeTemporary, freeTemporary}; @@ -1354,6 +1590,7 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { Stmt LowererImpl::lowerWhere(Where where) { TensorVar temporary = where.getTemporary(); + bool accelarateDenseWorkSpace = canAccelerateDenseTemp(where); // Declare and initialize the where statement's temporary vector temporaryValuesInitFree = {Stmt(), Stmt()}; @@ -1379,6 +1616,16 @@ Stmt LowererImpl::lowerWhere(Where where) { ); Stmt consumer = lower(where.getConsumer()); + if(accelarateDenseWorkSpace) { + // We need to sort the indices array + Expr listOfIndices = tempToIndexList.at(temporary); + Expr listOfIndicesSize = tempToIndexListSize.at(temporary); + Expr sizeOfElt = ir::Sizeof::make(listOfIndices.type()); + Expr cmpName = ir::Var::make("cmp", Int()); + Stmt sortCall = ir::Sort::make( {listOfIndices, listOfIndicesSize, sizeOfElt, cmpName}); + consumer = Block::make(sortCall, consumer); + } + whereConsumers.push_back(consumer); whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; From 12b51f00790895758f9dbc085ab6e3a2ebb04390 Mon Sep 17 00:00:00 2001 From: Rawn Date: Sat, 5 Dec 2020 20:57:26 -0800 Subject: [PATCH 04/14] Fixes bugs in check for accelerating workspace --- src/lower/lowerer_impl.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index a4f05a879..e338f9c4e 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -508,16 +508,19 @@ Stmt LowererImpl::lowerForall(Forall forall) } // For now, this only works when consuming a single workspace. - bool canAccelWithSparseIteration = inParallelLoopDepth == 0 && provGraph.isFullyDerived(iterator.getIndexVar()); - if (canAccelWithSparseIteration && iterator.isDimensionIterator() && locators.size() == 1) { + bool canAccelWithSparseIteration = inParallelLoopDepth == 0 && provGraph.isFullyDerived(iterator.getIndexVar()) && + iterator.isDimensionIterator() && locators.size() == 1; + if (canAccelWithSparseIteration) { + bool indexListsExist = false; // We are iterating over a dimension and locating into a temporary with a tracker to keep indices. Instead, we // can just iterate over the indices and locate into the dense workspace. 
for (auto it = tensorVars.begin(); it != tensorVars.end(); ++it) { if (it->second == locators[0].getTensor() && util::contains(tempToIndexList, it->first)) { - canAccelWithSparseIteration = true; + indexListsExist = true; break; } } + canAccelWithSparseIteration &= indexListsExist; } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { @@ -1493,7 +1496,7 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { bool LowererImpl::canAccelerateDenseTemp(Where where) { TensorVar temporary = where.getTemporary(); // (1) Temporary is dense vector - if(!isDense(temporary.getFormat()) || temporary.getOrder() == 1) return false; + if(!isDense(temporary.getFormat()) || temporary.getOrder() != 1) return false; vector inputAccesses, resultAccesses; set reducedAccesses; @@ -1517,7 +1520,7 @@ bool LowererImpl::canAccelerateDenseTemp(Where where) { // Get vars in result. std::vector resultVars = resultAccesses[0].getIndexVars(); auto it = std::find(resultVars.begin(), resultVars.end(), tempVar[0]); - int index = it != resultVars.end()? (int)(it - resultVars.begin()) + 1: -1; + int index = it != resultVars.end()? (int)(it - resultVars.begin()): -1; // Var used in input is not in result? Probably would fail earlier but here just in case. if(index == -1) return false; From c8972c052fef31016a42f478d5c55198922ec84e Mon Sep 17 00:00:00 2001 From: Rawn Date: Sat, 5 Dec 2020 22:30:58 -0800 Subject: [PATCH 05/14] Fixes bug in concreteNotation check. All workspace tests pass. --- src/index_notation/index_notation.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 3271e3bda..9c605684c 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2118,8 +2118,23 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return; } + // Handles derived vars on RHS with underived vars on LHS. 
+ Assignment assignPtrWrapper = Assignment(op); + std::vector possibleReductionVars = assignPtrWrapper.getReductionVars(); + std::vector freeVars = assignPtrWrapper.getFreeVars(); + std::set freeVarsSet(freeVars.begin(), freeVars.end()); + + int numReductionVars = 0; + for(const auto& reductionVar : possibleReductionVars) { + std::vector underivedParents = provGraph.getUnderivedAncestors(reductionVar); + for(const auto& parent : underivedParents) { + if(!util::contains(freeVarsSet, parent)) { + ++numReductionVars; + } + } + } // allow introducing precompute loops where we set a temporary to values instead of += - if (Assignment(op).getReductionVars().size() > 0 && + if (numReductionVars > 0 && op->op == IndexExpr() && !inWhereProducer) { *reason = "reduction variables in concrete notation must be dominated " "by compound assignments (such as +=)"; From 4895917f210b93729a4c12b0f70044a90688c9e5 Mon Sep 17 00:00:00 2001 From: Rawn Date: Sat, 5 Dec 2020 23:30:17 -0800 Subject: [PATCH 06/14] Removes print statements --- test/tests-scheduling.cpp | 15 +++------------ test/tests-workspaces.cpp | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/test/tests-scheduling.cpp b/test/tests-scheduling.cpp index b7ba88ecc..7adc3ca88 100644 --- a/test/tests-scheduling.cpp +++ b/test/tests-scheduling.cpp @@ -84,7 +84,6 @@ TEST(scheduling, lowerDenseMatrixMul) { } } - cout << "-------PACKING---------" << endl; A.pack(); B.pack(); @@ -97,29 +96,21 @@ TEST(scheduling, lowerDenseMatrixMul) { .split(j, j0, j1, 2) .split(k, k0, k1, 2) .reorder({i0, j0, k0, i1, j1, k1}); - cout << "-------COMPILING---------" << endl; C.compile(stmt); - cout << "-------ASSEMBLING---------" << endl; C.assemble(); - cout << "-------COMPUTING---------" << endl; C.compute(); Tensor expected("expected", {4, 4}, {Dense, Dense}); expected(i, j) = A(i, k) * B(k, j); - IndexStmt expected_stmt = C.getAssignment().concretize(); expected.compile(); expected.assemble(); expected.compute(); ASSERT_TENSOR_EQ(C, expected); - cout << stmt << endl; - - std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", true, true); - codegen->compile(compute, true); - ir::Stmt expected_compute = lower(expected_stmt, "compute", false, true); - //codegen->compile(expected_compute, true); + // std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + // ir::Stmt compute = lower(stmt, "compute", false, true); + // codegen->compile(compute, true); } TEST(scheduling, lowerSparseCopy) { diff --git a/test/tests-workspaces.cpp b/test/tests-workspaces.cpp index bd035a8b1..a6af48f06 100644 --- a/test/tests-workspaces.cpp +++ b/test/tests-workspaces.cpp @@ -36,7 +36,7 @@ TEST(workspaces, tile_vecElemMul_NoTail) { .split(i_bounded, i0, i1, 4) .precompute(precomputedExpr, i1, i1, precomputed); - cout << stmt << endl; +// cout << stmt << endl; A.compile(stmt); A.assemble(); From 1c37ebd4a0cb1e9098937ed0d03150c19bf17f8a Mon Sep 17 00:00:00 2001 From: Rawn Date: Sun, 6 Dec 2020 15:06:51 -0800 Subject: [PATCH 07/14] Only hoists out malloc + free from where statement when possible. Emits loop to zero every element in a temporary when it is hoisted before the producer is called. 
Changes the codegens to keep pointer names constant --- src/codegen/codegen_c.cpp | 2 +- src/codegen/codegen_cuda.cpp | 2 +- src/lower/lowerer_impl.cpp | 27 ++++++++++++++++++--------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 9611d09f5..a0c6c9591 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -182,7 +182,7 @@ class CodeGen_C::FindVars : public IRVisitor { virtual void visit(const Var *op) { if (varMap.count(op) == 0) { - varMap[op] = codeGen->genUniqueName(op->name); + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); } } diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 5eb57c7ad..d19cac605 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -240,7 +240,7 @@ class CodeGen_CUDA::FindVars : public IRVisitor { virtual void visit(const Var *op) { if (varMap.count(op) == 0 && !inBlock) { - varMap[op] = codeGen->genUniqueName(op->name); + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); } } diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index e338f9c4e..bf3254375 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1569,14 +1569,6 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { decl = VarDecl::make(values, ir::Literal::make(0)); } Stmt allocate = Allocate::make(values, size); - // If we are using acceleration of the dense workspace, we do not need to initialize the values array - // since the bit guard will take care of setting the value array when necessary - Stmt zeroInitLoop = Stmt(); - if(!accelerateDense) { - Expr p = Var::make("p" + temporary.getName(), Int()); - Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); - } /// Make a struct object that lowerAssignment and lowerAccess can read /// temporary value arrays from. @@ -1585,7 +1577,7 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { this->temporaryArrays.insert({temporary, arrays}); freeTemporary = Block::make(freeTemporary, Free::make(values)); - initializeTemporary = Block::make(decl, initializeTemporary, allocate, zeroInitLoop); + initializeTemporary = Block::make(decl, initializeTemporary, allocate); } } return {initializeTemporary, freeTemporary}; @@ -1629,6 +1621,23 @@ Stmt LowererImpl::lowerWhere(Where where) { consumer = Block::make(sortCall, consumer); } + // Now that temporary allocations are hoisted, we always need to emit an initialization loop before entering the + // producer. + if(generateComputeCode() && !isScalar(temporary.getType())) { + // TODO: We only actually need to do this if: + // 1) We use the temporary multiple times + // 2) The PRODUCER RHS is sparse(not full). 
(Guarantees that old values are overwritten before consuming) + + Expr p = Var::make("p" + temporary.getName(), Int()); + Expr values = ir::Var::make(temporary.getName(), + temporary.getType().getDataType(), + true, false); + Expr size = getTemporarySize(temporary); + Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); + initializeTemporary = Block::make(initializeTemporary, loopInit); + } + whereConsumers.push_back(consumer); whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; From 9b1450cd29bb28cb8f9829ceae809c804fcb8f96 Mon Sep 17 00:00:00 2001 From: Rawn Date: Wed, 23 Dec 2020 14:45:57 -0800 Subject: [PATCH 08/14] Removes initialization loop from before producer when accelerating a dense workspace --- src/lower/lowerer_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index bf3254375..92d0e61ce 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1622,8 +1622,8 @@ Stmt LowererImpl::lowerWhere(Where where) { } // Now that temporary allocations are hoisted, we always need to emit an initialization loop before entering the - // producer. - if(generateComputeCode() && !isScalar(temporary.getType())) { + // producer but only if there is no dense acceleration + if(generateComputeCode() && !isScalar(temporary.getType()) && !accelarateDenseWorkSpace) { // TODO: We only actually need to do this if: // 1) We use the temporary multiple times // 2) The PRODUCER RHS is sparse(not full). (Guarantees that old values are overwritten before consuming) From dd795fcdae75d736601f581722fcff43426eabb0 Mon Sep 17 00:00:00 2001 From: Rawn Date: Wed, 23 Dec 2020 18:48:37 -0800 Subject: [PATCH 09/14] Places index list size above the producer loop when accelerating a dense workspace. 
This should make the transition to multithreading easier and fixes a bug in the original code --- src/lower/lowerer_impl.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 92d0e61ce..e85a20d5c 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1453,7 +1453,6 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { Stmt alreadySetDecl = Stmt(); Stmt indexListDecl = Stmt(); const Expr indexListSizeExpr = ir::Var::make(indexListName + "_size", taco::Int32, false, false); - const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); Stmt freeTemps = Block::make(Free::make(indexListArr), Free::make(alreadySetArr)); if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { alreadySetDecl = VarDecl::make(alreadySetArr, ir::Literal::make(0)); @@ -1471,13 +1470,13 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); - Stmt inits = Block::make(indexListSizeDecl, alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); + Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; } else { Expr sizeOfElt = Sizeof::make(bitGuardType); Expr callocAlreadySet = ir::Call::make("calloc", {bitGuardSize, sizeOfElt}, Int()); Stmt allocateAlreadySet = VarDecl::make(alreadySetArr, callocAlreadySet); - Stmt inits = Block::make(indexListSizeDecl, indexListDecl, allocateIndexList, allocateAlreadySet); + Stmt inits = Block::make(indexListDecl, allocateIndexList, allocateAlreadySet); return {inits, freeTemps}; } @@ -1650,6 +1649,11 @@ Stmt LowererImpl::lowerWhere(Where where) { } Stmt producer = lower(where.getProducer()); + if(accelarateDenseWorkSpace) { + const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); + const Stmt indexListSizeDecl = VarDecl::make(indexListSizeExpr, ir::Literal::make(0)); + initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); + } if (restoreAtomicDepth) { markAssignsAtomicDepth++; From ff84784ec225134179c59b27b08cad9d09b1eb02 Mon Sep 17 00:00:00 2001 From: Rawn Date: Wed, 23 Dec 2020 21:09:00 -0800 Subject: [PATCH 10/14] Fixes workspace reset --- src/lower/lowerer_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index e85a20d5c..10478c55b 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -939,7 +939,7 @@ Stmt LowererImpl::lowerForallDimension(Forall forall, Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); - Stmt resetGuard = ir::Store::make(bitGuard, loopVar, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); + Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); body = Block::make(declareVar, body, resetGuard); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { From d5721d70b0d8f2311d5c1e9c501936accfaf4909 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 
24 Dec 2020 00:12:04 -0800 Subject: [PATCH 11/14] If underived variables are used to index a workspace, we allocate space for the workspace based on the sizes of the input tensors --- include/taco/lower/lowerer_impl.h | 4 ++-- src/lower/lowerer_impl.cpp | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index 71783c6ad..39675d450 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -350,8 +350,8 @@ class LowererImpl : public util::Uncopyable { /// Initializes a temporary workspace std::vector codeToInitializeTemporary(Where where); - /// Gets the size of a temporary tensorVar - ir::Expr getTemporarySize(TensorVar var); + /// Gets the size of a temporary tensorVar in the where statement + ir::Expr getTemporarySize(Where where); /// Initializes helper arrays to give dense workspaces sparse acceleration std::vector codeToInitializeDenseAcceleratorArrays(Where where); diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 10478c55b..2be37d0f4 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1412,8 +1412,22 @@ Stmt LowererImpl::lowerForallBody(Expr coordinate, IndexStmt stmt, appendCoords); } -Expr LowererImpl::getTemporarySize(TensorVar temporary) { +Expr LowererImpl::getTemporarySize(Where where) { + TensorVar temporary = where.getTemporary(); Dimension temporarySize = temporary.getType().getShape().getDimension(0); + Access temporaryAccess = getResultAccesses(where.getProducer()).first[0]; + std::vector indexVars = temporaryAccess.getIndexVars(); + + if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) { + // If all index vars are underived, use tensor properties to get the tensor size + taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0]; + ir::Expr size = dimensions.at(indexVars[0]); + for(size_t i = 1; i < indexVars.size(); ++i) { + taco_iassert(util::contains(dimensions, indexVars[i])) << "Missing " << indexVars[i]; + size = ir::Mul::make(size, dimensions.at(indexVars[i])); + } + return size; + } if (temporarySize.isFixed()) { return ir::Literal::make(temporarySize.getSize()); @@ -1436,7 +1450,7 @@ vector LowererImpl::codeToInitializeDenseAcceleratorArrays(Where where) { // TODO: emit as uint64 and manually emit bit pack code const Datatype bitGuardType = taco::Bool; const std::string bitGuardName = temporary.getName() + "_already_set"; - const Expr bitGuardSize = getTemporarySize(temporary); + const Expr bitGuardSize = getTemporarySize(where); const Expr alreadySetArr = ir::Var::make(bitGuardName, bitGuardType, true, false); @@ -1560,7 +1574,7 @@ vector LowererImpl::codeToInitializeTemporary(Where where) { true, false); taco_iassert(temporary.getType().getOrder() == 1) << " Temporary order was " << temporary.getType().getOrder(); // TODO - Expr size = getTemporarySize(temporary); + Expr size = getTemporarySize(where); // no decl needed for shared memory Stmt decl = Stmt(); @@ -1631,7 +1645,7 @@ Stmt LowererImpl::lowerWhere(Where where) { Expr values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - Expr size = getTemporarySize(temporary); + Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary =
Block::make(initializeTemporary, loopInit); From 2eb298e355e39ffe013b3aab19c08c9770efb8b5 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 24 Dec 2020 16:29:44 -0800 Subject: [PATCH 12/14] Relaxes requirements for spmm transformation --- src/index_notation/transformations.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 955687f46..1bebe8c6d 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1115,16 +1115,15 @@ static IndexStmt optimizeSpMM(IndexStmt stmt) { } TensorVar B = Baccess.getTensorVar(); - if (B.getFormat().getModeFormats()[0].getName() != "dense" || - B.getFormat().getModeFormats()[1].getName() != "compressed" || - B.getFormat().getModeOrdering()[0] != 0 || + if (B.getFormat().getModeOrdering()[0] != 0 || B.getFormat().getModeOrdering()[1] != 1) { return stmt; } + // We need random access into the first mode or this tensor in order to perform a linear combination of rows + // algorithm. (I think?) TensorVar C = Caccess.getTensorVar(); - if (C.getFormat().getModeFormats()[0].getName() != "dense" || - C.getFormat().getModeFormats()[1].getName() != "compressed" || + if (C.getFormat().getModeFormats()[0].getName() == "compressed" || C.getFormat().getModeOrdering()[0] != 0 || C.getFormat().getModeOrdering()[1] != 1) { return stmt; From 46aed1307438a691a80c37598c1a58124a6ddd91 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 24 Dec 2020 16:39:16 -0800 Subject: [PATCH 13/14] Checks if first mode of last tensor has locate for spmm transform --- src/index_notation/transformations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 1bebe8c6d..f0728ef2b 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1123,7 +1123,7 @@ static IndexStmt optimizeSpMM(IndexStmt stmt) { // We need random access into the first mode or this tensor in order to perform a linear combination of rows // algorithm. (I think?) TensorVar C = Caccess.getTensorVar(); - if (C.getFormat().getModeFormats()[0].getName() == "compressed" || + if (!C.getFormat().getModeFormats()[0].hasLocate() || C.getFormat().getModeOrdering()[0] != 0 || C.getFormat().getModeOrdering()[1] != 1) { return stmt; From 8471869aa3678c1bc777af073a212dbaaa4fd856 Mon Sep 17 00:00:00 2001 From: Rawn Date: Thu, 24 Dec 2020 17:01:12 -0800 Subject: [PATCH 14/14] Changes SPMM transform requirement. Unsure about this --- src/index_notation/transformations.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index f0728ef2b..5310455f6 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1114,16 +1114,19 @@ static IndexStmt optimizeSpMM(IndexStmt stmt) { return stmt; } + // I think we can do a linear combination of rows as long as there are no permutations in the format and the + // level formats are ordered. The i -> k -> j loops should iterate over the data structures without issue.
TensorVar B = Baccess.getTensorVar(); - if (B.getFormat().getModeOrdering()[0] != 0 || + if (!B.getFormat().getModeFormats()[0].isOrdered() || + !B.getFormat().getModeFormats()[1].isOrdered() || + B.getFormat().getModeOrdering()[0] != 0 || B.getFormat().getModeOrdering()[1] != 1) { return stmt; } - // We need random access into the first mode or this tensor in order to perform a linear combination of rows - // algorithm. (I think?) TensorVar C = Caccess.getTensorVar(); - if (!C.getFormat().getModeFormats()[0].hasLocate() || + if (!C.getFormat().getModeFormats()[0].isOrdered() || + !C.getFormat().getModeFormats()[1].isOrdered() || C.getFormat().getModeOrdering()[0] != 0 || C.getFormat().getModeOrdering()[1] != 1) { return stmt;
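
Reviewer note (illustration only, not part of any patch above): to make the combined effect of PATCH 08-11 easier to see, here is a minimal hand-written C++ sketch of the kind of dense-workspace SpMM row kernel this lowering is driving toward, for A(i,j) = B(i,k) * C(k,j) with B and C stored as CSR. All identifiers in the sketch (w, w_already_set, w_index_list, the *_pos/*_crd arrays) and the simplified output assembly are assumptions made for this example; the code TACO actually emits differs (for instance, it may sort the index list before the consumer so that A's coordinates come out ordered).

// Hand-written sketch under the assumptions stated above; not TACO-generated code.
void spmmRow(int i,
             const int* B2_pos, const int* B2_crd, const double* B_vals,
             const int* C2_pos, const int* C2_crd, const double* C_vals,
             double* w,            // dense workspace, one entry per column of C
             bool* w_already_set,  // bit guard that replaces the upfront zero-init loop (PATCH 08)
             int* w_index_list,    // coordinates touched while computing this row
             int* A2_pos, int* A2_crd, double* A_vals) {
  int w_index_list_size = 0;       // declared right before the producer loop (PATCH 09)

  // Producer: row i of A is a linear combination of the rows of C selected by row i of B.
  for (int pB = B2_pos[i]; pB < B2_pos[i + 1]; pB++) {
    int k = B2_crd[pB];
    for (int pC = C2_pos[k]; pC < C2_pos[k + 1]; pC++) {
      int j = C2_crd[pC];
      if (!w_already_set[j]) {
        w[j] = B_vals[pB] * C_vals[pC];   // first touch overwrites, so no zero-init loop is needed
        w_already_set[j] = true;
        w_index_list[w_index_list_size++] = j;
      } else {
        w[j] += B_vals[pB] * C_vals[pC];
      }
    }
  }

  // Consumer: scatter the workspace into row i of A and reset the guard for the next row.
  for (int p = 0; p < w_index_list_size; p++) {
    int j = w_index_list[p];
    A2_crd[A2_pos[i] + p] = j;     // coordinates are appended unsorted in this sketch
    A_vals[A2_pos[i] + p] = w[j];
    w_already_set[j] = false;      // reset indexed by the coordinate j, not the loop variable p (PATCH 10)
  }
  A2_pos[i + 1] = A2_pos[i] + w_index_list_size;  // simplified assembly: rows are filled in order
}

Sizing w and w_already_set by the number of columns of C mirrors what PATCH 11 appears to do when the workspace is indexed by underived variables: the allocation is derived from the input tensors' dimensions rather than from a bounded or split index variable.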