[Mlir-commits] [mlir] 372e793 - [mlir][sparse] support affine expression on sparse dimensions (analysis implementation)
Peiming Liu
llvmlistbot at llvm.org
Tue Nov 22 16:02:53 PST 2022
Author: Peiming Liu
Date: 2022-11-23T00:02:47Z
New Revision: 372e7939d738d69e20db28c4433d4e161ed72397
URL: https://github.com/llvm/llvm-project/commit/372e7939d738d69e20db28c4433d4e161ed72397
DIFF: https://github.com/llvm/llvm-project/commit/372e7939d738d69e20db28c4433d4e161ed72397.diff
LOG: [mlir][sparse] support affine expression on sparse dimensions (analysis implementation)
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D138171
Added:
Modified:
mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 7f9f0e9f0e0d2..60829fe99951c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -148,14 +148,29 @@ struct LatPoint {
/// independently from the basic algorithm if bottlenecks are identified.
class Merger {
public:
- /// Constructs a merger for the given number of tensors and loops. The
- /// user supplies the number of tensors involved in the kernel, with the
- /// last tensor in this set denoting the output tensor. The merger adds an
- /// additional synthetic tensor at the end of this set to represent all
- /// invariant expressions in the kernel.
- Merger(unsigned t, unsigned l)
- : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l),
- hasSparseOut(false),
+ /// Constructs a merger for the given number of tensors, native loops, and
+ /// filter loops. The user supplies the number of tensors involved in the
+ /// kernel, with the last tensor in this set denoting the output tensor. The
+ /// merger adds an additional synthetic tensor at the end of this set to
+ /// represent all invariant expressions in the kernel.
+ /// In addition to the native loops (which are specified by the GenericOp),
+ /// extra filter loops are needed in order to handle affine expressions on
+ /// sparse dimensions. E.g., for the affine map (d0, d1, d2) => (d0 + d1, d2),
+ /// a naive implementation of the filter loop could be generated as:
+ ///   for (coord : sparse_dim[0]) {
+ ///     if (coord == d0 + d1) {
+ ///       generated_code;
+ ///     }
+ ///   }
+ /// to filter out coordinates that are not equal to the affine expression
+ /// result.
+ /// TODO: we want to make the filter loop more efficient in the future, e.g.,
+ /// by avoiding a scan over the full list of stored indices (keeping the last
+ /// position in the ordered list) or even using binary search to find the
+ /// index.
+ Merger(unsigned t, unsigned l, unsigned fl)
+ : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1),
+ numNativeLoops(l), numLoops(l + fl), hasSparseOut(false),
dimTypes(numTensors,
std::vector<DimLevelType>(numLoops, DimLevelType::Undef)),
loopIdxToDim(numTensors,
@@ -231,6 +246,15 @@ class Merger {
/// Bit translation (get loop index).
unsigned index(unsigned b) const { return b / numTensors; }
+ /// Get the number of total loops (native loops + filter loops).
+ unsigned getNumLoops() const { return numLoops; }
+ /// Get the number of native loops.
+ unsigned getNumNativeLoops() const { return numNativeLoops; }
+ /// Get the number of filter loops.
+ unsigned getNumFilterLoops() const { return numLoops - numNativeLoops; }
+ /// Get the starting filter loop index.
+ unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); }
+
/// Returns true if bit corresponds to index of output tensor.
bool isOutTensor(unsigned b, unsigned i) const {
return tensor(b) == outTensor && index(b) == i;
@@ -242,6 +266,11 @@ class Merger {
/// expressions).
unsigned getSynTensorID() const { return syntheticTensor; }
+ /// Returns true if the given loop index denotes a filter loop.
+ bool isFilterLoop(unsigned ldx) const {
+ assert(ldx < numLoops);
+ return ldx >= numNativeLoops;
+ }
+
/// Returns true if given tensor iterates *only* in the given tensor
/// expression. For the output tensor, this defines a "simply dynamic"
/// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for
@@ -345,6 +374,7 @@ class Merger {
const unsigned outTensor;
const unsigned syntheticTensor;
const unsigned numTensors;
+ const unsigned numNativeLoops;
const unsigned numLoops;
bool hasSparseOut;
// Map that converts pair<tensor id, loop id> to the corresponding dimension
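The new constructor and loop queries above can be pieced together as follows. A minimal sketch, not part of this patch, assuming the mlir::sparse_tensor namespace from Merger.h and a hypothetical kernel with 3 tensors, 3 native loops, and 1 filter loop (e.g., for a map (d0, d1, d2) => (d0 + d1, d2) with a compressed first level):

  #include <cassert>
  #include "mlir/Dialect/SparseTensor/Utils/Merger.h"

  // Sketch only: exercise the extended Merger API with illustrative counts.
  void mergerFilterLoopSketch() {
    using mlir::sparse_tensor::Merger;
    Merger merger(/*t=*/3, /*l=*/3, /*fl=*/1);
    // Native loops occupy indices [0, numNativeLoops); filter loops follow.
    assert(merger.getNumNativeLoops() == 3);
    assert(merger.getNumFilterLoops() == 1);
    assert(merger.getNumLoops() == 4);
    assert(merger.getFilterLoopStartingIdx() == 3);
    assert(!merger.isFilterLoop(2) && merger.isFilterLoop(3));
  }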
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
index e94224db2b347..a591bc3bd9b68 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -329,6 +329,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
return loop;
}
+Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim(
+ OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine,
+ MutableArrayRef<Value> reduc) {
+ llvm_unreachable("need to be implemented");
+}
+
void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel(
OpBuilder &builder, Location loc, size_t tid, size_t dim,
AffineExpr affine) {
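enterFilterLoopOverTensorAtDim is left as a stub above (llvm_unreachable). Based on the filter-loop description in Merger.h, a future implementation would scan the stored coordinates of the compressed level and guard the body with an equality check. The plain C++ sketch below only illustrates that loop structure; the names pointers, indices, and body are hypothetical, and the real emitter would build scf/memref IR instead:

  #include <cstdint>
  #include <functional>

  // Sketch only: visit the stored coordinates of one compressed level in the
  // segment [pointers[lo], pointers[lo + 1]) and run the body solely for the
  // coordinate equal to the affine expression value `e`.
  void filterLoopSketch(const int64_t *pointers, const int64_t *indices,
                        int64_t lo, int64_t e,
                        const std::function<void(int64_t)> &body) {
    for (int64_t p = pointers[lo]; p < pointers[lo + 1]; ++p) {
      int64_t coord = indices[p];
      if (coord == e)
        body(coord);
    }
  }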
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
index 2987490e25907..7e6a350b7117b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -406,6 +406,11 @@ class SparseTensorLoopEmitter {
ArrayRef<size_t> extraTids = {},
ArrayRef<size_t> extraDims = {});
+ /// Emits a filter loop over tensor `tid` at dimension `dim` that keeps only
+ /// the coordinates matching the given affine expression (see the filter-loop
+ /// description in Merger.h).
+ Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+ size_t tid, size_t dim,
+ AffineExpr affine,
+ MutableArrayRef<Value> reduc = {});
+
void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc,
size_t tid, size_t dim,
AffineExpr affine);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index c9a15252c6ce4..b822a71e228ef 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -155,8 +155,11 @@ static AffineMap permute(MLIRContext *context, AffineMap m,
/// Helper method to inspect affine expressions. Rejects cases where the
/// same index is used more than once. Also rejects compound affine
/// expressions in sparse dimensions.
+/// filterLdx stores the filter loop index that should be used for the next
+/// compound affine expression on a sparse level; it is incremented by one
+/// every time it is used.
static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
- AffineExpr a, DimLevelType dlt,
+ AffineExpr a, DimLevelType dlt, unsigned &filterLdx,
bool setLvlFormat = true) {
switch (a.getKind()) {
case AffineExprKind::DimId: {
@@ -169,22 +172,68 @@ static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
return true;
}
case AffineExprKind::Add:
- case AffineExprKind::Mul: {
- if (!isDenseDLT(dlt))
- return false; // compound only in dense dim
- auto binOp = a.cast<AffineBinaryOpExpr>();
- // We do not set dim level format for affine expresssion like d0 + d1 on
- // both loop index at d0 and d1,
- return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) &&
- findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false);
+ case AffineExprKind::Mul:
+ case AffineExprKind::Constant: {
+ if (!isDenseDLT(dlt) && setLvlFormat) {
+ assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx)));
+ // Use a filter loop for sparse affine expression.
+ merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt);
+ }
+
+ if (auto binOp = a.dyn_cast<AffineBinaryOpExpr>()) {
+ // We do not set the dim level format for affine expressions like d0 + d1 on
+ // either loop index at d0 or d1.
+ // We continue the recursion merely to check whether the current affine
+ // expression is admissible or not.
+ return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx,
+ false) &&
+ findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx,
+ false);
+ }
+ // Fall through when it is a constant affine expression.
+ return true;
}
- case AffineExprKind::Constant:
- return isDenseDLT(dlt); // const only in dense dim
default:
return false;
}
}
+/// Gets the total number of compound affine expressions in affineMap that are
+/// attached to sparse levels of the given tensor. For the following inputs:
+///
+/// affineMap = (d0, d1, d2) => (d0 + d1, d2)
+/// tensor = ["compressed", "compressed"]
+///
+/// Returns 1 (because the first level is compressed and its corresponding
+/// affine expression is d0 + d1).
+static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap,
+ Value tensor) {
+ unsigned num = 0;
+ auto enc = getSparseTensorEncoding(tensor.getType());
+ if (enc) {
+ ArrayRef<AffineExpr> exps = affineMap.getResults();
+ for (unsigned rank = 0; rank < exps.size(); rank++) {
+ auto aidx = toOrigDim(enc, rank);
+ auto affine = exps[aidx];
+ if (!affine.isa<AffineDimExpr>())
+ if (!isDenseDLT(getDimLevelType(enc, rank)))
+ num++;
+ }
+ }
+
+ return num;
+}
+
+/// Gets the total number of compound affine expressions attached to sparse
+/// levels in the given GenericOp.
+static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) {
+ unsigned num = 0;
+ for (OpOperand &t : op->getOpOperands())
+ num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t),
+ t.get());
+ return num;
+}
+
/// Helper method to inspect sparse encodings in the tensor types.
/// Fills the per-dimension sparsity information for all tensors.
/// Returns true if the sparse annotations and affine subscript
@@ -192,19 +241,22 @@ static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
/// no annotations are found or inadmissible constructs occur.
static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
bool annotated = false;
+ unsigned filterLdx = merger.getFilterLoopStartingIdx();
for (OpOperand &t : op->getOpOperands()) {
auto map = op.getMatchingIndexingMap(&t);
auto enc = getSparseTensorEncoding(t.get().getType());
if (enc)
annotated = true;
assert(map.getNumResults() == op.getRank(&t));
+
for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
unsigned tensor = t.getOperandNumber();
AffineExpr a = map.getResult(toOrigDim(enc, d));
- if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d)))
+ if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx))
return false; // inadmissible affine expression
}
}
+ assert(filterLdx == merger.getNumLoops());
return annotated;
}
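To make the analysis concrete, here is a hypothetical walk-through (not taken from the patch's tests) for the running example (d0, d1, d2) => (d0 + d1, d2) with both levels compressed, i.e., 3 native loops plus 1 filter loop:

  // Level 0 (subscript d0 + d1): the expression is compound and the level is
  // sparse, so its level type is recorded under the filter loop index
  // filterLdx == merger.getFilterLoopStartingIdx() == 3; filterLdx becomes 4.
  // Level 1 (subscript d2): a plain AffineDimExpr, recorded under native loop
  // index 2.
  // Afterwards filterLdx == merger.getNumLoops() == 4, so the assert holds.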
@@ -214,34 +266,58 @@ static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
/// latest possible index.
static bool topSortOptimal(unsigned n,
ArrayRef<utils::IteratorType> iteratorTypes,
- std::vector<unsigned> &topSort,
+ const Merger &merger, std::vector<unsigned> &topSort,
std::vector<unsigned> &inDegree,
std::vector<std::vector<bool>> &adjM) {
- std::vector<unsigned> redIt; // reduce iterator with 0 degree
- std::vector<unsigned> parIt; // parallel iterator with 0 degree
+ std::vector<unsigned> redIt; // reduce iterator with 0 degree
+ std::vector<unsigned> parIt; // parallel iterator with 0 degree
+ std::vector<unsigned> filterIt; // filter loop with 0 degree
for (unsigned i = 0; i < n; i++) {
if (inDegree[i] == 0) {
- if (linalg::isReductionIterator(iteratorTypes[i]))
+ if (merger.isFilterLoop(i))
+ filterIt.push_back(i);
+ else if (linalg::isReductionIterator(iteratorTypes[i]))
redIt.push_back(i);
else
parIt.push_back(i);
}
}
- while (!redIt.empty() || !parIt.empty()) {
- // We always choose parallel iterator if there is any.
- auto &it = !parIt.empty() ? parIt : redIt;
+ while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) {
+ // We always choose loops in the order filter loop -> parallel loop ->
+ // reduction loop, because
+ // 1. Putting a reduction loop early might make the loop sequence
+ //    inadmissible.
+ // 2. Filter loops should be placed as early as possible for better
+ //    performance, since only one (if any) iteration will carry the
+ //    computation. E.g.,
+ //      for (1 to N)
+ //        for (1 to M)
+ //          for (1 to K)
+ //            if (xxx)
+ //              O(X) computation  => O(NMK+NMX) time complexity
+ //
+ //    By putting the filter loop one level up, we get
+ //
+ //      for (1 to N)
+ //        for (1 to K)
+ //          if (xxx)
+ //            for (1 to M)
+ //              O(X) computation  => O(NK+NMX) time complexity
+ auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
auto src = it.back();
topSort.push_back(src);
it.pop_back();
// Update in-degree, and push 0-degree node into worklist.
- for (unsigned dst = 0; dst < n; dst++)
+ for (unsigned dst = 0; dst < n; dst++) {
if (adjM[src][dst] && --inDegree[dst] == 0) {
- if (linalg::isReductionIterator(iteratorTypes[dst]))
+ if (merger.isFilterLoop(dst))
+ filterIt.push_back(dst);
+ else if (linalg::isReductionIterator(iteratorTypes[dst]))
redIt.push_back(dst);
else
parIt.push_back(dst);
}
+ }
}
return topSort.size() == n;
}
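The worklist priority can be seen on a toy iteration graph. A self-contained sketch, with a hypothetical 4-loop kernel in which f3 is the filter loop depending on d0 and d1, and d2 is a reduction loop (the classification below stands in for merger.isFilterLoop() and linalg::isReductionIterator()):

  #include <cassert>
  #include <vector>

  int main() {
    const unsigned n = 4, f3 = 3, d2 = 2;
    // adjM[src][dst] == true means src must be scheduled before dst.
    std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
    adjM[0][f3] = adjM[1][f3] = true; // the filter loop follows d0 and d1
    std::vector<unsigned> inDegree(n, 0);
    for (unsigned s = 0; s < n; s++)
      for (unsigned d = 0; d < n; d++)
        inDegree[d] += adjM[s][d];

    std::vector<unsigned> filterIt, parIt, redIt, topSort;
    auto classify = [&](unsigned i) {
      if (i == f3)
        filterIt.push_back(i);
      else if (i == d2)
        redIt.push_back(i);
      else
        parIt.push_back(i);
    };
    for (unsigned i = 0; i < n; i++)
      if (inDegree[i] == 0)
        classify(i);

    while (!filterIt.empty() || !parIt.empty() || !redIt.empty()) {
      // Same priority as topSortOptimal: filter -> parallel -> reduction.
      auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
      unsigned src = it.back();
      it.pop_back();
      topSort.push_back(src);
      for (unsigned dst = 0; dst < n; dst++)
        if (adjM[src][dst] && --inDegree[dst] == 0)
          classify(dst);
    }
    // The filter loop is scheduled right after the loops it depends on and
    // before the reduction loop: d1, d0, f3, d2.
    assert((topSort == std::vector<unsigned>{1, 0, 3, 2}));
    return 0;
  }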
@@ -340,7 +416,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
OpOperand *skip = nullptr) {
// Set up an n x n from/to adjacency matrix of the iteration graph
// for the implicit loop indices i_0 .. i_n-1.
- unsigned n = op.getNumLoops();
+ unsigned n = merger.getNumLoops();
std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
std::vector<unsigned> inDegree(n, 0); // in-degree of each node.
auto iteratorTypes = op.getIteratorTypesArray();
@@ -352,7 +428,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
// Get map and encoding.
auto map = op.getMatchingIndexingMap(&t);
auto enc = getSparseTensorEncoding(t.get().getType());
- assert(map.getNumDims() == n);
+ assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n);
// Skip dense tensor constraints when not requested.
if (!(mask & SortMask::kIncludeDense) && !enc)
continue;
@@ -364,6 +440,19 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
AffineExpr ta = map.getResult(toOrigDim(enc, d));
Optional<unsigned> tldx = merger.getLoopIdx(t.getOperandNumber(), d);
+ // Filter loops should be constructed after all the loops they depend on,
+ // i.e., d0 + d1 < filter_loop(d0 + d1).
+ if (tldx && merger.isFilterLoop(tldx.value())) {
+ assert(!ta.isa<AffineDimExpr>() &&
+ !isDenseDLT(getDimLevelType(enc, d)));
+ addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx);
+ // Now that the ordering of the affine expression is captured by the
+ // filter loop idx, we only need to ensure the ordering against the
+ // filter loop. Thus, we reset the affine expression to nil here to
+ // mark it as resolved.
+ ta = AffineExpr();
+ }
+
if (d > 0) {
AffineExpr fa = map.getResult(toOrigDim(enc, d - 1));
Optional<unsigned> fldx =
@@ -377,6 +466,11 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
if (!(mask & SortMask::kIncludeDense))
tryLoosenAffineDenseConstraints(op, fldx, fa, tldx, ta);
+ // (d0 + d1) < (d2 + d3), or
+ // filter_loop_d-1 < (d2 + d3), or
+ // (d0 + d1) < filter_loop_d, or
+ // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset
+ // above.
addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
}
}
@@ -402,7 +496,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
// Report failure for a cyclic iteration graph.
topSort.clear();
topSort.reserve(n);
- return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM);
+ return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM);
}
/// Returns true if tensor materializes uninitialized into the computation.
@@ -430,9 +524,8 @@ static bool isAdmissibleTensorExp(Merger &merger, linalg::GenericOp op,
// An all-dense annotated "sparse" output tensor becomes a linearized random
// access 1-dim memref. Also admissible since insertions cannot occur.
bool allDense = true;
- auto iteratorTypes = op.getIteratorTypesArray();
- unsigned numLoops = iteratorTypes.size();
- for (unsigned i = 0; i < numLoops; i++)
+ unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops
+ for (unsigned i = 0; i < merger.getNumLoops(); i++)
if (isCompressedDLT(merger.getDimLevelType(tensor, i)) ||
isSingletonDLT(merger.getDimLevelType(tensor, i))) {
allDense = false;
@@ -443,19 +536,31 @@ static bool isAdmissibleTensorExp(Merger &merger, linalg::GenericOp op,
}
if (allDense)
return true;
+
+ // TODO: support compound affine expression on sparse output.
+ if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs),
+ lhs->get()) != 0)
+ return false;
+
// A tensor expression with a sparse output tensor that changes its values
// but not its nonzero structure, an operation called "simply dynamic" in
// [Bik96,Ch9], is also admissible without special codegen.
if (merger.isSingleCondition(tensor, exp))
return true;
+
// Accept "truly dynamic" if the output tensor materializes uninitialized
// into the computation and insertions occur in lexicographic index order.
if (isMaterializing(lhs->get())) {
+ auto iteratorTypes = op.getIteratorTypesArray();
unsigned nest = 0;
for (unsigned i = 0; i < numLoops; i++) {
- if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
- break; // terminate at first reduction
- nest++;
+ if (!merger.isFilterLoop(topSort[i])) {
+ // We only count non-filter loops, as filter loops should be considered
+ // a special type of parallel loop.
+ if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
+ break; // terminate at first reduction
+ nest++;
+ }
}
// Determine admissible dynamic insertion situations:
// (1) fully injective, since there are no reductions,
@@ -878,7 +983,14 @@ static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder,
auto enc = getSparseTensorEncoding(t.get().getType());
for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
AffineExpr a = map.getResult(toOrigDim(enc, d));
- if (!isInvariantAffine(codegen, a, ldx, atLevel))
+ Optional<unsigned> sldx = merger.getLoopIdx(t.getOperandNumber(), d);
+ if (sldx && merger.isFilterLoop(sldx.value())) {
+ if (!codegen.getLoopIdxValue(sldx.value()))
+ // The filter loop has not been constructed yet.
+ return;
+ if (sldx.value() == ldx)
+ atLevel = true;
+ } else if (!isInvariantAffine(codegen, a, ldx, atLevel))
return; // still in play
}
// All exhausted at this level (atLevel denotes exactly at this level).
@@ -1003,6 +1115,16 @@ static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
Operation *loop =
genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+ if (merger.isFilterLoop(idx)) {
+ assert(isSparse);
+ OpOperand *t = &op->getOpOperand(tid);
+ auto enc = getSparseTensorEncoding(t->get().getType());
+ // Retrieves the affine expression for the filter loop.
+ AffineExpr a =
+ op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim));
+ return codegen.loopEmitter.enterFilterLoopOverTensorAtDim(
+ builder, loc, tid, dim, a, reduc);
+ }
return codegen.loopEmitter.enterLoopOverTensorAtDim(
builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
}).value();
@@ -1488,7 +1610,8 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
return failure();
unsigned numTensors = op->getNumOperands();
unsigned numLoops = op.getNumLoops();
- Merger merger(numTensors, numLoops);
+ unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op);
+ Merger merger(numTensors, numLoops, numFilterLoops);
if (!findSparseAnnotations(merger, op))
return failure();
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index d2748460a094a..929900142d272 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -127,7 +127,7 @@ class MergerTestBase : public ::testing::Test {
protected:
MergerTestBase(unsigned numTensors, unsigned numLoops)
: numTensors(numTensors), numLoops(numLoops),
- merger(numTensors, numLoops) {}
+ merger(numTensors, numLoops, /*numFilterLoops=*/0) {}
///
/// Expression construction helpers.