[llvm-branch-commits] [mlir] 5508516 - [mlir][sparse] retry sparse-only for cyclic iteration graphs

Aart Bik via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Jan 14 22:44:42 PST 2021


Author: Aart Bik
Date: 2021-01-14T22:39:29-08:00
New Revision: 5508516b06633e95fb5c2d6a5e196e4dcaa72c8d

URL: https://github.com/llvm/llvm-project/commit/5508516b06633e95fb5c2d6a5e196e4dcaa72c8d
DIFF: https://github.com/llvm/llvm-project/commit/5508516b06633e95fb5c2d6a5e196e4dcaa72c8d.diff

LOG: [mlir][sparse] retry sparse-only for cyclic iteration graphs

This is a minor improvement to iteration graph construction.
If the first attempt, which uses the dimension-order constraints of all
tensors, yields a cyclic graph, a second attempt is made using the
constraints of sparse tensors only. Dense tensors prefer their dimension
order for locality, but they also provide random access when needed, so
relaxing their constraints enables the compilation of more sparse kernels.

Reviewed By: penpornk

Differential Revision: https://reviews.llvm.org/D94709
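
To make the retry concrete, the following is a minimal standalone sketch of
the idea, with hypothetical names (computeOrder, topoDFS); it is not the
actual Sparsification.cpp code. It builds the iteration graph as an
adjacency matrix over the loop indices, attempts a DFS-based topological
sort, and on detecting a cycle rebuilds the graph from the constraints of
sparse tensors only.

// Minimal standalone sketch of the retry strategy (hypothetical names;
// not the actual Sparsification.cpp code).
#include <algorithm>
#include <cstdio>
#include <vector>

// DFS-based topological sort; visit: 0 = unvisited, 1 = on stack, 2 = done.
static bool topoDFS(unsigned i, std::vector<unsigned> &visit,
                    std::vector<unsigned> &order,
                    const std::vector<std::vector<bool>> &adjM) {
  if (visit[i] != 0)
    return visit[i] != 1; // hitting a node still on the stack means a cycle
  visit[i] = 1;
  for (unsigned j = 0, n = adjM.size(); j < n; j++)
    if (adjM[i][j] && !topoDFS(j, visit, order, adjM))
      return false;
  visit[i] = 2;
  order.push_back(i); // post-order; reversed below into topological order
  return true;
}

// Builds the iteration graph from each tensor's index order and returns a
// loop order (outermost first), or false on a cycle. When sparseOnly is
// set, constraints of all-dense tensors are skipped, since dense storage
// tolerates any loop order through random access.
static bool computeOrder(const std::vector<std::vector<unsigned>> &tensorIdx,
                         const std::vector<bool> &isSparse, unsigned numLoops,
                         bool sparseOnly, std::vector<unsigned> &order) {
  std::vector<std::vector<bool>> adjM(numLoops,
                                      std::vector<bool>(numLoops, false));
  for (unsigned t = 0, e = tensorIdx.size(); t < e; t++) {
    if (sparseOnly && !isSparse[t])
      continue;
    for (unsigned d = 1, r = tensorIdx[t].size(); d < r; d++)
      adjM[tensorIdx[t][d - 1]][tensorIdx[t][d]] = true; // earlier index first
  }
  order.clear();
  order.reserve(numLoops);
  std::vector<unsigned> visit(numLoops, 0);
  for (unsigned i = 0; i < numLoops; i++)
    if (visit[i] == 0 && !topoDFS(i, visit, order, adjM))
      return false;
  std::reverse(order.begin(), order.end());
  return true;
}

int main() {
  // Two 3-d tensors accessed as A(i,j,k) and B(k,j,i); only B is sparse.
  std::vector<std::vector<unsigned>> tensorIdx = {{0, 1, 2}, {2, 1, 0}};
  std::vector<bool> isSparse = {false, true};
  std::vector<unsigned> order;
  // All constraints together are cyclic; the sparse-only retry succeeds.
  if (!computeOrder(tensorIdx, isSparse, 3, /*sparseOnly=*/false, order) &&
      !computeOrder(tensorIdx, isSparse, 3, /*sparseOnly=*/true, order))
    return 1;
  for (unsigned i : order)
    std::printf("loop %u\n", i); // prints 2, 1, 0: B's dimension order wins
  return 0;
}

The added test below exercises the same situation at 8-d: A(i,...,p) is
dense and B(p,...,i) is partially sparse, so the combined constraints are
cyclic, but B's constraints alone give an acyclic graph and the kernel now
compiles instead of being rejected.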

Added: 
    mlir/test/Dialect/Linalg/sparse_nd.mlir

Modified: 
    mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp

Removed: 
    


################################################################################
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
index 7ba0a2f63071..84c71e84c42e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -274,6 +274,11 @@ class Merger {
     return false;
   }
 
+  // Returns true if tensor has any sparse dimension.
+  bool isSparseTensor(unsigned t) const {
+    return llvm::any_of(dims[t], [](Dim d) { return d == Dim::kSparse; });
+  }
+
   // Setter
   void setDim(unsigned t, unsigned i, Dim d) { dims[t][i] = d; }
 
@@ -382,17 +387,22 @@ static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
 /// for sparse storage formats since these only support access along fixed
 /// dimensions. Even for dense storage formats, however, the natural index
 /// order yields innermost unit-stride access with better spatial locality.
-static bool computeIterationGraph(linalg::GenericOp op,
-                                  std::vector<unsigned> &topSort) {
+static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
+                                  std::vector<unsigned> &topSort,
+                                  bool sparseOnly) {
   // Set up an n x n from/to adjacency matrix of the iteration graph
   // for the implicit loop indices i_0 .. i_n-1.
   unsigned n = op.getNumLoops();
   std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
 
   // Iterate over the indexing maps of every tensor in the tensor expression.
-  for (auto imap : llvm::enumerate(op.indexing_maps())) {
-    auto map = imap.value().template cast<AffineMapAttr>().getValue();
+  unsigned numTensors = op.getNumShapedOperands();
+  for (unsigned t = 0; t < numTensors; t++) {
+    auto map = op.getIndexingMap(t);
     assert(map.getNumDims() == n);
+    // Skip dense tensor constraints when sparse only is requested.
+    if (sparseOnly && !merger.isSparseTensor(t))
+      continue;
     // At the moment, we take the index variables in the tensor access
     // expression in the order in which they appear (conceptually a
     // "row-major" layout of every tensor). So, a tensor access A_ijk
@@ -407,6 +417,7 @@ static bool computeIterationGraph(linalg::GenericOp op,
 
   // Topologically sort the iteration graph to determine loop order.
   // Report failure for a cyclic iteration graph.
+  topSort.clear();
   topSort.reserve(n);
   std::vector<unsigned> visit(n, 0);
   for (unsigned i = 0; i < n; i++)
@@ -1207,10 +1218,9 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
     // tensors are visited in natural index order. Fails on cycles.
     // This assumes that higher-level passes have already put the
     // tensors in each tensor expression in a feasible order.
-    // TODO: try again without *dense* constraints on failure or
-    //       even try to insert sparse reorderings to resolve cycles
     std::vector<unsigned> topSort;
-    if (!computeIterationGraph(op, topSort))
+    if (!computeIterationGraph(merger, op, topSort, /*sparseOnly=*/false) &&
+        !computeIterationGraph(merger, op, topSort, /*sparseOnly=*/true))
       return failure();
 
     // Finds the terminating yield statement and builds the tensor

diff --git a/mlir/test/Dialect/Linalg/sparse_nd.mlir b/mlir/test/Dialect/Linalg/sparse_nd.mlir
new file mode 100644
index 000000000000..2b0762b1bf37
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/sparse_nd.mlir
@@ -0,0 +1,94 @@
+// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+// RUN: mlir-opt %s -test-sparsification | FileCheck %s
+
+// Example with cyclic iteration graph with sparse and dense constraints,
+// but an acyclic iteration graph using sparse constraints only.
+#trait_mul = {
+  indexing_maps = [
+    affine_map<(i,j,k,l,m,n,o,p) -> (i,j,k,l,m,n,o,p)>,  // A
+    affine_map<(i,j,k,l,m,n,o,p) -> (p,o,n,m,l,k,j,i)>,  // B
+    affine_map<(i,j,k,l,m,n,o,p) -> (i,j,k,l,m,n,o,p)>   // X
+  ],
+  sparse = [
+    [ "D", "D", "D", "D", "D", "D", "D", "D" ],  // a
+    [ "D", "D", "D", "S", "S", "D", "D", "D" ],  // b
+    [ "D", "D", "D", "D", "D", "D", "D", "D" ]   // x
+  ],
+  iterator_types = ["parallel", "parallel", "parallel", "parallel",
+                    "parallel", "parallel", "parallel", "parallel"],
+  doc = "X(i,j,k,l,m,n,o,p) = A(i,j,k,l,m,n,o,p)  * B(p,o,n,m,l,k,j,i)"
+}
+
+// CHECK-LABEL:   func @mul(
+// CHECK-SAME:              %[[VAL_0:.*]]: tensor<100x200x300x400x500x600x700x800xf32>,
+// CHECK-SAME:              %[[VAL_1:.*]]: tensor<100x200x300x400x500x600x700x800xf32>) -> tensor<100x200x300x400x500x600x700x800xf32> {
+// CHECK:           %[[VAL_2:.*]] = constant 999 : index
+// CHECK:           %[[VAL_3:.*]] = constant 100 : index
+// CHECK:           %[[VAL_4:.*]] = constant 200 : index
+// CHECK:           %[[VAL_5:.*]] = constant 300 : index
+// CHECK:           %[[VAL_6:.*]] = constant 600 : index
+// CHECK:           %[[VAL_7:.*]] = constant 700 : index
+// CHECK:           %[[VAL_8:.*]] = constant 800 : index
+// CHECK:           %[[VAL_9:.*]] = constant 0 : index
+// CHECK:           %[[VAL_10:.*]] = constant 1 : index
+// CHECK:           %[[VAL_11:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK:           %[[VAL_12:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK:           %[[VAL_15:.*]] = alloca(%[[VAL_2]]) : memref<?xindex>
+// CHECK:           %[[VAL_16:.*]] = alloca(%[[VAL_2]]) : memref<?xf32>
+// CHECK:           %[[VAL_17:.*]] = alloca() : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK:           scf.for %[[VAL_18:.*]] = %[[VAL_9]] to %[[VAL_8]] step %[[VAL_10]] {
+// CHECK:             scf.for %[[VAL_19:.*]] = %[[VAL_9]] to %[[VAL_7]] step %[[VAL_10]] {
+// CHECK:               %[[VAL_20:.*]] = muli %[[VAL_18]], %[[VAL_7]] : index
+// CHECK:               %[[VAL_21:.*]] = addi %[[VAL_20]], %[[VAL_19]] : index
+// CHECK:               scf.for %[[VAL_22:.*]] = %[[VAL_9]] to %[[VAL_6]] step %[[VAL_10]] {
+// CHECK:                 %[[VAL_23:.*]] = muli %[[VAL_21]], %[[VAL_6]] : index
+// CHECK:                 %[[VAL_24:.*]] = addi %[[VAL_23]], %[[VAL_22]] : index
+// CHECK:                 %[[VAL_25:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
+// CHECK:                 %[[VAL_26:.*]] = addi %[[VAL_24]], %[[VAL_10]] : index
+// CHECK:                 %[[VAL_27:.*]] = load %[[VAL_12]]{{\[}}%[[VAL_26]]] : memref<?xindex>
+// CHECK:                 scf.for %[[VAL_28:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_10]] {
+// CHECK:                   %[[VAL_29:.*]] = load %[[VAL_13]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK:                   %[[VAL_30:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK:                   %[[VAL_31:.*]] = addi %[[VAL_28]], %[[VAL_10]] : index
+// CHECK:                   %[[VAL_32:.*]] = load %[[VAL_14]]{{\[}}%[[VAL_31]]] : memref<?xindex>
+// CHECK:                   scf.for %[[VAL_33:.*]] = %[[VAL_30]] to %[[VAL_32]] step %[[VAL_10]] {
+// CHECK:                     %[[VAL_34:.*]] = load %[[VAL_15]]{{\[}}%[[VAL_33]]] : memref<?xindex>
+// CHECK:                     scf.for %[[VAL_35:.*]] = %[[VAL_9]] to %[[VAL_5]] step %[[VAL_10]] {
+// CHECK:                       %[[VAL_36:.*]] = muli %[[VAL_33]], %[[VAL_5]] : index
+// CHECK:                       %[[VAL_37:.*]] = addi %[[VAL_36]], %[[VAL_35]] : index
+// CHECK:                       scf.for %[[VAL_38:.*]] = %[[VAL_9]] to %[[VAL_4]] step %[[VAL_10]] {
+// CHECK:                         %[[VAL_39:.*]] = muli %[[VAL_37]], %[[VAL_4]] : index
+// CHECK:                         %[[VAL_40:.*]] = addi %[[VAL_39]], %[[VAL_38]] : index
+// CHECK:                         scf.for %[[VAL_41:.*]] = %[[VAL_9]] to %[[VAL_3]] step %[[VAL_10]] {
+// CHECK:                           %[[VAL_42:.*]] = muli %[[VAL_40]], %[[VAL_3]] : index
+// CHECK:                           %[[VAL_43:.*]] = addi %[[VAL_42]], %[[VAL_41]] : index
+// CHECK:                           %[[VAL_44:.*]] = load %[[VAL_11]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK:                           %[[VAL_45:.*]] = load %[[VAL_16]]{{\[}}%[[VAL_43]]] : memref<?xf32>
+// CHECK:                           %[[VAL_46:.*]] = mulf %[[VAL_44]], %[[VAL_45]] : f32
+// CHECK:                           store %[[VAL_46]], %[[VAL_17]]{{\[}}%[[VAL_41]], %[[VAL_38]], %[[VAL_35]], %[[VAL_34]], %[[VAL_29]], %[[VAL_22]], %[[VAL_19]], %[[VAL_18]]] : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK:                         }
+// CHECK:                       }
+// CHECK:                     }
+// CHECK:                   }
+// CHECK:                 }
+// CHECK:               }
+// CHECK:             }
+// CHECK:           }
+// CHECK:           %[[VAL_47:.*]] = tensor_load %[[VAL_17]] : memref<100x200x300x400x500x600x700x800xf32>
+// CHECK:           return %[[VAL_47]] : tensor<100x200x300x400x500x600x700x800xf32>
+// CHECK:         }
+func @mul(%arga: tensor<100x200x300x400x500x600x700x800xf32>,
+          %argb: tensor<100x200x300x400x500x600x700x800xf32>)
+	      -> tensor<100x200x300x400x500x600x700x800xf32> {
+  %0 = linalg.generic #trait_mul
+    ins(%arga, %argb: tensor<100x200x300x400x500x600x700x800xf32>,
+                      tensor<100x200x300x400x500x600x700x800xf32>)
+    outs(%arga: tensor<100x200x300x400x500x600x700x800xf32>) {
+      ^bb(%a: f32, %b: f32, %s : f32):
+        %0 = mulf %a, %b : f32
+        linalg.yield %0 : f32
+    }      -> tensor<100x200x300x400x500x600x700x800xf32>
+  return %0 : tensor<100x200x300x400x500x600x700x800xf32>
+}

More information about the llvm-branch-commits mailing list