[Mlir-commits] [mlir] 4b03906 - [mlir][linalg] Perform checks early in hoist padding.
Tobias Gysi
llvmlistbot at llvm.org
Thu Nov 25 02:41:20 PST 2021
Author: Tobias Gysi
Date: 2021-11-25T10:37:12Z
New Revision: 4b03906346a89b8c7e96b5ffe762183fb0fff7cd
URL: https://github.com/llvm/llvm-project/commit/4b03906346a89b8c7e96b5ffe762183fb0fff7cd
DIFF: https://github.com/llvm/llvm-project/commit/4b03906346a89b8c7e96b5ffe762183fb0fff7cd.diff
LOG: [mlir][linalg] Perform checks early in hoist padding.
Instead of checking for unexpected operations (any operation with a region other than scf::ForOp or `padTensorOp`, or any operation with a memory effect) while cloning the packing loop nest, perform the checks early. Update `dropNonIndexDependencies` to check for unexpected operations. Additionally, check that all of these operations have index-type operands only.
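For illustration, a minimal standalone sketch of the early check this change introduces (the helper name `isSupportedIndexOp` is hypothetical; the individual calls mirror those in the diff below):

```cpp
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/STLExtras.h"

using namespace mlir;

// Sketch only (not part of the patch): reject ops that are not pure index
// computations, mirroring the checks moved into dropNonIndexDependencies.
static LogicalResult isSupportedIndexOp(Operation *op) {
  // All operands must have index type.
  if (llvm::any_of(op->getOperandTypes(),
                   [](Type type) { return !type.isIndex(); }))
    return failure();
  // The op must not have regions or memory effects.
  auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
  bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
  if (hasMemoryEffect || op->getNumRegions() != 0)
    return failure();
  return success();
}
```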
Depends On D114428
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D114438
Added:
Modified:
mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
mlir/test/Dialect/Linalg/pad-and-hoist.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
index ba46a813d1ea..29cc48350938 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -43,7 +43,8 @@ using namespace mlir::linalg;
/// 2. Pad op does not have a constant padding value.
/// 3. There is no immediately enclosing scf::ForOp.
/// 4. The backward slice from the pad op to the scf::ForOp to hoist above
-/// contains an unknown op with a region.
+/// contains an unknown op with non index type operands, a region, or a
+/// memory effect.
/// 5. The backward slice from the pad op to the scf::ForOp to hoist above is
/// empty.
/// 6. The source tensor of pad op is not defined by an extract slice op.
@@ -80,7 +81,8 @@ struct HoistingAnalysis {
/// operands consumed by `padTensorOp` and `sliceOp` and drops the operations
/// not part of this index computation. Afterwards, the filtered
/// `backwardSlice` contains only the loops whose induction variable is used,
- /// directly or indirectly, to index the padded tensor.
+ /// directly or indirectly, to index the padded tensor. The method returns
+ /// failure if the filtered backward slice contains an unexpected operation.
///
/// Example:
/// ```
@@ -96,8 +98,8 @@ struct HoistingAnalysis {
/// ```
/// dropNonIndexDependencies(%padded_slice, %slice)
/// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
- void dropNonIndexDependencies(PadTensorOp padTensorOp,
- tensor::ExtractSliceOp sliceOp);
+ LogicalResult dropNonIndexDependencies(PadTensorOp padTensorOp,
+ tensor::ExtractSliceOp sliceOp);
/// Encodes whether the analysis is valid and hoisting can proceed.
bool valid;
@@ -209,18 +211,8 @@ HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
// Remove all ops in the backward slice that are not used to index the padded
// tensor. In particular, keep `padTensorOp`, `sliceOp`, and the loop and
// affine operations used for the index computation.
- dropNonIndexDependencies(padTensorOp, sliceOp);
-
- // Check if an op has a region it is either `padTensorOp`, a scf::ForOp, or a
- // LinalgOp.
- for (Operation *op : backwardSlice) {
- if (op != padTensorOp && op->getNumRegions() > 0 &&
- !isa<scf::ForOp, LinalgOp>(op)) {
- LLVM_DEBUG(DBGS() << "Unsupported op with region: " << *op
- << " -> skip\n");
- return;
- }
- }
+ if (failed(dropNonIndexDependencies(padTensorOp, sliceOp)))
+ return;
// Add only the loops part of the filtered `backwardSlice` to the packing
// loops. All other loops are not used to index the padded data and
@@ -239,8 +231,9 @@ HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
valid = true;
}
-void HoistingAnalysis::dropNonIndexDependencies(
- PadTensorOp padTensorOp, tensor::ExtractSliceOp sliceOp) {
+LogicalResult
+HoistingAnalysis::dropNonIndexDependencies(PadTensorOp padTensorOp,
+ tensor::ExtractSliceOp sliceOp) {
// Set of all values used for index computation.
SetVector<Value> indexEdges;
@@ -289,7 +282,7 @@ void HoistingAnalysis::dropNonIndexDependencies(
// Add the index operands of the loop if its induction variable is
// used for index computation.
if (auto forOp = dyn_cast<scf::ForOp>(op)) {
- if (indexEdges.contains(forOp.getInductionVar())) {
+ if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
addIndexOperandsToIndexEdges(op);
continue;
}
@@ -298,6 +291,21 @@ void HoistingAnalysis::dropNonIndexDependencies(
// used for index computation.
if (hasIndexResult(op)) {
addIndexOperandsToIndexEdges(op);
+ // Check the operands of the remaining operations all have index type.
+ if (llvm::any_of(op->getOperandTypes(),
+ [](Type type) { return !type.isIndex(); })) {
+ LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
+ << op << " -> skip\n");
+ return failure();
+ }
+ // Check the remaining operations do not have regions or memory effects.
+ auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
+ bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
+ if (hasMemoryEffect || op->getNumRegions() != 0) {
+ LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
+ << op << " -> skip\n");
+ return failure();
+ }
continue;
}
// Remove all other operation not used by the index computation except for
@@ -305,6 +313,7 @@ void HoistingAnalysis::dropNonIndexDependencies(
if (!isa<arith::ConstantOp>(op))
backwardSlice.remove(op);
}
+ return success();
}
SmallVector<Value>
@@ -416,18 +425,13 @@ FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
continue;
- auto effects = dyn_cast<MemoryEffectOpInterface>(op);
- bool hasNoEffects = !effects || effects.hasNoEffect();
- if (hasNoEffects &&
- (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
+ // Clone all operations except scf::ForOp loops.
+ auto forOp = dyn_cast<scf::ForOp>(op);
+ if (!forOp) {
b.clone(*op, bvm);
continue;
}
- // TODO: support more cases as they appear.
- auto forOp = dyn_cast<scf::ForOp>(op);
- assert(forOp && llvm::is_contained(analysis.packingLoops, forOp) &&
- "expect an scf::ForOp that is a packing loop");
-
+ // Create a packing loop that takes `packedTensor` as iteration argument.
auto clonedForOp =
b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
bvm.lookupOrDefault(forOp.upperBound()),
diff --git a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
index d3f86d36504a..31a5469d03c6 100644
--- a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
+++ b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
@@ -436,3 +436,154 @@ func @non_constant_padding(%arg0: tensor<24x12xf32>,
return %0 : tensor<24x25xf32>
}
+// -----
+
+#map0 = affine_map<(d0) -> (5, -d0 + 24)>
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
+#map2 = affine_map<(d0) -> (-d0 + 5)>
+#map3 = affine_map<(d0) -> (-d0 + 7)>
+
+// CHECK: unexpected_operation
+// CHECK-DOUBLE: unexpected_operation
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]*]]: memref<?xindex>
+// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]*]]: i32
+func @unexpected_operation(%arg0: tensor<24x12xf32>,
+ %arg1: tensor<12x25xf32>,
+ %arg2: tensor<24x25xf32>,
+ %arg3: memref<?xindex>,
+ %arg4: i32) -> tensor<24x25xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c5 = arith.constant 5 : index
+ %c7 = arith.constant 7 : index
+ %c6 = arith.constant 6 : index
+ %c24 = arith.constant 24 : index
+ %c25 = arith.constant 25 : index
+ %c12 = arith.constant 12 : index
+ %c0 = arith.constant 0 : index
+
+ // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
+ %0 = scf.for %arg5 = %c0 to %c24 step %c5 iter_args(%arg6 = %arg2) -> (tensor<24x25xf32>) {
+
+ // CHECK-NEXT: scf.for %[[IV1:[0-9a-zA-Z]*]] =
+ %1 = scf.for %arg7 = %c0 to %c25 step %c7 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
+
+ // CHECK-NEXT: scf.for %[[IV2:[0-9a-zA-Z]*]] =
+ %2 = scf.for %arg9 = %c0 to %c12 step %c6 iter_args(%arg10 = %arg8) -> (tensor<24x25xf32>) {
+ %3 = affine.min #map0(%arg5)
+ %4 = tensor.extract_slice %arg0[%arg5, %arg9] [%3, 6] [1, 1] : tensor<24x12xf32> to tensor<?x6xf32>
+ %5 = affine.min #map1(%arg7)
+ %6 = tensor.extract_slice %arg1[%arg9, %arg7] [6, %5] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
+ %7 = tensor.extract_slice %arg10[%arg5, %arg7] [%3, %5] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
+ %8 = affine.apply #map2(%3)
+
+ // Check cannot hoist due to unexpected operation with memory effect.
+ // CHECK: %[[IDX0:.*]] = memref.load %[[ARG3]]
+ // CHECK: %[[T0:.*]] = linalg.pad_tensor {{.*}}, %[[IDX0]]
+ %9 = memref.load %arg3[%c0] : memref<?xindex>
+ %10 = linalg.pad_tensor %4 nofold low[%c0, %c0] high[%8, %9] {
+ ^bb0(%arg11: index, %arg12: index): // no predecessors
+ linalg.yield %cst : f32
+ } : tensor<?x6xf32> to tensor<5x6xf32>
+ %11 = affine.apply #map3(%5)
+
+ // Check cannot hoist due to unexpected operation with non index operand.
+ // CHECK: %[[IDX1:.*]] = arith.index_cast %[[ARG4]]
+ // CHECK: %[[T1:.*]] = linalg.pad_tensor {{.*}}[%[[IDX1]]
+ %12 = arith.index_cast %arg4 : i32 to index
+ %13 = linalg.pad_tensor %6 nofold low[%c0, %c0] high[%12, %11] {
+ ^bb0(%arg11: index, %arg12: index): // no predecessors
+ linalg.yield %cst : f32
+ } : tensor<6x?xf32> to tensor<6x7xf32>
+ %14 = linalg.pad_tensor %7 low[%c0, %c0] high[%8, %11] {
+ ^bb0(%arg11: index, %arg12: index): // no predecessors
+ linalg.yield %cst : f32
+ } : tensor<?x?xf32> to tensor<5x7xf32>
+
+ // CHECK: = linalg.matmul ins(%[[T0]], %[[T1]]
+ %15 = linalg.matmul ins(%10, %13 : tensor<5x6xf32>, tensor<6x7xf32>) outs(%14 : tensor<5x7xf32>) -> tensor<5x7xf32>
+ %16 = tensor.extract_slice %15[0, 0] [%3, %5] [1, 1] : tensor<5x7xf32> to tensor<?x?xf32>
+ %17 = tensor.insert_slice %16 into %arg10[%arg5, %arg7] [%3, %5] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
+ scf.yield %17 : tensor<24x25xf32>
+ }
+ scf.yield %2 : tensor<24x25xf32>
+ }
+ scf.yield %1 : tensor<24x25xf32>
+ }
+ return %0 : tensor<24x25xf32>
+}
+
+// -----
+
+#map0 = affine_map<(d0) -> (5, -d0 + 24)>
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
+#map2 = affine_map<(d0) -> (-d0 + 5)>
+#map3 = affine_map<(d0) -> (-d0 + 7)>
+
+// CHECK: unexpected_loop
+// CHECK-DOUBLE: unexpected_loop
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]*]]: index
+func @unexpected_loop(%arg0: tensor<24x12xf32>,
+ %arg1: tensor<12x25xf32>,
+ %arg2: tensor<24x25xf32>,
+ %arg3: index) -> tensor<24x25xf32> {
+ %c0 = arith.constant 0 : index
+ %c12 = arith.constant 12 : index
+ %c25 = arith.constant 25 : index
+ %c24 = arith.constant 24 : index
+ %c6 = arith.constant 6 : index
+ %c7 = arith.constant 7 : index
+ %c5 = arith.constant 5 : index
+ %cst = arith.constant 0.000000e+00 : f32
+
+ // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
+ %0 = scf.for %arg4 = %c0 to %c24 step %c5 iter_args(%arg5 = %arg2) -> (tensor<24x25xf32>) {
+
+ // CHECK-NEXT: scf.for %[[IV1:[0-9a-zA-Z]*]] =
+ %1 = scf.for %arg6 = %c0 to %c25 step %c7 iter_args(%arg7 = %arg5) -> (tensor<24x25xf32>) {
+
+ // Check the padding of the first input operand is hoisted.
+ // CHECK: = linalg.pad_tensor
+
+ // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] =
+ %2 = scf.for %arg8 = %c0 to %c12 step %c6 iter_args(%arg9 = %arg7) -> (tensor<24x25xf32>) {
+ %3 = affine.min #map0(%arg4)
+ %4 = tensor.extract_slice %arg0[%arg4, %arg8] [%3, 6] [1, 1] : tensor<24x12xf32> to tensor<?x6xf32>
+ %5 = affine.min #map1(%arg6)
+ %6 = tensor.extract_slice %arg1[%arg8, %arg6] [6, %5] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
+ %7 = tensor.extract_slice %arg9[%arg4, %arg6] [%3, %5] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
+ %8 = affine.apply #map2(%3)
+
+ // Check cannot hoist due to unexpected operation that has a region.
+ // CHECK: %[[IDX0:.*]] = scf.for {{.*}} step %[[ARG3]]
+ // CHECK: %[[T0:.*]] = linalg.pad_tensor {{.*}}, %[[IDX0]]
+ %9 = scf.for %arg10 = %c0 to %c24 step %arg3 iter_args(%arg11 = %c0) -> (index) {
+ %17 = arith.addi %arg3, %arg11 : index
+ scf.yield %17 : index
+ }
+ %10 = linalg.pad_tensor %4 nofold low[%c0, %c0] high[%8, %9] {
+ ^bb0(%arg10: index, %arg11: index): // no predecessors
+ linalg.yield %cst : f32
+ } : tensor<?x6xf32> to tensor<5x6xf32>
+ %11 = affine.apply #map3(%5)
+ %12 = linalg.pad_tensor %6 nofold low[%c0, %c0] high[%c0, %11] {
+ ^bb0(%arg10: index, %arg11: index): // no predecessors
+ linalg.yield %cst : f32
+ } : tensor<6x?xf32> to tensor<6x7xf32>
+ %13 = linalg.pad_tensor %7 low[%c0, %c0] high[%8, %11] {
+ ^bb0(%arg10: index, %arg11: index): // no predecessors
+ linalg.yield %cst : f32
+ } : tensor<?x?xf32> to tensor<5x7xf32>
+
+ // CHECK: = linalg.matmul ins(%[[T0]]
+ %14 = linalg.matmul ins(%10, %12 : tensor<5x6xf32>, tensor<6x7xf32>) outs(%13 : tensor<5x7xf32>) -> tensor<5x7xf32>
+ %15 = tensor.extract_slice %14[0, 0] [%3, %5] [1, 1] : tensor<5x7xf32> to tensor<?x?xf32>
+ %16 = tensor.insert_slice %15 into %arg9[%arg4, %arg6] [%3, %5] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
+ scf.yield %16 : tensor<24x25xf32>
+ }
+ scf.yield %2 : tensor<24x25xf32>
+ }
+ scf.yield %1 : tensor<24x25xf32>
+ }
+ return %0 : tensor<24x25xf32>
+}
+