[Mlir-commits] [mlir] [MLIR][Transform] Hoist Pad generates linalg.transpose (PR #109669)

Mon Sep 23 09:45:22 PDT 2024

https://github.com/nujaa updated https://github.com/llvm/llvm-project/pull/109669

>From d7368895c2d16d5b69d419c0c15682a52139e90b Mon Sep 17 00:00:00 2001
From: Hugo <hugo.trachino at huawei.com>
Date: Thu, 19 Sep 2024 23:14:00 +0800
Subject: [PATCH] [MLIR][Transform] Hoist Pad generates linalg.transpose

---
 .../Dialect/Linalg/Transforms/Transforms.h    | 12 +++----
 .../include/mlir/Dialect/Linalg/Utils/Utils.h |  6 ----
 .../TransformOps/LinalgTransformOps.cpp       |  2 +-
 .../Linalg/Transforms/HoistPadding.cpp        | 25 +++++++------
 .../lib/Dialect/Linalg/Transforms/Padding.cpp |  2 +-
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp       | 35 -------------------
 ...-op-hoist-pad-build-packing-loop-nest.mlir |  4 +--
 .../Linalg/transform-op-hoist-pad.mlir        | 10 +++---
 8 files changed, 27 insertions(+), 69 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 0208f854f799ec..48e657cca96e39 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -549,7 +549,7 @@ namespace detail {
 struct PackingResult {
   SmallVector<OpFoldResult> offsets, sizes, strides;
   SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
-  GenericOp maybeTransposeOp;
+  TransposeOp maybeTransposeOp;
   tensor::PadOp hoistedPadOp;
 };
 
@@ -568,9 +568,9 @@ buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist,
 /// a larger tensor. On success, `opToHoist` is replaced by the cloned version
 /// in the packing loop so the caller can continue reasoning about the padding
 /// operation. If `transposeVector` is non-empty, hoist padding introduces a
-/// GenericOp to transpose the padded tensor before inserting it into the packed
-/// tensor. A `transposeVector` can change the storage order of the padded
-/// tensor but does not change the order of the pack or compute loops.
+/// TransposeOp to transpose the padded tensor before inserting it into the
+/// packed tensor. A `transposeVector` can change the storage order of the
+/// padded tensor but does not change the order of the pack or compute loops.
 ///
 /// TODO: In the future, we should consider rewriting as a tensor.pack after
 /// hoisting since this abstraction is now available.
@@ -615,13 +615,13 @@ FailureOr<Value>
 hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist,
                       int64_t numLoops, ArrayRef<int64_t> transposeVector,
                       tensor::PadOp &hoistedOp,
-                      SmallVectorImpl<GenericOp> &transposeOps);
+                      SmallVectorImpl<TransposeOp> &transposeOps);
 /// Calls into `hoistPaddingOnTensors` with a local IRRewriter.
 FailureOr<Value>
 hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
                       ArrayRef<int64_t> transposeVector,
                       tensor::PadOp &hoistedOp,
-                      SmallVectorImpl<GenericOp> &transposeOps);
+                      SmallVectorImpl<TransposeOp> &transposeOps);
 
 /// Apply padding and hoisting to `linalgOp` according to the configuration
 /// specified in `options`.
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index f1df49ce3eaa36..1e4f3004dec7e7 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -75,12 +75,6 @@ bool isReductionIterator(utils::IteratorType iteratorType);
 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                             Value source, Value pad, bool nofold);
 
-/// Returns a GenericOp that transposes `inputTensor` into `outputTensor`
-/// using `transposeVector` to permute the `inputTensor` dimensions.
-GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor,
-                          Value outputTensor,
-                          ArrayRef<int64_t> transposeVector);
-
 /// Returns GenericOp that copies an n-D memref. Unlike the current
 /// implementation of memref::CopyOp, this op can further tile, lower to loops
 /// or vectorize.
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 29b5631f61b488..c28b07f33f5dcb 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2000,7 +2000,7 @@ transform::HoistPadOp::applyToOne(transform::TransformRewriter &rewriter,
                                   transform::ApplyToEachResultList &results,
                                   transform::TransformState &state) {
   tensor::PadOp hoistedPadOp;
-  SmallVector<GenericOp> transposeOps;
+  SmallVector<TransposeOp> transposeOps;
   FailureOr<Value> result =
       hoistPaddingOnTensors(rewriter, target, getNumLoops(), getTranspose(),
                             hoistedPadOp, transposeOps);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
index c3a08ce86082a8..d33a17af63459e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -633,15 +633,15 @@ static FailureOr<PackingResult> buildPackingLoopNestImpl(
                                       rewriter.getIndexAttr(1));
 
   // Step 3. Optionally transpose the padded tensor.
-  GenericOp maybeTransposeOp;
+  TransposeOp maybeTransposeOp;
   Value paddedTensor = bvm.lookup(opToHoist.getResult());
   if (!transposeVector.empty()) {
     Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
         loc, transposedTensorType, hoistedPackedTensor, offsets, sizes,
         strides);
-    maybeTransposeOp = makeTransposeOp(rewriter, loc, paddedTensor,
-                                       outputTensor, transposeVector);
-    paddedTensor = maybeTransposeOp.getResult(0);
+    maybeTransposeOp = rewriter.create<linalg::TransposeOp>(
+        loc, paddedTensor, outputTensor, transposeVector);
+    paddedTensor = maybeTransposeOp.getResult()[0];
   }
 
   // Innermost tensor.insert_slice and yields are optional / need loops.
@@ -938,7 +938,7 @@ static Value replaceByPackingResult(RewriterBase &rewriter,
 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
     RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
     ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
-    SmallVectorImpl<GenericOp> &transposeOps) {
+    SmallVectorImpl<TransposeOp> &transposeOps) {
   LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
              DBGS() << " by " << numLoops << " loops\n");
 
@@ -980,9 +980,9 @@ FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
     // Transpose the packed tensor back to the original storage order.
     Value emptyTensor = rewriter.create<tensor::EmptyOp>(
         loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
-    GenericOp unTransposeOp =
-        makeTransposeOp(rewriter, loc, newResult, emptyTensor, transposeVector);
-    newResult = unTransposeOp.getResult(0);
+    TransposeOp unTransposeOp = rewriter.create<linalg::TransposeOp>(
+        loc, newResult, emptyTensor, transposeVector);
+    newResult = unTransposeOp.getResult()[0];
     transposeOps.push_back(unTransposeOp);
   }
 
@@ -999,11 +999,10 @@ FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
   return newResult;
 }
 
-FailureOr<Value>
-mlir::linalg::hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops,
-                                    ArrayRef<int64_t> transposeVector,
-                                    tensor::PadOp &hoistedOp,
-                                    SmallVectorImpl<GenericOp> &transposeOps) {
+FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
+    tensor::PadOp opToHoist, int64_t numLoops,
+    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
+    SmallVectorImpl<TransposeOp> &transposeOps) {
   IRRewriter rewriter(opToHoist.getContext());
   return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
                                hoistedOp, transposeOps);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
index 518d2e138c02a9..a066c44408915e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
@@ -299,7 +299,7 @@ mlir::linalg::padAndHoistLinalgOp(RewriterBase &rewriter, LinalgOp linalgOp,
     }
 
     tensor::PadOp hoistedOp;
-    SmallVector<GenericOp> transposeOps;
+    SmallVector<TransposeOp> transposeOps;
     SmallVector<int64_t> transposeVector =
         en.index() < options.transposePaddings.size()
             ? options.transposePaddings[en.index()]
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 6a3f2fc5fbc496..38e427af1c4846 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -249,41 +249,6 @@ Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
   return sliceOp.getSource();
 }
 
-GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor,
-                          Value outputTensor,
-                          ArrayRef<int64_t> transposeVector) {
-  auto resultTensorType = cast<RankedTensorType>(outputTensor.getType());
-  Type elementType = resultTensorType.getElementType();
-
-  assert(isPermutationVector(transposeVector) &&
-         "expect transpose vector to be a permutation");
-  assert(transposeVector.size() ==
-             static_cast<size_t>(resultTensorType.getRank()) &&
-         "expect transpose vector size to match result tensor rank");
-
-  // Compute the transpose and the indentity indexing maps.
-  SmallVector<AffineMap> indexingMaps = {
-      inversePermutation(AffineMap::getPermutationMap(
-          SmallVector<unsigned>(transposeVector), b.getContext())),
-      AffineMap::getMultiDimIdentityMap(transposeVector.size(),
-                                        b.getContext())};
-  SmallVector<utils::IteratorType> iteratorTypes(transposeVector.size(),
-                                                 utils::IteratorType::parallel);
-
-  // Create a GenericOp to transpose `inputTensor` into `outputTensor`.
-  auto transposeOp =
-      b.create<GenericOp>(loc, resultTensorType, inputTensor, outputTensor,
-                          indexingMaps, iteratorTypes);
-
-  // Create the body of the transpose operation.
-  OpBuilder::InsertionGuard g(b);
-  Region &body = transposeOp.getRegion();
-  Block *bodyBlock = b.createBlock(&body, /*insertPt=*/{},
-                                   {elementType, elementType}, {loc, loc});
-  b.create<YieldOp>(loc, bodyBlock->getArgument(0));
-  return transposeOp;
-}
-
 GenericOp makeMemRefCopyOp(OpBuilder &b, Location loc, Value from, Value to) {
   auto memrefTypeTo = cast<MemRefType>(to.getType());
 #ifndef NDEBUG
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir
index ae63ed5f1a410a..a6943cf338d42a 100644
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir
@@ -115,8 +115,8 @@ func.func @pad_and_hoist_lhs_transpose(
   //     BUILD-PACKING-LOOP-NEST: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<?x12x5xf32>) {
   //     BUILD-PACKING-LOOP-NEST:   tensor.pad %{{.*}}
   //     BUILD-PACKING-LOOP-NEST:     : tensor<?x12xf32> to tensor<5x12xf32>
-  //     BUILD-PACKING-LOOP-NEST:   linalg.generic
-  //     BUILD-PACKING-LOOP-NEST:     -> tensor<12x5xf32>
+  //     BUILD-PACKING-LOOP-NEST:   linalg.transpose
+  //     BUILD-PACKING-LOOP-NEST:     ins({{.*}} : tensor<5x12xf32>) outs({{.*}} : tensor<12x5xf32>)
   //     BUILD-PACKING-LOOP-NEST:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
   // BUILD-PACKING-LOOP-NEST-SAME:   : tensor<12x5xf32> into tensor<?x12x5xf32>
   //     BUILD-PACKING-LOOP-NEST: scf.for %{{.*}} -> (tensor<24x25xf32>)
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
index 499d9904c06b94..e075ff57666b0c 100644
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
@@ -123,17 +123,17 @@ func.func @pad_and_hoist_lhs_transpose(
      -> tensor<24x25xf32>
 {
   //     CHECK: %[[PACKED:.*]] = scf.for %{{.*}} -> (tensor<5x12x5xf32>) {
-  //     CHECK:   tensor.pad %{{.*}}
+  //     CHECK:   %[[PAD:.*]] = tensor.pad %{{.*}}
   //     CHECK:     : tensor<?x12xf32> to tensor<5x12xf32>
-  //     CHECK:   linalg.generic
-  //     CHECK:     -> tensor<12x5xf32>
+  //     CHECK:   linalg.transpose
+  //     CHECK:      ins(%[[PAD]] : tensor<5x12xf32>) outs(%{{.*}} : tensor<12x5xf32>)
   //     CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
   // CHECK-SAME:   : tensor<12x5xf32> into tensor<5x12x5xf32>
   //     CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) {
   //     CHECK:   %[[PADDED:.*]] = tensor.extract_slice %[[PACKED]][%{{.*}}, 0, 0] [1, 12, 5] [1, 1, 1]
   // CHECK-SAME:    : tensor<5x12x5xf32> to tensor<12x5xf32>
-  //     CHECK:   %[[TRANSPOSED:.*]] = linalg.generic
-  //     CHECK:     -> tensor<5x12xf32>
+  //     CHECK:   %[[TRANSPOSED:.*]] = linalg.transpose ins(%[[PADDED]] : tensor<12x5xf32>)
+  //     CHECK:     outs(%{{.*}} : tensor<5x12xf32>
   //     CHECK:   linalg.matmul ins(%[[TRANSPOSED]]
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
   func.return %0 : tensor<24x25xf32>