[Mlir-commits] [mlir] [mlir][scf] Rewrite vector.transfer_read/write after peeling (PR #88684)
llvmlistbot at llvm.org
Sun Apr 14 23:04:08 PDT 2024
https://github.com/ShivaChen created https://github.com/llvm/llvm-project/pull/88684
After peeling, the trip count of the main loop is a multiple of the step. If the vector size of vector.transfer_read/write equals the step in the peeled loop, there is no remaining iteration smaller than the vector size.
In this case, rewriting vector.transfer_read/write to vector.load/store avoids generating masks when lowering to the LLVM dialect.
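As a sketch (this mirrors the test case added by the patch; %new_ub stands for the upper bound computed by peeling and %pad is the transfer_read padding value), the main loop

  scf.for %i = %c0 to %new_ub step %c64 {
    %val = vector.transfer_read %b[%i], %pad : memref<100xi32>, vector<64xi32>
    vector.transfer_write %val, %a[%i] : vector<64xi32>, memref<100xi32>
  }

becomes

  scf.for %i = %c0 to %new_ub step %c64 {
    %val = vector.load %b[%i] : memref<100xi32>, vector<64xi32>
    vector.store %val, %a[%i] : memref<100xi32>, vector<64xi32>
  }

while the remainder loop from %new_ub to the original upper bound keeps the transfer ops.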
From a37d9207ad9b286dba461f1aa53209d3cd31545e Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Wed, 3 Apr 2024 08:02:03 +0100
Subject: [PATCH] [mlir][scf] Rewrite vector.transfer_read/write after peeling
After peeling, the trip count of the main loop is a multiple of the step.
If the vector size of vector.transfer_read/write equals the step in the
peeled loop, there is no remaining iteration smaller than the vector
size.
In this case, rewriting vector.transfer_read/write to vector.load/store
avoids generating masks when lowering to the LLVM dialect.
---
.../SCF/Transforms/LoopSpecialization.cpp | 49 +++++++++++++++++++
.../for-loop-peeling-vector-load-store.mlir | 30 ++++++++++++
2 files changed, 79 insertions(+)
create mode 100644 mlir/test/Dialect/SCF/for-loop-peeling-vector-load-store.mlir
diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
index a30e349d49136c..4eb757d618a98d 100644
--- a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
@@ -20,6 +20,7 @@
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
@@ -166,6 +167,52 @@ static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
return success();
}
+/// Rewrite a vector.transfer_read/write op into an unmasked vector.load/store.
+static void rewriteVectorReadWriteToLoadStore(RewriterBase &b, Operation *op) {
+  b.setInsertionPoint(op);
+  if (auto write = dyn_cast<vector::TransferWriteOp>(op)) {
+    b.replaceOpWithNewOp<vector::StoreOp>(
+        op, write.getVector(), write.getSource(), write.getIndices());
+  } else if (auto read = dyn_cast<vector::TransferReadOp>(op)) {
+    b.replaceOpWithNewOp<vector::LoadOp>(op, read.getVectorType(),
+                                         read.getSource(), read.getIndices());
+  }
+}
+
+/// Return true if `op` is a 1-D vector.transfer_read/write whose vector size
+/// equals the loop step.
+static bool hasVectorSizeEqualToStep(Operation *op,
+                                     std::optional<int64_t> step) {
+  if (!step)
+    return false;
+
+  if (isa<vector::TransferWriteOp, vector::TransferReadOp>(op)) {
+    auto vectorType = isa<vector::TransferWriteOp>(op)
+                          ? cast<vector::TransferWriteOp>(op).getVectorType()
+                          : cast<vector::TransferReadOp>(op).getVectorType();
+
+    if (vectorType.getRank() != 1)
+      return false;
+
+    int64_t vectorSize = vectorType.getShape()[0];
+    if (vectorSize == *step)
+      return true;
+  }
+
+  return false;
+}
+
+/// In the main loop produced by peeling, rewrite vector.transfer_read/write
+/// ops whose vector size equals the loop step into vector.load/store.
+static void rewriteVectorizedLoopAfterPeeling(RewriterBase &rewriter,
+                                              ForOp forOp) {
+  auto stepInt = getConstantIntValue(forOp.getStep());
+
+  forOp.walk([&](Operation *op) {
+    if (!isa<vector::TransferWriteOp, vector::TransferReadOp>(op))
+      return WalkResult::advance();
+    if (!hasVectorSizeEqualToStep(op, stepInt))
+      return WalkResult::advance();
+    rewriteVectorReadWriteToLoadStore(rewriter, op);
+    return WalkResult::advance();
+  });
+}
+
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
ForOp partialIteration,
Value previousUb) {
@@ -200,6 +247,8 @@ LogicalResult mlir::scf::peelForLoopAndSimplifyBounds(RewriterBase &rewriter,
if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
return failure();
+ rewriteVectorizedLoopAfterPeeling(rewriter, forOp);
+
// Rewrite affine.min and affine.max ops.
rewriteAffineOpAfterPeeling(rewriter, forOp, partialIteration, previousUb);
diff --git a/mlir/test/Dialect/SCF/for-loop-peeling-vector-load-store.mlir b/mlir/test/Dialect/SCF/for-loop-peeling-vector-load-store.mlir
new file mode 100644
index 00000000000000..04991930a2c262
--- /dev/null
+++ b/mlir/test/Dialect/SCF/for-loop-peeling-vector-load-store.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s -scf-for-loop-peeling -canonicalize -verify-diagnostics | FileCheck %s
+
+func.func @vector_read_write(%a : memref<100xi32>, %b : memref<100xi32>, %ub: index) {
+// The trip count from %LB to %NEW_UB is a multiple of STEP after peeling,
+// so vector.transfer_read/write can be rewritten to vector.load/store to
+// avoid generating masks when lowering to LLVM.
+//
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 64) * 64)>
+// CHECK: func @vector_read_write(
+// CHECK-SAME: %[[A:.*]]: memref<100xi32>, %[[B:.*]]: memref<100xi32>, %[[UB:.*]]: index
+// CHECK: %[[LB:.*]] = arith.constant 0 : index
+// CHECK: %[[STEP:.*]] = arith.constant 64 : index
+// CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]
+// CHECK: scf.for %[[IV:.*]] = %[[LB]] to %[[NEW_UB]] step %[[STEP]] {
+// CHECK: %[[VAL:.*]] = vector.load %[[B]][%[[IV]]]
+// CHECK: vector.store %[[VAL]], %[[A]][%[[IV]]]
+// CHECK: }
+// CHECK: scf.for %[[IV:.*]] = %[[NEW_UB]] to %[[UB]] step %[[STEP]] {
+// CHECK: %[[VAL:.*]] = vector.transfer_read %[[B]][%[[IV]]]
+// CHECK: vector.transfer_write %[[VAL]], %[[A]][%[[IV]]]
+// CHECK: }
+ %c0 = arith.constant 0 : index
+ %c64 = arith.constant 64 : index
+ %pad = arith.constant 0 : i32
+ scf.for %i = %c0 to %ub step %c64 {
+ %val = vector.transfer_read %b[%i], %pad : memref<100xi32>, vector<64xi32>
+ vector.transfer_write %val, %a[%i] : vector<64xi32>, memref<100xi32>
+ }
+ return
+}