[Mlir-commits] [mlir] [mlir][vector] Set InBound for vector read write after peeling (PR #89108)

llvmlistbot at llvm.org
Thu Apr 18 20:57:28 PDT 2024


https://github.com/ShivaChen updated https://github.com/llvm/llvm-project/pull/89108

From a3fed33d46aa0f358a1e2f851e369c4d3eee156d Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Wed, 17 Apr 2024 17:22:24 +0100
Subject: [PATCH 1/3] [mlir][vector] Set InBound for vector read write after
 peeling

Set the in_bounds attribute for vector.transfer_read/write if the loop
trip count is a multiple of the vector size.

In this case, the number of effective vector elements in the last
iteration equals the vector size, so setting the attribute avoids
generating a mask.

Such a loop can be produced by -scf-for-loop-peeling.
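
A minimal sketch of the peeled-loop shape this check recognizes, using
illustrative value names (%lb, %ub, %step, %new_ub, %iv); the upper
bound has been rewritten so that the trip count becomes a multiple of
the step:

  #map = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
  %new_ub = affine.apply #map()[%lb, %ub, %step]
  scf.for %iv = %lb to %new_ub step %step {
    // Every iteration, including the last, covers a full `step`
    // elements, so a step-sized vector.transfer_read/write indexed
    // by %iv needs no mask for that dimension.
  }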
---
 mlir/lib/Dialect/Vector/IR/CMakeLists.txt     |  1 +
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      | 55 ++++++++++++++++++-
 .../Dialect/Vector/vector-transforms.mlir     | 23 ++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Vector/IR/CMakeLists.txt b/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
index 204462ffd047c6..95d31b9d8639cc 100644
--- a/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
@@ -24,6 +24,7 @@ add_mlir_dialect_library(MLIRVectorDialect
   MLIRMaskingOpInterface
   MLIRMemRefDialect
   MLIRSideEffectInterfaces
+  MLIRSCFDialect
   MLIRTensorDialect
   MLIRValueBoundsOpInterface
   MLIRVectorInterfaces
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 3e6425879cc67f..b233ad9786ce14 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -13,11 +13,13 @@
 
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StructuredOpsUtils.h"
@@ -3975,6 +3977,57 @@ Type TransferReadOp::getExpectedMaskType() {
   return inferTransferOpMaskType(getVectorType(), getPermutationMap());
 }
 
+// Determine the loop upper bound UB == s1 - ((s1 - LB) mod Step).
+// Then loop iteration = UB - LB = s1 - LB - ((s1 - LB) mod Step)
+// which is a multiple of the step.
+// The loop could be produced by -scf-for-loop-peeling.
+static bool isLoopIterationAsMultipleOfStep(mlir::scf::ForOp forOp) {
+  OpBuilder b(forOp.getContext());
+
+  auto UBOp = forOp.getUpperBound().getDefiningOp();
+  if (!UBOp)
+    return false;
+  auto applyOp = dyn_cast<mlir::affine::AffineApplyOp>(UBOp);
+  if (!applyOp)
+    return false;
+
+  SmallVector<Value> mapOps(applyOp.getMapOperands().begin(),
+                            applyOp.getMapOperands().end());
+  if (mapOps.size() != 3)
+    return false;
+  if (mapOps[0] != forOp.getLowerBound())
+    return false;
+  if (mapOps[2] != forOp.getStep())
+    return false;
+
+  auto UBExpr = applyOp.getAffineMap().getResult(0);
+  auto LBExpr = b.getAffineSymbolExpr(0);
+  auto sym1 = b.getAffineSymbolExpr(1);
+  auto stepExpr = b.getAffineSymbolExpr(2);
+
+  return UBExpr == (sym1 - (sym1 - LBExpr) % stepExpr);
+}
+
+template <typename TransferOp>
+static bool isLoopIterationAsMultipleOfVectorSize(TransferOp op,
+            int64_t resultIdx, int64_t indicesIdx) {
+  Value index = op.getIndices()[indicesIdx];
+  auto forOp = dyn_cast<mlir::scf::ForOp>(op->getParentOp());
+  if (!forOp)
+    return false;
+  auto constantStep = forOp.getConstantStep();
+  if (!constantStep)
+    return false;
+  if (index != forOp.getInductionVar())
+    return false;
+  if (!isLoopIterationAsMultipleOfStep(forOp))
+    return false;
+
+  int64_t vectorSize = op.getVectorType().getDimSize(resultIdx);
+
+  return vectorSize == *constantStep;
+}
+
 template <typename TransferOp>
 static bool isInBounds(TransferOp op, int64_t resultIdx, int64_t indicesIdx) {
   // TODO: support more aggressive createOrFold on:
@@ -3984,7 +4037,7 @@ static bool isInBounds(TransferOp op, int64_t resultIdx, int64_t indicesIdx) {
   Value index = op.getIndices()[indicesIdx];
   std::optional<int64_t> cstOp = getConstantIntValue(index);
   if (!cstOp.has_value())
-    return false;
+    return isLoopIterationAsMultipleOfVectorSize(op, resultIdx, indicesIdx);
 
   int64_t sourceSize = op.getShapedType().getDimSize(indicesIdx);
   int64_t vectorSize = op.getVectorType().getDimSize(resultIdx);
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index eda6a5cc40d999..7a31b010b88337 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -442,3 +442,26 @@ func.func @vec_0D(%arg0: vector<f32>) -> vector<i32> {
   %0 = vector.bitcast %arg0 : vector<f32> to vector<i32>
   return %0 : vector<i32>
 }
+
+// CHECK-LABEL: func @set_inbound
+//       CHECK:   scf.for
+//       CHECK:   vector.transfer_read  %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true]}
+//       CHECK:   vector.transfer_write %{{.*}}[%{{.*}}, %{{.*}}] {in_bounds = [true]}
+//       CHECK:   }
+//       CHECK:   return
+#map = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
+func.func @set_inbound(%arg0: index, %a : memref<100x100xi32>, %b : memref<100x100xi32>) {
+  %c0_i32 = arith.constant 0 : i32
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  scf.for %arg1 = %c0 to %arg0 step %c1 {
+    %2 = affine.apply #map()[%c0, %arg0, %c64]
+    scf.for %arg2 = %c0 to %2 step %c64 {
+      %3 = vector.transfer_read %a[%arg1, %arg2], %c0_i32 : memref<100x100xi32>, vector<64xi32>
+      %4 = arith.addi %3, %3 : vector<64xi32>
+      vector.transfer_write %4, %b[%arg1, %arg2] : vector<64xi32>, memref<100x100xi32>
+    }
+  }
+  return
+}

From dc0aab28638f4b0e41cb2f093052afc3e121dc7f Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Wed, 17 Apr 2024 18:42:35 +0100
Subject: [PATCH 2/3] Fix clang-format

---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index b233ad9786ce14..fa52d098a2be72 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4010,7 +4010,8 @@ static bool isLoopIterationAsMultipleOfStep(mlir::scf::ForOp forOp) {
 
 template <typename TransferOp>
 static bool isLoopIterationAsMultipleOfVectorSize(TransferOp op,
-            int64_t resultIdx, int64_t indicesIdx) {
+                                                  int64_t resultIdx,
+                                                  int64_t indicesIdx) {
   Value index = op.getIndices()[indicesIdx];
   auto forOp = dyn_cast<mlir::scf::ForOp>(op->getParentOp());
   if (!forOp)

From b7e404b528c12e0814f157bb39e34421ca2836a1 Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Fri, 19 Apr 2024 04:53:38 +0100
Subject: [PATCH 3/3] Set InBound if loop upper bound plus vector size is
 smaller than access boundary
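
With this revision the check no longer pattern-matches the peeled upper
bound. Instead, when the transfer index is the loop induction variable
and the loop upper bound is a constant, the access is considered in
bounds if upper_bound + vector_size <= source dim size. In the updated
test below: 128 + 64 = 192 <= 200, so in_bounds can be set.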

---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      | 47 ++++---------------
 .../Dialect/Vector/vector-transforms.mlir     | 20 ++++----
 2 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index fa52d098a2be72..9884f666cc2daf 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -3977,56 +3977,27 @@ Type TransferReadOp::getExpectedMaskType() {
   return inferTransferOpMaskType(getVectorType(), getPermutationMap());
 }
 
-// Determine the loop upper bound UB == s1 - ((s1 - LB) mod Step).
-// Then loop iteration = UB - LB = s1 - LB - ((s1 - LB) mod Step)
-// which is a multiple of the step.
-// The loop could be produced by -scf-for-loop-peeling.
-static bool isLoopIterationAsMultipleOfStep(mlir::scf::ForOp forOp) {
-  OpBuilder b(forOp.getContext());
-
-  auto UBOp = forOp.getUpperBound().getDefiningOp();
-  if (!UBOp)
-    return false;
-  auto applyOp = dyn_cast<mlir::affine::AffineApplyOp>(UBOp);
-  if (!applyOp)
-    return false;
-
-  SmallVector<Value> mapOps(applyOp.getMapOperands().begin(),
-                            applyOp.getMapOperands().end());
-  if (mapOps.size() != 3)
-    return false;
-  if (mapOps[0] != forOp.getLowerBound())
-    return false;
-  if (mapOps[2] != forOp.getStep())
-    return false;
-
-  auto UBExpr = applyOp.getAffineMap().getResult(0);
-  auto LBExpr = b.getAffineSymbolExpr(0);
-  auto sym1 = b.getAffineSymbolExpr(1);
-  auto stepExpr = b.getAffineSymbolExpr(2);
-
-  return UBExpr == (sym1 - (sym1 - LBExpr) % stepExpr);
-}
-
+// If the index of vector.transfer_read/write is the loop induction
+// variable, check that loop_upper_bound + vector_size <= source dim size.
 template <typename TransferOp>
-static bool isLoopIterationAsMultipleOfVectorSize(TransferOp op,
+static bool isLoopUpperBoundPlusVectorSizeInBound(TransferOp op,
                                                   int64_t resultIdx,
                                                   int64_t indicesIdx) {
   Value index = op.getIndices()[indicesIdx];
   auto forOp = dyn_cast<mlir::scf::ForOp>(op->getParentOp());
   if (!forOp)
     return false;
-  auto constantStep = forOp.getConstantStep();
-  if (!constantStep)
-    return false;
   if (index != forOp.getInductionVar())
     return false;
-  if (!isLoopIterationAsMultipleOfStep(forOp))
+  Value upperBound = forOp.getUpperBound();
+  std::optional<int64_t> cstUpper = getConstantIntValue(upperBound);
+  if (!cstUpper.has_value())
     return false;
 
+  int64_t sourceSize = op.getShapedType().getDimSize(indicesIdx);
   int64_t vectorSize = op.getVectorType().getDimSize(resultIdx);
 
-  return vectorSize == *constantStep;
+  return *cstUpper + vectorSize <= sourceSize;
 }
 
 template <typename TransferOp>
@@ -4038,7 +4009,7 @@ static bool isInBounds(TransferOp op, int64_t resultIdx, int64_t indicesIdx) {
   Value index = op.getIndices()[indicesIdx];
   std::optional<int64_t> cstOp = getConstantIntValue(index);
   if (!cstOp.has_value())
-    return isLoopIterationAsMultipleOfVectorSize(op, resultIdx, indicesIdx);
+    return isLoopUpperBoundPlusVectorSizeInBound(op, resultIdx, indicesIdx);
 
   int64_t sourceSize = op.getShapedType().getDimSize(indicesIdx);
   int64_t vectorSize = op.getVectorType().getDimSize(resultIdx);
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index 7a31b010b88337..512eb67cebafbf 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -445,23 +445,19 @@ func.func @vec_0D(%arg0: vector<f32>) -> vector<i32> {
 
 // CHECK-LABEL: func @set_inbound
 //       CHECK:   scf.for
-//       CHECK:   vector.transfer_read  %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true]}
-//       CHECK:   vector.transfer_write %{{.*}}[%{{.*}}, %{{.*}}] {in_bounds = [true]}
+//       CHECK:   vector.transfer_read  %{{.*}}[%{{.*}}], %{{.*}} {in_bounds = [true]}
+//       CHECK:   vector.transfer_write %{{.*}}[%{{.*}}] {in_bounds = [true]}
 //       CHECK:   }
 //       CHECK:   return
-#map = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
-func.func @set_inbound(%arg0: index, %a : memref<100x100xi32>, %b : memref<100x100xi32>) {
+func.func @set_inbound(%arg0: index, %a: memref<200xi32>, %b: memref<200xi32>) {
   %c0_i32 = arith.constant 0 : i32
   %c64 = arith.constant 64 : index
-  %c1 = arith.constant 1 : index
+  %c128 = arith.constant 128 : index
   %c0 = arith.constant 0 : index
-  scf.for %arg1 = %c0 to %arg0 step %c1 {
-    %2 = affine.apply #map()[%c0, %arg0, %c64]
-    scf.for %arg2 = %c0 to %2 step %c64 {
-      %3 = vector.transfer_read %a[%arg1, %arg2], %c0_i32 : memref<100x100xi32>, vector<64xi32>
-      %4 = arith.addi %3, %3 : vector<64xi32>
-      vector.transfer_write %4, %b[%arg1, %arg2] : vector<64xi32>, memref<100x100xi32>
-    }
+  scf.for %arg3 = %c0 to %c128 step %c64 {
+    %0 = vector.transfer_read %a[%arg3], %c0_i32 : memref<200xi32>, vector<64xi32>
+    %1 = arith.addi %0, %0 : vector<64xi32>
+    vector.transfer_write %1, %b[%arg3] : vector<64xi32>, memref<200xi32>
   }
   return
 }


