[Mlir-commits] [mlir] [mlir][vector] Set InBound for vector read write after peeling (PR #89108)

Wed Apr 17 10:36:11 PDT 2024

https://github.com/ShivaChen created https://github.com/llvm/llvm-project/pull/89108

Set the in_bounds attribute on vector.transfer_read/write when the loop's iteration range is a multiple of the vector size.

In this case, the number of effective vector elements in the last iteration is equal to the vector size, so setting the attribute avoids generating a mask.

The loop could be produced by -scf-for-loop-peeling.

>From a3fed33d46aa0f358a1e2f851e369c4d3eee156d Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Wed, 17 Apr 2024 17:22:24 +0100
Subject: [PATCH] [mlir][vector] Set InBound for vector read write after
 peeling

Set the in_bounds attribute on vector.transfer_read/write when the
loop's iteration range is a multiple of the vector size.

In this case, the number of effective vector elements in the last
iteration is equal to the vector size, so setting the attribute
avoids generating a mask.

The loop could be produced by -scf-for-loop-peeling.
---
 mlir/lib/Dialect/Vector/IR/CMakeLists.txt     |  1 +
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      | 55 ++++++++++++++++++-
 .../Dialect/Vector/vector-transforms.mlir     | 23 ++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Vector/IR/CMakeLists.txt b/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
index 204462ffd047c6..95d31b9d8639cc 100644
--- a/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
@@ -24,6 +24,7 @@ add_mlir_dialect_library(MLIRVectorDialect
   MLIRMaskingOpInterface
   MLIRMemRefDialect
   MLIRSideEffectInterfaces
+  MLIRSCFDialect
   MLIRTensorDialect
   MLIRValueBoundsOpInterface
   MLIRVectorInterfaces
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 3e6425879cc67f..b233ad9786ce14 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -13,11 +13,13 @@
 
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StructuredOpsUtils.h"
@@ -3975,6 +3977,57 @@ Type TransferReadOp::getExpectedMaskType() {
   return inferTransferOpMaskType(getVectorType(), getPermutationMap());
 }
 
+// Returns true if forOp's upper bound is produced by an affine.apply computing
+// UB == s1 - ((s1 - LB) mod Step), where LB and Step are forOp's own lower
+// bound and step. Then UB - LB = (s1 - LB) - ((s1 - LB) mod Step), a multiple of Step.
+// Such a loop could be produced by -scf-for-loop-peeling.
+static bool isLoopIterationAsMultipleOfStep(mlir::scf::ForOp forOp) {
+  OpBuilder b(forOp.getContext());
+  // The upper bound must be defined by an affine.apply op.
+  auto UBOp = forOp.getUpperBound().getDefiningOp();
+  if (!UBOp)
+    return false;
+  auto applyOp = dyn_cast<mlir::affine::AffineApplyOp>(UBOp);
+  if (!applyOp)
+    return false;
+  // Exactly three map operands; operands 0 and 2 must be this loop's LB and step.
+  SmallVector<Value> mapOps(applyOp.getMapOperands().begin(),
+                            applyOp.getMapOperands().end());
+  if (mapOps.size() != 3)
+    return false;
+  if (mapOps[0] != forOp.getLowerBound())
+    return false;
+  if (mapOps[2] != forOp.getStep())
+    return false;
+  // Build the expected expression over symbols s0 (LB), s1, and s2 (step).
+  auto UBExpr = applyOp.getAffineMap().getResult(0);
+  auto LBExpr = b.getAffineSymbolExpr(0);
+  auto sym1 = b.getAffineSymbolExpr(1);
+  auto stepExpr = b.getAffineSymbolExpr(2);
+  // Accept only if the map's first result equals s1 - ((s1 - s0) mod s2).
+  return UBExpr == (sym1 - (sym1 - LBExpr) % stepExpr);
+}
+// Returns true if op's index at indicesIdx is the IV of a qualifying parent scf.for.
+template <typename TransferOp>
+static bool isLoopIterationAsMultipleOfVectorSize(TransferOp op,
+            int64_t resultIdx, int64_t indicesIdx) {
+  Value index = op.getIndices()[indicesIdx];
+  auto forOp = dyn_cast<mlir::scf::ForOp>(op->getParentOp());
+  if (!forOp)
+    return false;
+  auto constantStep = forOp.getConstantStep();
+  if (!constantStep)
+    return false;
+  if (index != forOp.getInductionVar())
+    return false;
+  if (!isLoopIterationAsMultipleOfStep(forOp))
+    return false;
+  // NOTE(review): no source dimension size is consulted here — confirm in_bounds is really guaranteed.
+  int64_t vectorSize = op.getVectorType().getDimSize(resultIdx);
+  // The loop's constant step must equal the vector size along resultIdx.
+  return vectorSize == *constantStep;
+}
+
 template <typename TransferOp>
 static bool isInBounds(TransferOp op, int64_t resultIdx, int64_t indicesIdx) {
   // TODO: support more aggressive createOrFold on:
@@ -3984,7 +4037,7 @@ static bool isInBounds(TransferOp op, int64_t resultIdx, int64_t indicesIdx) {
   Value index = op.getIndices()[indicesIdx];
   std::optional<int64_t> cstOp = getConstantIntValue(index);
   if (!cstOp.has_value())
-    return false;
+    return isLoopIterationAsMultipleOfVectorSize(op, resultIdx, indicesIdx);
 
   int64_t sourceSize = op.getShapedType().getDimSize(indicesIdx);
   int64_t vectorSize = op.getVectorType().getDimSize(resultIdx);
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index eda6a5cc40d999..7a31b010b88337 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -442,3 +442,26 @@ func.func @vec_0D(%arg0: vector<f32>) -> vector<i32> {
   %0 = vector.bitcast %arg0 : vector<f32> to vector<i32>
   return %0 : vector<i32>
 }
+
+// CHECK-LABEL: func @set_inbound
+//       CHECK:   scf.for
+//       CHECK:   vector.transfer_read  %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true]}
+//       CHECK:   vector.transfer_write %{{.*}}[%{{.*}}, %{{.*}}] {in_bounds = [true]}
+//       CHECK:   }
+//       CHECK:   return
+#map = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
+func.func @set_inbound(%arg0: index, %a : memref<100x100xi32>, %b : memref<100x100xi32>) {
+  %c0_i32 = arith.constant 0 : i32
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  scf.for %arg1 = %c0 to %arg0 step %c1 {
+    %2 = affine.apply #map()[%c0, %arg0, %c64] // s0 = inner lower bound, s1 = %arg0, s2 = inner step
+    scf.for %arg2 = %c0 to %2 step %c64 { // step equals the vector<64xi32> length
+      %3 = vector.transfer_read %a[%arg1, %arg2], %c0_i32 : memref<100x100xi32>, vector<64xi32> // NOTE(review): %arg0 is not bounded by the dim size 100 — confirm
+      %4 = arith.addi %3, %3 : vector<64xi32>
+      vector.transfer_write %4, %b[%arg1, %arg2] : vector<64xi32>, memref<100x100xi32>
+    }
+  }
+  return
+}