[Mlir-commits] [mlir] [mlir] Add optimization to bubbleUpPadSlice pattern for no pad case (PR #135859)
Nirvedh Meshram
llvmlistbot at llvm.org
Tue Apr 15 14:20:33 PDT 2025
https://github.com/nirvedhmeshram created https://github.com/llvm/llvm-project/pull/135859
In cases where a dim has no padding, we do not need to compute new offsets, lengths, and padding for it. For example, the new test case added here can be lowered to just
```
%extracted_slice = tensor.extract_slice %arg0[%arg2, 1, 2] [%arg2, 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
```
with this PR, whereas without it we produce affine maps like
```
#map = affine_map<()[s0] -> (3, s0)>
#map1 = affine_map<()[s0, s1] -> (-s0 + 3, s1)>
%0 = affine.min #map()[%arg2]
%1 = affine.min #map1()[%0, %arg2]
%extracted_slice = tensor.extract_slice %arg0[%0, 1, 2] [%1, 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
```
which are unnecessary.
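For context, the fast path lives inside `tensor::bubbleUpPadSlice`, whose signature the hunks below show only partially. Here is a minimal sketch of how a caller might drive it, assuming the usual upstream parameter list (offsets, sizes, and a zero-slice guard flag); `sliceOp` and `padOp` are hypothetical handles to the ops from the MLIR snippets above, not code from this patch:
```
// Minimal sketch, not from this patch: bubble a tensor.extract_slice
// above a tensor.pad. `sliceOp`/`padOp` are hypothetical handles.
OpBuilder b(padOp);
SmallVector<OpFoldResult> offsets = sliceOp.getMixedOffsets();
SmallVector<OpFoldResult> sizes = sliceOp.getMixedSizes();
// Assumed parameter list from the upstream declaration; the trailing flag
// guards against slices that read none of the original source.
FailureOr<TilingResult> result = tensor::bubbleUpPadSlice(
    b, padOp, offsets, sizes, /*generateZeroSliceGuard=*/true);
if (failed(result))
  return failure();
```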
From dacce41401cc15d2a86d2b79bc56aedb25442cbf Mon Sep 17 00:00:00 2001
From: Nirvedh <nirvedh at gmail.com>
Date: Tue, 15 Apr 2025 16:14:22 -0500
Subject: [PATCH] [mlir] Add optimization to bubbleUpPadSlice pattern for no
pad case
Signed-off-by: Nirvedh <nirvedh at gmail.com>
---
.../Tensor/IR/TensorTilingInterfaceImpl.cpp | 17 +++++++++++++----
.../Linalg/matmul-shared-memory-padding.mlir | 6 +++---
.../Dialect/Linalg/subtensor-of-padtensor.mlir | 17 +++++++++++++++++
3 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index 138e4be6b18e9..7778a02dbeaf4 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -122,7 +122,7 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
OpFoldResult zero = b.getIndexAttr(0);
// Compute new offsets, lengths, low padding, high padding.
- SmallVector<OpFoldResult> newOffsets, newLengths, newStrides;
+ SmallVector<OpFoldResult> newOffsets, newLengths;
SmallVector<OpFoldResult> newLows, newHighs;
// Set to true if the original data source is not read at all.
bool hasZeroLen = false;
@@ -131,6 +131,8 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
Value dynHasZeroLenCond;
int64_t rank = padOp.getSourceType().getRank();
+ // Only unit stride supported.
+ SmallVector<OpFoldResult> newStrides(rank, b.getIndexAttr(1));
for (unsigned dim = 0; dim < rank; ++dim) {
auto low = padOp.getMixedLowPad()[dim];
bool hasLowPad = !isConstantIntValue(low, 0);
@@ -138,6 +140,16 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
bool hasHighPad = !isConstantIntValue(high, 0);
auto offset = offsets[dim];
auto length = sizes[dim];
+ // If the dim has no padding, we don't need to compute new values for that
+ // dim, as the existing ones remain correct even after the pattern.
+ if (!hasLowPad && !hasHighPad) {
+ newOffsets.push_back(offset);
+ newLengths.push_back(length);
+ newLows.push_back(low);
+ newHighs.push_back(high);
+ continue;
+ }
+
auto srcSize = tensor::getMixedSize(b, loc, padOp.getSource(), dim);
// The new amount of low padding is `low - offset`. Except for the case
@@ -216,9 +228,6 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
OpFoldResult newHigh =
hasHighPad ? sub(sub(length, newLength), newLow) : zero;
newHighs.push_back(newHigh);
-
- // Only unit stride supported.
- newStrides.push_back(b.getIndexAttr(1));
}
// The shape of the result can be obtained from the sizes passed in.
diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
index d6c400dcbf2b9..6cab25b50460d 100644
--- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
+++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
@@ -7,17 +7,17 @@
// CHECK: scf.for
// CHECK: memref.alloc() : memref<128x16xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<16x128xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<128x128xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK-NOT: mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: linalg.matmul
diff --git a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
index b4417641c9f83..d43b9a7ac6c04 100644
--- a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
+++ b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
@@ -216,3 +216,20 @@ func.func @dynamic_zero_high_padding(%arg0 : tensor<?x?xf32>, %pad : f32,
%1 = tensor.extract_slice %0[%o1, %o2] [%s1, %s2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
return %1 : tensor<?x?xf32>
}
+
+// -----
+// CHECK-LABEL: @nopaddim_with_dynamic_extract(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<3x4x5xf32>
+// CHECK-SAME: %[[ARG1:.*]]: f32
+// CHECK-SAME: %[[ARG2:.*]]: index
+// CHECK: %[[RESULT:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 1, 2] [%[[ARG2]], 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
+// CHECK: return %[[RESULT]]
+func.func @nopaddim_with_dynamic_extract(%arg0 : tensor<3x4x5xf32>, %pad : f32, %index : index)
+ -> tensor<?x2x1xf32> {
+ %0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 8] {
+ ^bb0(%arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %pad : f32
+ } : tensor<3x4x5xf32> to tensor<3x11x13xf32>
+ %1 = tensor.extract_slice %0[%index, 1, 2] [%index, 2, 1] [1, 1, 1] : tensor<3x11x13xf32> to tensor<?x2x1xf32>
+ return %1 : tensor<?x2x1xf32>
+}
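The updated test file can be exercised with the usual lit driver; the path below assumes a standard build directory named `build` and is illustrative, not something this patch adds:
```
build/bin/llvm-lit -v mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
```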