[Mlir-commits] [mlir] [mlir][Tensor] Add rank-reducing slice in generatedSlices (PR #174248)

Tue Jan 6 09:01:09 PST 2026

https://github.com/bangtianliu updated https://github.com/llvm/llvm-project/pull/174248

>From 51fff993d557b3f79de40b247526f060bb4ec361 Mon Sep 17 00:00:00 2001
From: Bangtian Liu <liubangtian at gmail.com>
Date: Fri, 2 Jan 2026 14:54:43 -0800
Subject: [PATCH 1/3] [mlir][Tensor] Add rank-reducing slice in generatedSlices

Signed-off-by: Bangtian Liu <liubangtian at gmail.com>
---
 .../Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp
index 549ac7afca8ca..7903f3c51b73b 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp
@@ -53,6 +53,7 @@ FailureOr<TilingResult> tensor::replaceExtractSliceWithTiledProducer(
         builder, sliceOp.getLoc(), sliceOp.getType(),
         tiledResult->tiledValues[0], offsets, sliceOp.getMixedSizes(), strides);
     tiledResult->tiledValues[0] = newSliceOp;
+    tiledResult->generatedSlices.push_back(newSliceOp);
   }
 
   return *tiledResult;

>From 252732345b3a3fa1a19ed118d9687b7ffe4331a0 Mon Sep 17 00:00:00 2001
From: Bangtian Liu <liubangtian at gmail.com>
Date: Fri, 2 Jan 2026 17:27:51 -0800
Subject: [PATCH 2/3] fix timeout issue

Signed-off-by: Bangtian Liu <liubangtian at gmail.com>
---
 .../Dialect/SCF/Transforms/TileUsingInterface.cpp   | 13 ++++++++++++-
 .../tile-and-fuse-with-reduction-tiling.mlir        |  5 ++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 009c2c3537411..33960093d51c6 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1327,7 +1327,18 @@ getUntiledProducerFromSliceSource(OpOperand *source,
   }
   if (loopIt == loops.rend())
     destinationIterArg = source;
-  return {dyn_cast<OpResult>(source->get()), destinationIterArg};
+
+  OpResult result = dyn_cast<OpResult>(source->get());
+  if (result) {
+    Operation *producer = result.getOwner();
+    Operation *innermostLoop = loops.back();
+    // If the producer is already inside the innermost loop (where the slice
+    // is), it was already fused. Skip it so the fusion worklist terminates.
+    if (innermostLoop->isProperAncestor(producer))
+      return {OpResult(), std::nullopt};
+  }
+
+  return {result, destinationIterArg};
 }
 
 /// Implementation of fusing producer of a single slice by computing the
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-with-reduction-tiling.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-with-reduction-tiling.mlir
index 8cace28d441c6..62c82a15a5417 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-with-reduction-tiling.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-with-reduction-tiling.mlir
@@ -1,6 +1,9 @@
 // RUN: mlir-opt -transform-interpreter -cse -mlir-print-local-scope -split-input-file -verify-diagnostics %s | FileCheck %s
 
-// Check tile+ fuse works with partial reduction outer parallel strategy.
+// Check tile + fuse works with partial reduction outer parallel strategy.
+// This also tests that the fusion logic correctly skips producers that are
+// already inside the innermost loop (e.g., the rank-reducing slice of the
+// fused fill), avoiding infinite loops in the fusion worklist.
 
 module{
   func.func @tile_and_fuse_with_partial_reduction_outer_parallel(

>From 792ac2e4a0de5ab57996b33ae63c38551dc3fcdf Mon Sep 17 00:00:00 2001
From: Bangtian Liu <liubangtian at gmail.com>
Date: Tue, 6 Jan 2026 09:05:32 -0800
Subject: [PATCH 3/3] add a test about rank-reducing slices

Signed-off-by: Bangtian Liu <liubangtian at gmail.com>
---
 .../tile-and-fuse-using-interface.mlir        | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
index 21d7816934bf9..aeb65ecef61c1 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
@@ -675,3 +675,65 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-SAME:           ins(%[[TILEDARG0]]
 //  CHECK-SAME:           outs(%[[TILEDARG1]]
 //       CHECK:           tensor.insert_slice %[[RES:.*]]
+
+// -----
+
+// Test that tile-and-fuse correctly handles rank-reducing extract_slice operations.
+//
+// The rank-reducing slice created during fusion is added to generatedSlices
+// (via SwapExtractSliceWithProducerPatterns.cpp). Note: this does NOT enable
+// additional fusion with the default cleanup patterns: the slice's source is
+// the tiled producer, which is already inside the loop, so the isProperAncestor
+// check skips it. The tracking lets cleanup patterns like SwapExtractSliceWithFillPatterns
+// (when added to cleanupPatterns) transform the slice during tile-and-fuse.
+
+func.func @fuse_through_rank_reducing_slice(
+    %arg0: tensor<4x96xf16>) -> tensor<4x96xf16> {
+  %cst = arith.constant 1.0 : f16
+
+  // Producer: fill on 3D tensor with unit dimension
+  %empty_3d = tensor.empty() : tensor<4x1x96xf16>
+  %fill = linalg.fill ins(%cst : f16) outs(%empty_3d : tensor<4x1x96xf16>) -> tensor<4x1x96xf16>
+
+  // Rank-reducing slice: 3D (4x1x96) -> 2D (4x96), dropping the unit dimension
+  %reduced = tensor.extract_slice %fill[0, 0, 0] [4, 1, 96] [1, 1, 1]
+      : tensor<4x1x96xf16> to tensor<4x96xf16>
+
+  // Consumer: 2D operation
+  %result = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%reduced : tensor<4x96xf16>)
+      outs(%arg0 : tensor<4x96xf16>) {
+    ^bb0(%in: f16, %out: f16):
+      %sum = arith.addf %in, %out : f16
+      linalg.yield %sum : f16
+  } -> tensor<4x96xf16>
+
+  return %result : tensor<4x96xf16>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %consumer = transform.structured.match ops{["linalg.generic"]} in %arg0
+        : (!transform.any_op) -> !transform.any_op
+
+    %tiled, %loop = transform.structured.fuse %consumer tile_sizes [0, 32] {apply_cleanup}
+        : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">)
+    transform.yield
+  }
+}
+// CHECK-LABEL: func.func @fuse_through_rank_reducing_slice
+// CHECK-SAME:      %[[ARG0:[a-zA-Z0-9_]+]]: tensor<4x96xf16>
+// CHECK-DAG: %[[EMPTY_3D:.+]] = tensor.empty() : tensor<4x1x96xf16>
+// CHECK: scf.for %[[IV:[a-zA-Z0-9_]+]] = {{.*}} iter_args(%[[ITERARG:.+]] = %[[ARG0]])
+// CHECK: %[[FILL_DEST:.+]] = tensor.extract_slice %[[EMPTY_3D]][0, 0, %[[IV]]] [4, 1, 32]
+// CHECK-SAME: tensor<4x1x96xf16> to tensor<4x1x32xf16>
+// CHECK: %[[TILED_FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[FILL_DEST]] : tensor<4x1x32xf16>)
+// CHECK: %[[RANK_REDUCED:.+]] = tensor.extract_slice %[[TILED_FILL]][0, 0, 0] [4, 1, 32]
+// CHECK-SAME: tensor<4x1x32xf16> to tensor<4x32xf16>
+// CHECK: %[[CONSUMER_DEST:.+]] = tensor.extract_slice %[[ITERARG]]
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[RANK_REDUCED]] : tensor<4x32xf16>)
+// CHECK-SAME: outs(%[[CONSUMER_DEST]] : tensor<4x32xf16>)