[Mlir-commits] [mlir] [MLIR][Transforms] Fix two bugs in loop-invariant-subset-hoisting (PR #188761)

Fri Mar 27 04:10:35 PDT 2026

https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/188761

>From a663a425b6cb8b8c389ff779cf27eed3ef5e1ab0 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 26 Mar 2026 07:33:18 -0700
Subject: [PATCH] [MLIR][Transforms] Fix two bugs in
 loop-invariant-subset-hoisting

Fix two issues in `MatchingSubsets::populateSubsetOpsAtIterArg`:

1. The `collectHoistableOps` parameter was declared but never used when
   inserting subset ops via `insert(subsetOp)`. As a result, when
   recursing into nested loops with `collectHoistableOps=false`, the
   nested loop's subset ops were incorrectly added to the hoistable
   extraction/insertion pairs of the parent loop. This caused spurious
   failures in the `allDisjoint` check, preventing valid hoisting when
   nested loop ops overlapped with outer loop ops.
   Fix by passing the parameter: `insert(subsetOp, collectHoistableOps)`.

2. In the nested loop handling branch, there was no guard to detect when
   a value has multiple nested loop uses (i.e., is used as an init arg
   in more than one nested loop). Without the guard, `nextValue` would be
   silently overwritten, leading to an incorrect use-def chain traversal.
   Add `if (nextValue) return failure()` before setting `nextValue` for
   the nested loop case, mirroring the existing guard for insertion ops.

Fixes #147096

Assisted-by: Claude Code
---
 .../Utils/LoopInvariantCodeMotionUtils.cpp    |  6 +-
 .../Transform/test-loop-transforms.mlir       | 57 +++++++++++++++++++
 .../loop-invariant-subset-hoisting.mlir       | 47 +++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp b/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
index 5f3b04a6d8bce..aef81bf2e15a3 100644
--- a/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
@@ -271,6 +271,10 @@ MatchingSubsets::populateSubsetOpsAtIterArg(LoopLikeOpInterface loopLike,
         if (failed(populateSubsetOpsAtIterArg(nestedLoop, nestedIterArg,
                                               /*collectHoistableOps=*/false)))
           return failure();
+        // There must be a single use-def chain. Bail if there are multiple
+        // nested loops using this value.
+        if (nextValue)
+          return failure();
         nextValue = nestedLoop.getTiedLoopResult(&use);
         continue;
       }
@@ -278,7 +282,7 @@ MatchingSubsets::populateSubsetOpsAtIterArg(LoopLikeOpInterface loopLike,
       auto subsetOp = dyn_cast<SubsetOpInterface>(use.getOwner());
       if (!subsetOp)
         return failure();
-      insert(subsetOp);
+      insert(subsetOp, collectHoistableOps);
 
       if (auto insertionOp =
               dyn_cast<SubsetInsertionOpInterface>(use.getOwner())) {
diff --git a/mlir/test/Dialect/Transform/test-loop-transforms.mlir b/mlir/test/Dialect/Transform/test-loop-transforms.mlir
index 833dd738f79ed..475be0169dc66 100644
--- a/mlir/test/Dialect/Transform/test-loop-transforms.mlir
+++ b/mlir/test/Dialect/Transform/test-loop-transforms.mlir
@@ -54,6 +54,63 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// Regression test for `collectHoistableOps` not being passed to `insert()` in
+// `populateSubsetOpsAtIterArg`. Without the fix, when recursing into nested
+// loops, inner loop subset ops were incorrectly added to the outer loop's
+// hoistable extraction/insertion pairs. If the outer loop were processed before
+// the inner loop (pre-order), this would cause ops inside the inner loop to be
+// moved across region boundaries, producing invalid IR.
+//
+// This test uses `transform.split_handle` to process ONLY the outer loop,
+// simulating pre-order processing. With the fix, inner loop ops are only
+// collected in `allSubsetOps` (for disjointness checking) but NOT added to
+// the hoistable extractions/insertions of the outer loop. Thus only the outer
+// [0][5][1] pair is hoisted, and the inner [5][5][1] pair remains untouched.
+
+// CHECK-LABEL: func @hoist_outer_loop_only(
+//  CHECK-SAME:     %[[arg:.*]]: tensor<?xf32>
+func.func @hoist_outer_loop_only(%arg: tensor<?xf32>) -> tensor<?xf32> {
+  %lb = "test.foo"() : () -> (index)
+  %ub = "test.foo"() : () -> (index)
+  %step = "test.foo"() : () -> (index)
+
+  // Only the outer [0][5][1] pair is hoisted. The inner [5][5][1] pair stays
+  // inside the inner loop.
+  // CHECK: %[[slice:.*]] = tensor.extract_slice %[[arg]][0] [5] [1]
+  // CHECK: %[[outer:.*]]:2 = scf.for {{.*}} iter_args(%[[t:.*]] = %[[arg]], %[[hs:.*]] = %[[slice]])
+  // CHECK:   %[[inner:.*]] = scf.for {{.*}} iter_args(%[[t2:.*]] = %[[t]])
+  // CHECK:     tensor.extract_slice %[[t2]][5] [5] [1]
+  // CHECK:     tensor.insert_slice
+  %0 = scf.for %iv = %lb to %ub step %step iter_args(%t = %arg) -> (tensor<?xf32>) {
+    %1 = tensor.extract_slice %t[0][5][1] : tensor<?xf32> to tensor<5xf32>
+    %2 = "test.foo"(%1) : (tensor<5xf32>) -> (tensor<5xf32>)
+    %3 = tensor.insert_slice %2 into %t[0][5][1] : tensor<5xf32> into tensor<?xf32>
+    %4 = scf.for %iv2 = %lb to %ub step %step iter_args(%t2 = %3) -> (tensor<?xf32>) {
+      %5 = tensor.extract_slice %t2[5][5][1] : tensor<?xf32> to tensor<5xf32>
+      %6 = "test.foo"(%5) : (tensor<5xf32>) -> (tensor<5xf32>)
+      %7 = tensor.insert_slice %6 into %t2[5][5][1] : tensor<5xf32> into tensor<?xf32>
+      scf.yield %7 : tensor<?xf32>
+    }
+    scf.yield %4 : tensor<?xf32>
+  }
+  return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    // structured.match returns loops innermost-first: %inner, then %outer.
+    %loops = transform.structured.match ops{["scf.for"]} in %arg0
+        : (!transform.any_op) -> !transform.any_op
+    // Split to get individual handles; process only the outer loop.
+    %inner, %outer = transform.split_handle %loops
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.loop.hoist_loop_invariant_subsets %outer : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 // Checks that transform ops from LoopExtensionOps and SCFTransformOps can be
 // used together.
 
diff --git a/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir b/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir
index c720b3638fad5..57e9f7e772373 100644
--- a/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir
+++ b/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir
@@ -598,6 +598,53 @@ func.func @hoist_vector_transfer_write_pairs_disjoint_tensor(
 
 // -----
 
+// Regression test for missing `if (nextValue) return failure()` guard in the
+// nested loop branch of `populateSubsetOpsAtIterArg`. When a single value (%3)
+// is used as the init arg in two separate nested loops, nextValue was silently
+// overwritten by the second nested loop encountered during use-def traversal,
+// causing the analysis to incorrectly succeed and hoist the extract/insert pair.
+//
+// MLIR prepends uses (LIFO), so the LAST-created user appears FIRST in the
+// use-list. Here loop2 is created first (source order), loop1 second. Thus
+// loop1 is iterated first: nextValue is set to %r1. Without the fix, the second
+// iteration (loop2) overwrites nextValue to %r2. Since %r2 IS yielded for %t,
+// the analysis wrongly succeeds and hoists the extraction. With the fix, the
+// guard `if (nextValue) return failure()` fires when loop2 is encountered after
+// loop1 has already set nextValue, correctly preventing the hoisting.
+
+// CHECK-LABEL: func @do_not_hoist_multiple_nested_loop_uses(
+func.func @do_not_hoist_multiple_nested_loop_uses(%arg: tensor<?xf32>)
+    -> tensor<?xf32> {
+  %lb = "test.foo"() : () -> (index)
+  %ub = "test.foo"() : () -> (index)
+  %step = "test.foo"() : () -> (index)
+
+  // The extraction must NOT be hoisted: it stays inside the outer for loop.
+  // CHECK: scf.for
+  // CHECK: tensor.extract_slice
+  // CHECK: tensor.insert_slice
+  %0 = scf.for %iv = %lb to %ub step %step iter_args(%t = %arg) -> tensor<?xf32> {
+    %1 = tensor.extract_slice %t[0][5][1] : tensor<?xf32> to tensor<5xf32>
+    %2 = "test.foo"(%1) : (tensor<5xf32>) -> (tensor<5xf32>)
+    %3 = tensor.insert_slice %2 into %t[0][5][1] : tensor<5xf32> into tensor<?xf32>
+    // loop2 is created first (source order) → its use of %3 is at the BACK of
+    // the use-list → iterated SECOND during populateSubsetOpsAtIterArg.
+    %r2 = scf.for %iv3 = %lb to %ub step %step iter_args(%b = %3) -> tensor<?xf32> {
+      scf.yield %b : tensor<?xf32>
+    }
+    // loop1 is created second → prepended to the FRONT of the use-list →
+    // iterated FIRST: nextValue is set to %r1. Without the fix, iterating loop2
+    // next overwrites nextValue to %r2 (which is yielded), so the analysis
+    // incorrectly succeeds. With the fix, the guard fires here and returns
+    // failure, correctly preventing hoisting.
+    %r1 = scf.for %iv2 = %lb to %ub step %step iter_args(%a = %3) -> tensor<?xf32> {
+      scf.yield %a : tensor<?xf32>
+    }
+    scf.yield %r2 : tensor<?xf32>
+  }
+  return %0 : tensor<?xf32>
+}
+
 // Ensure that cases with buffer semantics exit gracefully.
 
 // CHECK-LABEL: @hoist_buffer