[Mlir-commits] [mlir] [MLIR][Transforms] Fix two bugs in loop-invariant-subset-hoisting (PR #188761)
Mehdi Amini
llvmlistbot at llvm.org
Fri Mar 27 04:10:35 PDT 2026
https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/188761
>From a663a425b6cb8b8c389ff779cf27eed3ef5e1ab0 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 26 Mar 2026 07:33:18 -0700
Subject: [PATCH] [MLIR][Transforms] Fix two bugs in
loop-invariant-subset-hoisting
Fix two issues in `MatchingSubsets::populateSubsetOpsAtIterArg`:
1. The `collectHoistableOps` parameter was declared but never used when
inserting subset ops via `insert(subsetOp)`. As a result, when
recursing into nested loops with `collectHoistableOps=false`, the
nested loop's subset ops were incorrectly added to the hoistable
extraction/insertion pairs of the parent loop. This caused spurious
failures in the `allDisjoint` check, preventing valid hoisting when
nested loop ops overlapped with outer loop ops.
Fix by passing the parameter: `insert(subsetOp, collectHoistableOps)`.
2. In the nested loop handling branch, there was no guard to detect when
a value has multiple nested loop uses (i.e., is used as an init arg
in more than one nested loop). Without the guard, `nextValue` would be
silently overwritten, leading to an incorrect use-def chain traversal.
Add `if (nextValue) return failure()` before setting `nextValue` for
the nested loop case, mirroring the existing guard for insertion ops.
Fixes #147096
Assisted-by: Claude Code
---
.../Utils/LoopInvariantCodeMotionUtils.cpp | 6 +-
.../Transform/test-loop-transforms.mlir | 57 +++++++++++++++++++
.../loop-invariant-subset-hoisting.mlir | 47 +++++++++++++++
3 files changed, 109 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp b/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
index 5f3b04a6d8bce..aef81bf2e15a3 100644
--- a/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
@@ -271,6 +271,10 @@ MatchingSubsets::populateSubsetOpsAtIterArg(LoopLikeOpInterface loopLike,
if (failed(populateSubsetOpsAtIterArg(nestedLoop, nestedIterArg,
/*collectHoistableOps=*/false)))
return failure();
+ // There must be a single use-def chain. Bail if there are multiple
+ // nested loops using this value.
+ if (nextValue)
+ return failure();
nextValue = nestedLoop.getTiedLoopResult(&use);
continue;
}
@@ -278,7 +282,7 @@ MatchingSubsets::populateSubsetOpsAtIterArg(LoopLikeOpInterface loopLike,
auto subsetOp = dyn_cast<SubsetOpInterface>(use.getOwner());
if (!subsetOp)
return failure();
- insert(subsetOp);
+ insert(subsetOp, collectHoistableOps);
if (auto insertionOp =
dyn_cast<SubsetInsertionOpInterface>(use.getOwner())) {
diff --git a/mlir/test/Dialect/Transform/test-loop-transforms.mlir b/mlir/test/Dialect/Transform/test-loop-transforms.mlir
index 833dd738f79ed..475be0169dc66 100644
--- a/mlir/test/Dialect/Transform/test-loop-transforms.mlir
+++ b/mlir/test/Dialect/Transform/test-loop-transforms.mlir
@@ -54,6 +54,63 @@ module attributes {transform.with_named_sequence} {
// -----
+// Regression test for `collectHoistableOps` not being passed to `insert()` in
+// `populateSubsetOpsAtIterArg`. Without the fix, when recursing into nested
+// loops, inner loop subset ops were incorrectly added to the outer loop's
+// hoistable extraction/insertion pairs. If the outer loop were processed before
+// the inner loop (pre-order), this would cause ops inside the inner loop to be
+// moved across region boundaries, producing invalid IR.
+//
+// This test uses `transform.split_handle` to process ONLY the outer loop,
+// simulating pre-order processing. With the fix, inner loop ops are only
+// collected in `allSubsetOps` (for disjointness checking) but NOT added to
+// the hoistable extractions/insertions of the outer loop. Thus only the outer
+// [0][5][1] pair is hoisted, and the inner [5][5][1] pair remains untouched.
+
+// CHECK-LABEL: func @hoist_outer_loop_only(
+// CHECK-SAME: %[[arg:.*]]: tensor<?xf32>
+func.func @hoist_outer_loop_only(%arg: tensor<?xf32>) -> tensor<?xf32> {
+ %lb = "test.foo"() : () -> (index)
+ %ub = "test.foo"() : () -> (index)
+ %step = "test.foo"() : () -> (index)
+
+ // Only the outer [0][5][1] pair is hoisted. The inner [5][5][1] pair stays
+ // inside the inner loop.
+ // CHECK: %[[slice:.*]] = tensor.extract_slice %[[arg]][0] [5] [1]
+ // CHECK: %[[outer:.*]]:2 = scf.for {{.*}} iter_args(%[[t:.*]] = %[[arg]], %[[hs:.*]] = %[[slice]])
+ // CHECK: %[[inner:.*]] = scf.for {{.*}} iter_args(%[[t2:.*]] = %[[t]])
+ // CHECK: tensor.extract_slice %[[t2]][5] [5] [1]
+ // CHECK: tensor.insert_slice
+ %0 = scf.for %iv = %lb to %ub step %step iter_args(%t = %arg) -> (tensor<?xf32>) {
+ %1 = tensor.extract_slice %t[0][5][1] : tensor<?xf32> to tensor<5xf32>
+ %2 = "test.foo"(%1) : (tensor<5xf32>) -> (tensor<5xf32>)
+ %3 = tensor.insert_slice %2 into %t[0][5][1] : tensor<5xf32> into tensor<?xf32>
+ %4 = scf.for %iv2 = %lb to %ub step %step iter_args(%t2 = %3) -> (tensor<?xf32>) {
+ %5 = tensor.extract_slice %t2[5][5][1] : tensor<?xf32> to tensor<5xf32>
+ %6 = "test.foo"(%5) : (tensor<5xf32>) -> (tensor<5xf32>)
+ %7 = tensor.insert_slice %6 into %t2[5][5][1] : tensor<5xf32> into tensor<?xf32>
+ scf.yield %7 : tensor<?xf32>
+ }
+ scf.yield %4 : tensor<?xf32>
+ }
+ return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ // structured.match returns loops innermost-first: %inner, then %outer.
+ %loops = transform.structured.match ops{["scf.for"]} in %arg0
+ : (!transform.any_op) -> !transform.any_op
+ // Split to get individual handles; process only the outer loop.
+ %inner, %outer = transform.split_handle %loops
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.loop.hoist_loop_invariant_subsets %outer : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
// Checks that transform ops from LoopExtensionOps and SCFTransformOps can be
// used together.
diff --git a/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir b/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir
index c720b3638fad5..57e9f7e772373 100644
--- a/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir
+++ b/mlir/test/Transforms/loop-invariant-subset-hoisting.mlir
@@ -598,6 +598,53 @@ func.func @hoist_vector_transfer_write_pairs_disjoint_tensor(
// -----
+// Regression test for missing `if (nextValue) return failure()` guard in the
+// nested loop branch of `populateSubsetOpsAtIterArg`. When a single value (%3)
+// is used as the init arg in two separate nested loops, nextValue was silently
+// overwritten by the second nested loop encountered during use-def traversal,
+// causing the analysis to incorrectly succeed and hoist the extract/insert pair.
+//
+// MLIR prepends uses (LIFO), so the LAST-created user appears FIRST in the
+// use-list. Here loop2 is created first (source order), loop1 second. Thus
+// loop1 is iterated first: nextValue is set to %r1. Without the fix, the second
+// iteration (loop2) overwrites nextValue to %r2. Since %r2 IS yielded for %t,
+// the analysis wrongly succeeds and hoists the extraction. With the fix, the
+// guard `if (nextValue) return failure()` fires when loop2 is encountered after
+// loop1 has already set nextValue, correctly preventing the hoisting.
+
+// CHECK-LABEL: func @do_not_hoist_multiple_nested_loop_uses(
+func.func @do_not_hoist_multiple_nested_loop_uses(%arg: tensor<?xf32>)
+ -> tensor<?xf32> {
+ %lb = "test.foo"() : () -> (index)
+ %ub = "test.foo"() : () -> (index)
+ %step = "test.foo"() : () -> (index)
+
+ // The extraction must NOT be hoisted: it stays inside the outer for loop.
+ // CHECK: scf.for
+ // CHECK: tensor.extract_slice
+ // CHECK: tensor.insert_slice
+ %0 = scf.for %iv = %lb to %ub step %step iter_args(%t = %arg) -> tensor<?xf32> {
+ %1 = tensor.extract_slice %t[0][5][1] : tensor<?xf32> to tensor<5xf32>
+ %2 = "test.foo"(%1) : (tensor<5xf32>) -> (tensor<5xf32>)
+ %3 = tensor.insert_slice %2 into %t[0][5][1] : tensor<5xf32> into tensor<?xf32>
+ // loop2 is created first (source order) → its use of %3 is at the BACK of
+ // the use-list → iterated SECOND during populateSubsetOpsAtIterArg.
+ %r2 = scf.for %iv3 = %lb to %ub step %step iter_args(%b = %3) -> tensor<?xf32> {
+ scf.yield %b : tensor<?xf32>
+ }
+ // loop1 is created second → prepended to the FRONT of the use-list →
+ // iterated FIRST: nextValue is set to %r1. Without the fix, iterating loop2
+ // next overwrites nextValue to %r2 (which is yielded), so the analysis
+ // incorrectly succeeds. With the fix, the guard fires here and returns
+ // failure, correctly preventing hoisting.
+ %r1 = scf.for %iv2 = %lb to %ub step %step iter_args(%a = %3) -> tensor<?xf32> {
+ scf.yield %a : tensor<?xf32>
+ }
+ scf.yield %r2 : tensor<?xf32>
+ }
+ return %0 : tensor<?xf32>
+}
+
// Ensure that cases with buffer semantics exit gracefully.
// CHECK-LABEL: @hoist_buffer
More information about the Mlir-commits
mailing list