[flang-commits] [flang] 3098b4d - [flang] Added LICM hoisting for nested regions. (#190696)

Tue Apr 7 11:41:05 PDT 2026

Author: Slava Zakharin
Date: 2026-04-07T11:41:00-07:00
New Revision: 3098b4da81c254b40f6bc931bbb1933df12dc0a9

URL: https://github.com/llvm/llvm-project/commit/3098b4da81c254b40f6bc931bbb1933df12dc0a9
DIFF: https://github.com/llvm/llvm-project/commit/3098b4da81c254b40f6bc931bbb1933df12dc0a9.diff

LOG: [flang] Added LICM hoisting for nested regions. (#190696)

This patch adds two experimental LICM modes
that allow hoisting operations from regions nested
inside a loop, e.g. when there is a `fir.if` inside
a `fir.do_loop`. The aggressive mode hoists all operations
that are safe to hoist. The cheap mode hoists only
"cheap" operations (currently, only `fir.convert`);
the definition of "cheap" still needs to be worked out.

Added: 
    

Modified: 
    flang/include/flang/Optimizer/Transforms/Passes.h
    flang/include/flang/Optimizer/Transforms/Passes.td
    flang/lib/Optimizer/Transforms/LoopInvariantCodeMotion.cpp
    flang/test/Transforms/licm.fir

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 50e8e6c58bf62..adacd3cc0cf51 100644

--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -28,6 +28,14 @@ class ModuleOp;
 
 namespace fir {
 
+/// Controls hoisting of invariant ops from nested regions (e.g. scf.if
+/// within loops) in the flang-licm pass.
+enum class LICMNestedHoistingMode {
+  None,       ///< Do not hoist from nested regions.
+  Cheap,      ///< Only hoist cheap ops like fir.convert.
+  Aggressive, ///< Hoist all safe invariant ops.
+};
+
 //===----------------------------------------------------------------------===//
 // Passes defined in Passes.td
 //===----------------------------------------------------------------------===//

diff  --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 5c7920ce3fa62..71c9f7b62d2be 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -662,6 +662,19 @@ def LoopInvariantCodeMotion : Pass<"flang-licm", "::mlir::func::FuncOp"> {
     The pass only moves existing operations, so there are no dependent
     dialects.
   }];
+  let options = [Option<"hoistFromNestedRegions", "hoist-from-nested-regions",
+                        "::fir::LICMNestedHoistingMode",
+                        /*default=*/"::fir::LICMNestedHoistingMode::Cheap",
+                        "Control hoisting of invariant ops from nested regions "
+                        "(e.g. scf.if within loops)",
+                        [{::llvm::cl::values(
+            clEnumValN(::fir::LICMNestedHoistingMode::None,
+                       "none", "Do not hoist from nested regions"),
+            clEnumValN(::fir::LICMNestedHoistingMode::Cheap,
+                       "cheap", "Only hoist cheap ops like fir.convert"),
+            clEnumValN(::fir::LICMNestedHoistingMode::Aggressive,
+                       "aggressive", "Hoist all safe invariant ops")
+           )}]>];
 }
 
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES

diff  --git a/flang/lib/Optimizer/Transforms/LoopInvariantCodeMotion.cpp b/flang/lib/Optimizer/Transforms/LoopInvariantCodeMotion.cpp
index 8ebb8982936e8..d1c4046f38b19 100644
--- a/flang/lib/Optimizer/Transforms/LoopInvariantCodeMotion.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopInvariantCodeMotion.cpp
@@ -18,6 +18,7 @@
 #include "flang/Optimizer/Dialect/FortranVariableInterface.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -48,6 +49,7 @@ using namespace mlir;
 ///     (see isSafeToHoistLoad() comments below).
 struct LoopInvariantCodeMotion
     : fir::impl::LoopInvariantCodeMotionBase<LoopInvariantCodeMotion> {
+  using LoopInvariantCodeMotionBase::LoopInvariantCodeMotionBase;
   void runOnOperation() override;
 };
 
@@ -162,26 +164,37 @@ static bool isNonOptionalScalar(Value location) {
 /// Returns true iff it is safe to hoist the given load-like operation 'op',
 /// which access given memory 'locations', out of the operation 'loopLike'.
 /// The current safety conditions are:
-///   * The loop runs at least one iteration, OR
+///   * The load is known to be unconditionally executed in the loop and the
+///     loop runs at least one iteration, OR
 ///   * all the accessed locations are inside scalar non-OPTIONAL
 ///     Fortran objects (Fortran descriptors are considered to be scalars).
+///
+/// When \p maybeConditionallyExecuted is true, the load may be inside a
+/// conditional region (e.g. scf.if) within the loop, so the trip count
+/// shortcut cannot be used: even if the loop runs, the condition might never
+/// be true and the load might access an invalid location.
+/// TODO: analyze the parent operation to determine whether it may skip its
+/// body (e.g. scf.execute_region always executes its body).
 static bool isSafeToHoistLoad(Operation *op, ArrayRef<Value> locations,
                               LoopLikeOpInterface loopLike,
-                              AliasAnalysis &aliasAnalysis) {
+                              AliasAnalysis &aliasAnalysis,
+                              bool maybeConditionallyExecuted) {
   for (Value location : locations)
-    if (aliasAnalysis.getModRef(loopLike.getOperation(), location)
-            .isModAndRef()) {
+    if (aliasAnalysis.getModRef(loopLike.getOperation(), location).isMod()) {
       LDBG() << "Failure: reads location:\n"
              << location << "\nwhich is modified inside the loop";
       return false;
     }
 
   // Check that it is safe to read from all the locations before the loop.
-  std::optional<llvm::APInt> tripCount = loopLike.getStaticTripCount();
-  if (tripCount && !tripCount->isZero()) {
-    // Loop executes at least one iteration, so it is safe to hoist.
-    LDBG() << "Success: loop has non-zero iterations";
-    return true;
+  if (!maybeConditionallyExecuted) {
+    std::optional<llvm::APInt> tripCount = loopLike.getStaticTripCount();
+    if (tripCount && !tripCount->isZero()) {
+      // Loop executes at least one iteration and the load is unconditionally
+      // executed in the loop body, so it is safe to hoist.
+      LDBG() << "Success: loop has non-zero iterations";
+      return true;
+    }
   }
 
   // Check whether the access must always be valid.
@@ -193,8 +206,10 @@ static bool isSafeToHoistLoad(Operation *op, ArrayRef<Value> locations,
 
 /// Returns true iff the given 'op' is a load-like operation,
 /// and it can be hoisted out of 'loopLike' operation.
+/// See isSafeToHoistLoad for the meaning of \p maybeConditionallyExecuted.
 static bool canHoistLoad(Operation *op, LoopLikeOpInterface loopLike,
-                         AliasAnalysis &aliasAnalysis) {
+                         AliasAnalysis &aliasAnalysis,
+                         bool maybeConditionallyExecuted) {
   LDBG() << "Checking operation:\n" << *op;
   if (auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op)) {
     SmallVector<MemoryEffects::EffectInstance> effects;
@@ -216,12 +231,29 @@ static bool canHoistLoad(Operation *op, LoopLikeOpInterface loopLike,
       locations.insert(location);
     }
     return isSafeToHoistLoad(op, locations.getArrayRef(), loopLike,
-                             aliasAnalysis);
+                             aliasAnalysis, maybeConditionallyExecuted);
   }
   LDBG() << "Failure: has unknown effects";
   return false;
 }
 
+/// Recursively collect regions from operations inside \p region, skipping
+/// IsolatedFromAbove operations (whose regions form a separate scope) and
+/// LoopLikeOpInterface operations (which have their own LICM invocation).
+static void collectNestedRegions(Region &region,
+                                 SmallVectorImpl<Region *> &result) {
+  for (Operation &op : region.getOps()) {
+    if (op.hasTrait<OpTrait::IsIsolatedFromAbove>())
+      continue;
+    if (isa<LoopLikeOpInterface>(&op))
+      continue;
+    for (Region &nested : op.getRegions()) {
+      result.push_back(&nested);
+      collectNestedRegions(nested, result);
+    }
+  }
+}
+
 void LoopInvariantCodeMotion::runOnOperation() {
   if (disableFlangLICM) {
     LDBG() << "Skipping [HL]FIR LoopInvariantCodeMotion()";
@@ -233,8 +265,9 @@ void LoopInvariantCodeMotion::runOnOperation() {
   auto &aliasAnalysis = getAnalysis<AliasAnalysis>();
   aliasAnalysis.addAnalysisImplementation(fir::AliasAnalysis{});
 
-  std::function<bool(Operation *, LoopLikeOpInterface loopLike)>
-      shouldMoveOutOfLoop = [&](Operation *op, LoopLikeOpInterface loopLike) {
+  std::function<bool(Operation *, LoopLikeOpInterface, bool)>
+      shouldMoveOutOfLoop = [&](Operation *op, LoopLikeOpInterface loopLike,
+                                bool maybeConditionallyExecuted) {
         if (isPure(op)) {
           LDBG() << "Pure operation: " << *op;
           return true;
@@ -255,7 +288,8 @@ void LoopInvariantCodeMotion::runOnOperation() {
                   nestedOps.push_back(&nestedOp);
 
             bool result = llvm::all_of(nestedOps, [&](Operation *nestedOp) {
-              return shouldMoveOutOfLoop(nestedOp, loopLike);
+              return shouldMoveOutOfLoop(nestedOp, loopLike,
+                                         maybeConditionallyExecuted);
             });
             LDBG() << "Recursive operation can" << (result ? "" : "not")
                    << " be hoisted";
@@ -268,7 +302,8 @@ void LoopInvariantCodeMotion::runOnOperation() {
               return result;
           }
         }
-        return canHoistLoad(op, loopLike, aliasAnalysis);
+        return canHoistLoad(op, loopLike, aliasAnalysis,
+                            maybeConditionallyExecuted);
       };
 
   getOperation()->walk([&](LoopLikeOpInterface loopLike) {
@@ -297,26 +332,77 @@ void LoopInvariantCodeMotion::runOnOperation() {
       });
       return;
     }
+    auto isDefinedOutsideRegion = [&](Value value, Region *) {
+      return loopLike.isDefinedOutsideOfLoop(value);
+    };
+    auto canMoveOutOfLoop = [&](Operation *op) {
+      if (!fir::canMoveOutOf(loopLike, op)) {
+        LDBG() << "Cannot hoist " << *op << " out of the loop";
+        return false;
+      }
+      if (!fir::canMoveFromDescendant(parentOp, loopLike, op)) {
+        LDBG() << "Cannot hoist " << *op << " into the parent of the loop";
+        return false;
+      }
+      return true;
+    };
+    auto moveOutOfRegion = [&](Operation *op, Region *) {
+      loopLike.moveOutOfLoop(op);
+    };
+
     moveLoopInvariantCode(
-        loopLike.getLoopRegions(),
-        /*isDefinedOutsideRegion=*/
-        [&](Value value, Region *) {
-          return loopLike.isDefinedOutsideOfLoop(value);
-        },
+        loopLike.getLoopRegions(), isDefinedOutsideRegion,
         /*shouldMoveOutOfRegion=*/
         [&](Operation *op, Region *) {
-          if (!fir::canMoveOutOf(loopLike, op)) {
-            LDBG() << "Cannot hoist " << *op << " out of the loop";
-            return false;
-          }
-          if (!fir::canMoveFromDescendant(parentOp, loopLike, op)) {
-            LDBG() << "Cannot hoist " << *op << " into the parent of the loop";
-            return false;
-          }
-          return shouldMoveOutOfLoop(op, loopLike);
+          return canMoveOutOfLoop(op) &&
+                 shouldMoveOutOfLoop(op, loopLike,
+                                     /*maybeConditionallyExecuted=*/false);
         },
-        /*moveOutOfRegion=*/
-        [&](Operation *op, Region *) { loopLike.moveOutOfLoop(op); });
+        moveOutOfRegion);
+
+    if (hoistFromNestedRegions == fir::LICMNestedHoistingMode::None)
+      return;
+
+    // Hoist loop-invariant ops from nested regions (e.g., fir.convert
+    // inside scf.if) out of the loop. This enables CSE to deduplicate
+    // converted memrefs, which improves alias analysis for parallelization.
+    // The callbacks close over loopLike (ignoring the Region* parameter),
+    // so invariance and movement are evaluated against the loop, not the
+    // nested region.
+    // Loads hoisted from nested regions are treated as maybe-conditionally
+    // executed: we do not know whether the parent operation always executes
+    // its body (e.g. scf.execute_region does, scf.if might not), so the
+    // trip count shortcut cannot prove safety.
+    // TODO: analyze the parent operation to determine whether it truly
+    // conditionally executes its body.
+    SmallVector<Region *> nestedRegions;
+    for (Region *loopRegion : loopLike.getLoopRegions())
+      collectNestedRegions(*loopRegion, nestedRegions);
+
+    if (nestedRegions.empty())
+      return;
+
+    auto shouldMoveFromNestedRegion = [&](Operation *op, Region *) {
+      return canMoveOutOfLoop(op) &&
+             shouldMoveOutOfLoop(op, loopLike,
+                                 /*maybeConditionallyExecuted=*/true);
+    };
+    if (hoistFromNestedRegions == fir::LICMNestedHoistingMode::Aggressive) {
+      moveLoopInvariantCode(nestedRegions, isDefinedOutsideRegion,
+                            shouldMoveFromNestedRegion, moveOutOfRegion);
+    } else {
+      // "cheap" mode: only hoist fir.convert.
+      // TODO: refine the cost model for "cheap" hoisting to include
+      // other inexpensive operations.
+      moveLoopInvariantCode(
+          nestedRegions, isDefinedOutsideRegion,
+          /*shouldMoveOutOfRegion=*/
+          [&](Operation *op, Region *region) {
+            return isa<fir::ConvertOp>(op) &&
+                   shouldMoveFromNestedRegion(op, region);
+          },
+          moveOutOfRegion);
+    }
   });
 
   LDBG() << "Exit [HL]FIR LoopInvariantCodeMotion()";

diff  --git a/flang/test/Transforms/licm.fir b/flang/test/Transforms/licm.fir
index 049d86b6171fe..3490c64a67222 100644
--- a/flang/test/Transforms/licm.fir
+++ b/flang/test/Transforms/licm.fir
@@ -2087,3 +2087,344 @@ func.func @test_() {
   }
   return
 }
+
+// -----
+// Test hoisting of fir.convert from nested scf.if regions inside a loop.
+// The two fir.convert ops inside separate scf.if blocks should be hoisted
+// before the scf.for, enabling downstream CSE to merge them.
+// CHECK-LABEL:   func.func @test_nested_hoist(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<10xf32>>)
+// CHECK-DAG:     %[[CST1:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-DAG:     %[[CST2:.*]] = arith.constant 2.000000e+00 : f32
+// CHECK:         %[[CONV1:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         %[[CONV2:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         scf.for %[[I:.*]] =
+// CHECK:           scf.if
+// CHECK-NOT:         fir.convert
+// CHECK:             memref.store %[[CST1]], %[[CONV1]][%[[I]]] : memref<10xf32>
+// CHECK:           scf.if
+// CHECK-NOT:         fir.convert
+// CHECK:             memref.store %[[CST2]], %[[CONV2]][%[[I]]] : memref<10xf32>
+// CHECK:         return
+func.func @test_nested_hoist(%arg0: !fir.ref<!fir.array<10xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %c10 = arith.constant 10 : index
+  %cst1 = arith.constant 1.000000e+00 : f32
+  %cst2 = arith.constant 2.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    %cmp1 = arith.cmpi slt, %i, %c5 : index
+    scf.if %cmp1 {
+      %mem = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+      memref.store %cst1, %mem[%i] : memref<10xf32>
+    }
+    %cmp2 = arith.cmpi sge, %i, %c5 : index
+    scf.if %cmp2 {
+      %mem = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+      memref.store %cst2, %mem[%i] : memref<10xf32>
+    }
+  }
+  return
+}
+
+// -----
+// Test hoisting of fir.convert from deeply nested scf.if (scf.if inside
+// scf.if inside scf.for). The fir.convert should be hoisted before the loop.
+// CHECK-LABEL:   func.func @test_deeply_nested_hoist(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<10xf32>>)
+// CHECK:         %[[CONV:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         scf.for %[[I:.*]] =
+// CHECK:           scf.if
+// CHECK:             scf.if
+// CHECK-NOT:           fir.convert
+// CHECK:               memref.store %{{.*}}, %[[CONV]][%[[I]]] : memref<10xf32>
+// CHECK:         return
+func.func @test_deeply_nested_hoist(%arg0: !fir.ref<!fir.array<10xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c5 = arith.constant 5 : index
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    %cmp1 = arith.cmpi sgt, %i, %c3 : index
+    scf.if %cmp1 {
+      %cmp2 = arith.cmpi slt, %i, %c5 : index
+      scf.if %cmp2 {
+        %mem = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+        memref.store %cst, %mem[%i] : memref<10xf32>
+      }
+    }
+  }
+  return
+}
+
+// -----
+// Test that fir.convert fully invariant w.r.t. both loops is hoisted all the
+// way out (inner loop top-level LICM hoists it between the loops, then outer
+// loop top-level LICM hoists it before both loops).
+// CHECK-LABEL:   func.func @test_nested_loop_fully_invariant(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>)
+// CHECK:         %[[CONV:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+// CHECK:         scf.for
+// CHECK:           scf.for %[[J:.*]] =
+// CHECK-NOT:         fir.convert
+// CHECK:             memref.store %{{.*}}, %[[CONV]][%[[J]]] : memref<100xf32>
+// CHECK:         return
+func.func @test_nested_loop_fully_invariant(%arg0: !fir.ref<!fir.array<100xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    scf.for %j = %c0 to %c10 step %c1 {
+      %mem = fir.convert %arg0 : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+      memref.store %cst, %mem[%j] : memref<100xf32>
+    }
+  }
+  return
+}
+
+// -----
+// Test that fir.convert inside scf.if inside nested loops is hoisted all the
+// way out when it is fully invariant (nested LICM on inner loop hoists from
+// scf.if, then top-level LICM on each loop hoists further).
+// CHECK-LABEL:   func.func @test_nested_loop_with_if_fully_invariant(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>)
+// CHECK:         %[[CONV:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+// CHECK:         scf.for
+// CHECK:           scf.for %[[J:.*]] =
+// CHECK:             scf.if
+// CHECK-NOT:           fir.convert
+// CHECK:               memref.store %{{.*}}, %[[CONV]][%[[J]]] : memref<100xf32>
+// CHECK:         return
+func.func @test_nested_loop_with_if_fully_invariant(%arg0: !fir.ref<!fir.array<100xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    scf.for %j = %c0 to %c10 step %c1 {
+      %cmp = arith.cmpi slt, %j, %c5 : index
+      scf.if %cmp {
+        %mem = fir.convert %arg0 : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+        memref.store %cst, %mem[%j] : memref<100xf32>
+      }
+    }
+  }
+  return
+}
+
+// -----
+// Test that fir.convert using an outer-loop-variant value is hoisted from
+// scf.if to between the two loops (by inner loop's nested LICM), but NOT
+// further (because its operand %ptr depends on the outer loop induction var).
+// CHECK-LABEL:   func.func @test_nested_loop_outer_variant(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>)
+// CHECK:         scf.for %[[I:.*]] =
+// CHECK:           %[[PTR:.*]] = fir.coordinate_of %[[ARG0]], %[[I]]
+// CHECK:           %[[CONV:.*]] = fir.convert %[[PTR]] : (!fir.ref<f32>) -> memref<f32>
+// CHECK:           scf.for %[[J:.*]] =
+// CHECK:             scf.if
+// CHECK-NOT:           fir.convert
+// CHECK:               memref.store %{{.*}}, %[[CONV]][] : memref<f32>
+// CHECK:         return
+func.func @test_nested_loop_outer_variant(%arg0: !fir.ref<!fir.array<100xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    %ptr = fir.coordinate_of %arg0, %i : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+    scf.for %j = %c0 to %c10 step %c1 {
+      %cmp = arith.cmpi slt, %j, %c5 : index
+      scf.if %cmp {
+        %mem = fir.convert %ptr : (!fir.ref<f32>) -> memref<f32>
+        memref.store %cst, %mem[] : memref<f32>
+      }
+    }
+  }
+  return
+}
+
+// -----
+// Test chained fir.convert ops across nested loop and scf.if boundaries.
+// %conv1 is inside the outer scf.if, %conv2 and %conv3 use the chain and are
+// inside scf.if inside an inner loop inside the outer scf.if.
+// All three should be hoisted before both loops via user-propagation:
+//   1. Inner loop nested LICM hoists %conv2, %conv3 from inner scf.if.
+//   2. Inner loop top-level LICM hoists them to the outer scf.if body.
+//   3. Outer loop nested LICM hoists %conv1 from outer scf.if; user
+//      propagation then hoists %conv2, %conv3 too.
+// CHECK-LABEL:   func.func @test_chained_converts(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<10xf32>>)
+// CHECK:         %[[C1:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         %[[C2:.*]] = fir.convert %[[C1]] : (memref<10xf32>) -> !fir.ref<!fir.array<10xf32>>
+// CHECK:         %[[C3:.*]] = fir.convert %[[C2]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         scf.for %{{.*}} =
+// CHECK:           scf.if
+// CHECK:             scf.for %[[J:.*]] =
+// CHECK:               scf.if
+// CHECK-NOT:             fir.convert
+// CHECK:                 memref.store %{{.*}}, %[[C3]][%[[J]]] : memref<10xf32>
+// CHECK:         return
+func.func @test_chained_converts(%arg0: !fir.ref<!fir.array<10xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    %cond1 = arith.cmpi slt, %i, %c5 : index
+    scf.if %cond1 {
+      %conv1 = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+      scf.for %j = %c0 to %c10 step %c1 {
+        %cond2 = arith.cmpi slt, %j, %c5 : index
+        scf.if %cond2 {
+          %conv2 = fir.convert %conv1 : (memref<10xf32>) -> !fir.ref<!fir.array<10xf32>>
+          %conv3 = fir.convert %conv2 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+          memref.store %cst, %conv3[%j] : memref<10xf32>
+        }
+      }
+    }
+  }
+  return
+}
+
+// -----
+// Test chained fir.convert ops across deeply nested scf.if regions within
+// a single loop. %conv1 feeds into both branches: one scf.if uses a chain
+// of two converts, the other has a further-nested scf.if with a three-deep
+// chain. All fir.convert ops should be hoisted before the loop.
+// CHECK-LABEL:   func.func @test_chained_deep_ifs(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<10xf32>>)
+// CHECK-DAG:     %[[CST1:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-DAG:     %[[CST2:.*]] = arith.constant 2.000000e+00 : f32
+// CHECK:         %[[C1:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         %[[C2A:.*]] = fir.convert %[[C1]] : (memref<10xf32>) -> !fir.ref<!fir.array<10xf32>>
+// CHECK:         %[[C3A:.*]] = fir.convert %[[C2A]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         %[[C2B:.*]] = fir.convert %[[C1]] : (memref<10xf32>) -> !fir.ref<!fir.array<10xf32>>
+// CHECK:         %[[C3B:.*]] = fir.convert %[[C2B]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:         scf.for %[[I:.*]] =
+// CHECK:           scf.if
+// CHECK:             scf.if
+// CHECK-NOT:           fir.convert
+// CHECK:               memref.store %[[CST1]], %[[C3A]][%[[I]]] : memref<10xf32>
+// CHECK:             scf.if
+// CHECK:               scf.if
+// CHECK-NOT:             fir.convert
+// CHECK:                 memref.store %[[CST2]], %[[C3B]][%[[I]]] : memref<10xf32>
+// CHECK:         return
+func.func @test_chained_deep_ifs(%arg0: !fir.ref<!fir.array<10xf32>>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c5 = arith.constant 5 : index
+  %c10 = arith.constant 10 : index
+  %cst1 = arith.constant 1.000000e+00 : f32
+  %cst2 = arith.constant 2.000000e+00 : f32
+  scf.for %i = %c0 to %c10 step %c1 {
+    %cond1 = arith.cmpi slt, %i, %c5 : index
+    scf.if %cond1 {
+      %conv1 = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+      %cond2 = arith.cmpi slt, %i, %c3 : index
+      scf.if %cond2 {
+        %conv2a = fir.convert %conv1 : (memref<10xf32>) -> !fir.ref<!fir.array<10xf32>>
+        %conv3a = fir.convert %conv2a : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+        memref.store %cst1, %conv3a[%i] : memref<10xf32>
+      }
+      %cond3 = arith.cmpi sge, %i, %c3 : index
+      scf.if %cond3 {
+        %conv2b = fir.convert %conv1 : (memref<10xf32>) -> !fir.ref<!fir.array<10xf32>>
+        %cond4 = arith.cmpi slt, %i, %c5 : index
+        scf.if %cond4 {
+          %conv3b = fir.convert %conv2b : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+          memref.store %cst2, %conv3b[%i] : memref<10xf32>
+        }
+      }
+    }
+  }
+  return
+}
+
+// -----
+// Test that canMoveOutOf prevents nested hoisting of fir.convert when its
+// operand is a data operand (private variable) of acc.loop, while
+// fir.convert of a non-data operand is still hoisted.
+// CHECK-LABEL:   func.func @test_acc_loop_nested_canMoveOutOf(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<10xf32>>)
+// CHECK:         acc.parallel combined(loop) {
+// CHECK:           %[[PRIV:.*]] = acc.private
+// The non-private fir.convert IS hoisted out of acc.loop:
+// CHECK:           %[[CVT_ARG:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+// CHECK:           acc.loop{{.*}}private(%[[PRIV]] : !fir.ref<f32>)
+// CHECK:             scf.if
+// The private fir.convert is NOT hoisted (canMoveOutOf blocks it):
+// CHECK:               fir.convert %[[PRIV]] : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK:               memref.store %{{.*}}, %[[CVT_ARG]]
+func.func @test_acc_loop_nested_canMoveOutOf(%arg0: !fir.ref<!fir.array<10xf32>>) {
+  %cst = arith.constant 1.000000e+00 : f32
+  %c10_i32 = arith.constant 10 : i32
+  %c5_i32 = arith.constant 5 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca f32 {bindc_name = "b", uniq_name = "_QFtestEb"}
+  %2 = fir.declare %1 {uniq_name = "_QFtestEb"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  acc.parallel combined(loop) {
+    %priv = acc.private varPtr(%2 : !fir.ref<f32>) recipe(@privatization_ref_f32) -> !fir.ref<f32> {name = "b"}
+    acc.loop combined(parallel) private(%priv : !fir.ref<f32>) control(%arg1 : i32) = (%c1_i32 : i32) to (%c10_i32 : i32)  step (%c1_i32 : i32) {
+      %cond = arith.cmpi slt, %arg1, %c5_i32 : i32
+      scf.if %cond {
+        %cvt_priv = fir.convert %priv : (!fir.ref<f32>) -> !fir.ref<f32>
+        fir.store %cst to %cvt_priv : !fir.ref<f32>
+        %cvt_arg = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+        %idx = fir.convert %arg1 : (i32) -> index
+        memref.store %cst, %cvt_arg[%idx] : memref<10xf32>
+      }
+      acc.yield
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  return
+}
+
+// -----
+// Test that canMoveFromDescendant prevents nested hoisting when the parent
+// of the loop is an omp.wsloop (LoopWrapperInterface), which disallows
+// moving operations from its descendants into it.
+// The fir.convert of %arg0 inside scf.if is loop-invariant but must NOT
+// be hoisted because canMoveFromDescendant(omp.wsloop, omp.loop_nest, ...)
+// returns false.
+// CHECK-LABEL:   func.func @test_omp_wsloop_nested_canMoveFromDescendant(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<10xf32>>)
+// CHECK:         omp.parallel
+// CHECK-NEXT:      omp.wsloop
+// CHECK-NEXT:        omp.loop_nest
+// CHECK:               scf.if
+// CHECK:                 fir.convert %[[ARG0]]
+func.func @test_omp_wsloop_nested_canMoveFromDescendant(%arg0: !fir.ref<!fir.array<10xf32>>) {
+  %cst = arith.constant 1.000000e+00 : f32
+  %c10_i32 = arith.constant 10 : i32
+  %c5_i32 = arith.constant 5 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %alloca = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtestEi"}
+  %decl_i = fir.declare %alloca {uniq_name = "_QFtestEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  omp.parallel {
+    omp.wsloop private(@_QFtestEi_private_i32 %decl_i -> %arg1 : !fir.ref<i32>) {
+      omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+        %cond = arith.cmpi slt, %arg2, %c5_i32 : i32
+        scf.if %cond {
+          %cvt = fir.convert %arg0 : (!fir.ref<!fir.array<10xf32>>) -> memref<10xf32>
+          %idx = fir.convert %arg2 : (i32) -> index
+          memref.store %cst, %cvt[%idx] : memref<10xf32>
+        }
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  return
+}