[flang-commits] [flang] [flang][HLFIR] Relax InlineElementals to support more than two users (PR #186916)

via flang-commits flang-commits at lists.llvm.org
Wed Apr 22 20:58:24 PDT 2026


https://github.com/anoopkg6 updated https://github.com/llvm/llvm-project/pull/186916

>From 1dc2b52f0064209fa70dd57e1492268f3678be73 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Mon, 16 Mar 2026 15:26:39 +0100
Subject: [PATCH 1/4] [flang][HLFIR] Optimize MINLOC/MAXLOC for equality masks

This patch implements `isEqualityMask` to identify when the MASK argument is an equality comparison against an invariant value (e.g., MASK = A == X).

- This allows the SimplifyHLFIRIntrinsics pass to extract the invariant
  search target and bypass the creation of a temporary logical mask array
  by inlining the equality comparison directly into the reduction loop.
  This optimization removes the 'hlfir.apply' to the mask's hlfir.elemental,
  which gets eliminated in the bufferize-hlfir pass.
- Simplifies the reduction state by removing the min/max value tracker,
  as the target value is already known.
- Implements a "first-hit" locking mechanism.

Test Coverage:
- 1D, 2D, 3D Variable/Constant equality searches - Verified optimized
- Duplicate match handling - Verified first-occurrence logic
- No-match cases - Verified zero result
- Different array/Non-invariant target - Verified safe fallback
---
 .../Transforms/SimplifyHLFIRIntrinsics.cpp    | 225 +++++++++++++-
 ...plify-hlfir-intrinsics-equality-maxloc.fir | 269 +++++++++++++++++
 ...plify-hlfir-intrinsics-equality-minloc.fir | 274 ++++++++++++++++++
 3 files changed, 764 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
 create mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index 26c5b63cb05b6..9671de7c6eaa1 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -32,6 +32,94 @@ namespace hlfir {
 
 #define DEBUG_TYPE "simplify-hlfir-intrinsics"
 
+namespace {
+// Returns true if `mask` is produced by an hlfir.elemental whose yielded
+// value is an integer equality (arith.cmpi eq) with exactly one operand
+// defined outside the elemental region, and whose other operand traces back
+// to the same base value as `searchArray`. On success `targetVal` is set to
+// the invariant operand; it may also be written on the failure path.
+// NOTE(review): the indices used to address the array are not inspected.
+bool isEqualityMask(mlir::Value mask, mlir::Value searchArray,
+                    mlir::Value &targetVal) {
+  if (!mask)
+    return false;
+
+  // Look through as_expr/convert/declare/copy_in ops via their first operand.
+  mlir::Value currentMask = mask;
+  while (auto def = currentMask.getDefiningOp()) {
+    if (!mlir::isa<hlfir::AsExprOp, fir::ConvertOp, hlfir::DeclareOp,
+                   hlfir::CopyInOp>(def))
+      break;
+    currentMask = def->getOperand(0);
+  }
+  // Ensure the mask is produced by an hlfir.elemental.
+  auto elemental = currentMask.getDefiningOp<hlfir::ElementalOp>();
+  if (!elemental)
+    return false;
+
+  // Inspect the elemental body to find the boolean result logic.
+  mlir::Block &body = elemental.getRegion().front();
+  auto yieldOp = mlir::cast<hlfir::YieldElementOp>(body.getTerminator());
+  mlir::Value val = yieldOp.getElementValue();
+  // Get core comparison, ignoring intermediate type casts.
+  while (auto conv = val.getDefiningOp<fir::ConvertOp>())
+    val = conv.getOperand();
+
+  // We currently only optimize integer equality (arith.cmpi eq).
+  auto cmpOp = val.getDefiningOp<mlir::arith::CmpIOp>();
+  if (!cmpOp || cmpOp.getPredicate() != mlir::arith::CmpIPredicate::eq)
+    return false;
+
+  // A value is treated as invariant when it is a block argument not owned by
+  // the elemental's region, or is defined by an op outside that region.
+  auto isInvariant = [&](mlir::Value v) {
+    if (auto arg = mlir::dyn_cast<mlir::BlockArgument>(v))
+      return arg.getOwner()->getParent() != &elemental.getRegion();
+    if (auto *op = v.getDefiningOp())
+      return !elemental.getRegion().isAncestor(op->getParentRegion());
+    return true;
+  };
+
+  // Strip declare/load/apply/designate/convert/as_expr to reach a base value.
+  auto getBase = [](mlir::Value v) -> mlir::Value {
+    while (v) {
+      mlir::Operation *def = v.getDefiningOp();
+      if (!def)
+        break;
+      if (auto decl = mlir::dyn_cast<hlfir::DeclareOp>(def))
+        v = decl.getMemref();
+      else if (auto load = mlir::dyn_cast<fir::LoadOp>(def))
+        v = load.getMemref();
+      else if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(def))
+        v = apply.getExpr();
+      else if (auto des = mlir::dyn_cast<hlfir::DesignateOp>(def))
+        v = des.getMemref();
+      else if (mlir::isa<fir::ConvertOp, hlfir::AsExprOp>(def))
+        v = def->getOperand(0);
+      else
+        break;
+    }
+    return v;
+  };
+
+  mlir::Value lhs = cmpOp.getLhs(), rhs = cmpOp.getRhs();
+  bool lhsInv = isInvariant(lhs), rhsInv = isInvariant(rhs);
+  // The optimization is valid only if exactly one side is invariant (the
+  // target) and the other side is variant (the array element).
+  if (lhsInv == rhsInv)
+    return false;
+
+  targetVal = lhsInv ? lhs : rhs;
+  mlir::Value arraySide = lhsInv ? rhs : lhs;
+
+  // Verify the mask refers to the same array being searched.
+  if (getBase(arraySide) == getBase(searchArray))
+    return true;
+
+  return false;
+}
+} // end anonymous namespace
+
 static llvm::cl::opt<bool> forceMatmulAsElemental(
     "flang-inline-matmul-as-elemental",
     llvm::cl::desc("Expand hlfir.matmul as elemental operation"),
@@ -530,6 +618,15 @@ class MinMaxlocAsElementalConverter : public ReductionAsElementalConverter {
 
   void
   checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
+    mlir::Value targetVal;
+    // Check if the mask qualifies for the optimized equality mask search path.
+    if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
+                       targetVal)) {
+      // Expect coordinate indices.
+      assert(reductions.size() == getNumCoors() &&
+             "invalid number of reductions for equality mask MINLOC/MAXLOC");
+      return;
+    }
     if (!useIsFirst())
       assert(reductions.size() == getNumCoors() + 1 &&
              "invalid number of reductions for MINLOC/MAXLOC");
@@ -639,6 +736,51 @@ llvm::SmallVector<mlir::Value>
 MinMaxlocAsElementalConverter<T>::reduceOneElement(
     const llvm::SmallVectorImpl<mlir::Value> &currentValue, hlfir::Entity array,
     mlir::ValueRange oneBasedIndices) {
+  mlir::Value targetVal;
+  // If the mask is an equality comparison (e.g., MASK = A == target), inline
+  // the comparison to find the first occurrence efficiently.
+  if (isEqualityMask(this->getMask(), array, targetVal)) {
+    // Directly load the array element and compare with the targetVal.
+    hlfir::Entity elementValue =
+        hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
+    mlir::Value isMatch = mlir::arith::CmpIOp::create(
+        builder, loc, mlir::arith::CmpIPredicate::eq, (mlir::Value)elementValue,
+        targetVal);
+    // currentValue contains [Coord1, ..., CoordN, FirstHitBool]
+    mlir::Value firstHitBool = currentValue.back();
+    // shouldUpdate is true only if we have a match and we haven't found one
+    // yet.
+    mlir::Value shouldUpdate =
+        mlir::arith::AndIOp::create(builder, loc, isMatch, firstHitBool);
+    // Conditional Update: Only update coordinates if a match is found.
+    auto ifOp = fir::IfOp::create(builder, loc,
+                                  mlir::ValueRange(currentValue).getTypes(),
+                                  shouldUpdate, /*withElse=*/true);
+    // If match found and it's the first one, record coordinates.
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    llvm::SmallVector<mlir::Value> thenResults;
+    unsigned rank = array.getRank();
+    // Convert each one-based loop index to the type of its coordinate slot.
+    for (unsigned i = 0; i < rank; ++i) {
+      mlir::Value loopIdx = builder.createConvert(
+          loc, currentValue[i].getType(), oneBasedIndices[i]);
+      thenResults.emplace_back(loopIdx);
+    }
+
+    // Update the flag: Set to 0 (False) for all future iterations.
+    mlir::Value falseVal =
+        mlir::arith::ConstantIntOp::create(builder, loc, 0, 1);
+    thenResults.emplace_back(falseVal);
+
+    fir::ResultOp::create(builder, loc, thenResults);
+
+    // No match or already found a previous match: maintain the current state.
+    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+    fir::ResultOp::create(builder, loc, currentValue);
+
+    builder.setInsertionPointAfter(ifOp);
+    return ifOp.getResults();
+  }
   checkReductions(currentValue);
   hlfir::Entity elementValue =
       hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
@@ -689,6 +831,49 @@ MinMaxlocAsElementalConverter<T>::reduceOneElement(
 template <typename T>
 hlfir::Entity MinMaxlocAsElementalConverter<T>::genFinalResult(
     const llvm::SmallVectorImpl<mlir::Value> &reductionResults) {
+  mlir::Value targetVal;
+  // Finalize results for the equality-mask search.
+  if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
+                     targetVal)) {
+    unsigned rank = getNumCoors();
+    mlir::Type resultElemTy =
+        hlfir::getFortranElementType(this->getResultType());
+    // MINLOC/MAXLOC returns an integer array of shape [rank].
+    // Manually build the HLFIR expression to hold the resulting coordinates.
+    llvm::SmallVector<int64_t> shapeVec{static_cast<int64_t>(rank)};
+    mlir::Type exprTy = hlfir::ExprType::get(builder.getContext(), shapeVec,
+                                             resultElemTy, false);
+    mlir::Value resRank =
+        builder.createIntegerConstant(loc, builder.getIndexType(), rank);
+    mlir::Value resShape = fir::ShapeOp::create(builder, loc, resRank);
+
+    // Create an elemental operation to map the scalar reduction results
+    // (coordinates) back into a Fortran array result.
+    auto elemental =
+        hlfir::ElementalOp::create(builder, loc, exprTy, resShape,
+                                   /*mold=*/mlir::Value{},
+                                   /*typeparams=*/mlir::ValueRange{},
+                                   /*isUnordered=*/false);
+    {
+      // Fill the elemental body.
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToStart(elemental.getBody());
+      // Map the 1-based elemental index, result[i] = reductionResults[i-1].
+      mlir::Value elemIdx = elemental.getIndices()[0];
+      mlir::Value resultVal = reductionResults[0];
+      for (unsigned i = 1; i < rank; ++i) {
+        mlir::Value dimConst =
+            builder.createIntegerConstant(loc, builder.getIndexType(), i + 1);
+        mlir::Value isDimMatch = mlir::arith::CmpIOp::create(
+            builder, loc, mlir::arith::CmpIPredicate::eq, elemIdx, dimConst);
+        // Select specific coordinate matching current elemental dimension.
+        resultVal = mlir::arith::SelectOp::create(
+            builder, loc, isDimMatch, reductionResults[i], resultVal);
+      }
+      hlfir::YieldElementOp::create(builder, loc, resultVal);
+    }
+    return hlfir::Entity{elemental.getResult()};
+  }
   // Identification of the final result of MINLOC/MAXLOC:
   //   * If DIM is absent, the result is rank-one array.
   //   * If DIM is present:
@@ -1185,9 +1370,39 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       extents.push_back(
           builder.createConvert(loc, builder.getIndexType(), dimExtent));
 
-    // Initial value for the reduction.
-    llvm::SmallVector<mlir::Value, 1> reductionInitValues =
-        genReductionInitValues(inputIndices, extents);
+    mlir::Value minMaxMask;
+    if (auto minloc = mlir::dyn_cast<hlfir::MinlocOp>(op)) {
+      minMaxMask = minloc.getMask();
+    } else if (auto maxloc = mlir::dyn_cast<hlfir::MaxlocOp>(op)) {
+      minMaxMask = maxloc.getMask();
+    }
+    mlir::Value targetVal;
+    bool isFixedSearch = false;
+    // Check if the mask allows for a simplified search optimization.
+    if (minMaxMask)
+      isFixedSearch =
+          isEqualityMask(minMaxMask, this->op->getOperand(0), targetVal);
+    llvm::SmallVector<mlir::Value, 1> reductionInitValues;
+    if (isFixedSearch) {
+      // For optimized equality searches, we skip the 'Min/Max value' reduction
+      // and only track coordinate indices and the firstHit flag.
+      unsigned rank = hlfir::Entity{array}.getRank();
+      mlir::Type resElemTy =
+          hlfir::getFortranElementType(this->getResultType());
+      mlir::Value zeroVal = builder.createIntegerConstant(loc, resElemTy, 0);
+
+      // Initialize all coordinates to 0.
+      for (unsigned i = 0; i < rank; ++i) {
+        reductionInitValues.emplace_back(zeroVal);
+      }
+      // Append the first-hit flag, e.g. for rank 2: [Row, Col, FirstHit=1].
+      mlir::Type i1Type = builder.getI1Type();
+      mlir::Value firstHitTrue = mlir::arith::ConstantOp::create(
+          builder, loc, i1Type, builder.getBoolAttr(true));
+      reductionInitValues.emplace_back(firstHitTrue);
+    } else {
+      reductionInitValues = genReductionInitValues(inputIndices, extents);
+    }
 
     auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
                        mlir::ValueRange oneBasedIndices,
@@ -1209,7 +1424,9 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       llvm::transform(reductionValues, std::back_inserter(reductionTypes),
                       [](mlir::Value v) { return v.getType(); });
       fir::IfOp ifOp;
-      if (mask) {
+      // Skip standard masking block in case of 'isFixedSearch', as it handles
+      // its own masking logic inside the comparison.
+      if (mask && !isFixedSearch) {
         // Make the reduction value update conditional on the value
         // of the mask.
         if (!maskValue) {
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
new file mode 100644
index 0000000000000..31925ae41467e
--- /dev/null
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
@@ -0,0 +1,269 @@
+// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
+
+// Rank 1: Variable: A == %target
+func.func @test_maxloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_1d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK-NOT: arith.constant -2147483648 : i32
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Variable: A == %target
+func.func @test_maxloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_2d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
+// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
+
+// Rank 3: Variable: A == %target
+func.func @test_maxloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_3d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
+
+// Rank 1: Constant: A == 42
+func.func @test_maxloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_1d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+
+// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Constant: A == 42
+func.func @test_maxloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_2d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
+
+// Rank 3: Constant: A == 42
+func.func @test_maxloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_3d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+
+// No Match: Result must be 0
+func.func @test_maxloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c99 = arith.constant 99 : i32 
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c99 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_no_match(
+// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
+// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %false
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// First Match: Duplicate values
+func.func @test_maxloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_first_match(
+// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
+// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// Verify mask elemental is bypassed
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// Verify the "Locking" logic: (Match 'and' is_first)
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// Verify that once a match is found, we result in %false to lock it
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// Negative test: Mask refers to a different array (%arg1) than the search 
+// array (%arg0).
+func.func @test_maxloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    // Optimization should fail here because %arg1 != %arg0
+    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val_b, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_different_arrays(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
+// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
+// Verify the loop uses three iter_args (standard path)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+// Verify the mask is applied (Since we can't inline it safely)
+// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL_A]], %[[MAX]]
+
+// Negative Test: The target value is another array, so it is not invariant.
+func.func @test_maxloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    // Optimization should fail here because %val_target is defined inside the elemental
+    %cmp = arith.cmpi eq, %val_a, %val_target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_non_invariant_target(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
+// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
+// Verify the loop uses three iter_args (Standard path)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+// Verify the mask is still applied (because we couldn't inline the comparison)
+// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL]], %[[MAX]]
+
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
new file mode 100644
index 0000000000000..0bfa58968a2fe
--- /dev/null
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
@@ -0,0 +1,274 @@
+// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
+
+// Rank 1: Variable: A == %target
+func.func @test_minloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_1d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK-NOT: arith.constant 2147483647 : i32
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Variable: A == %target
+func.func @test_minloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_2d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
+// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
+
+// Rank 3: Variable: A == %target
+func.func @test_minloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_3d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
+
+// Rank 1: Constant: A == 42
+func.func @test_minloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_1d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+
+// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Constant: A == 42
+func.func @test_minloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_2d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
+
+// Rank 3: Constant: A == 42
+func.func @test_minloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_3d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+
+// No Match: Result must be 0
+func.func @test_minloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c99 = arith.constant 99 : i32 
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c99 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_no_match(
+// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
+// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %false
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// First Match: Duplicate values
+func.func @test_minloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_first_match(
+// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
+// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// Verify mask elemental is bypassed
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// Verify the "Locking" logic: (Match AND is_first)
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// Verify that once a match is found, we result in %false to lock it
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// Negative test: Mask refers to a different array (%arg1) than the search 
+// array (%arg0).
+func.func @test_minloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    // Optimization should fail here because %arg1 != %arg0
+    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val_b, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_different_arrays(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
+// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
+
+// 1. Verify the loop uses three iter_args (Standard path: Loc, MinVal, FirstHit)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MIN_VAL:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+
+// 2. Verify the mask IS still applied (Optimization correctly skipped)
+// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
+
+// 3. Verify the standard path's MINLOC comparison logic (slt instead of sgt)
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL_A]], %[[MIN_VAL]] : i32
+
+// Negative Test: The target value is another array, so it is not invariant.
+func.func @test_minloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    // Optimization should fail here because %val_target is defined inside the 
+    // elemental
+    %cmp = arith.cmpi eq, %val_a, %val_target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_non_invariant_target(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
+// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
+// Verify the loop uses three iter_args (standard path)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+// Verify the mask is still applied (because we couldn't inline the comparison)
+// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL]], %[[MAX]]
+

>From 916578a35168ad6b144683f87554b1a16b915c80 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Thu, 26 Mar 2026 22:10:20 +0100
Subject: [PATCH 2/4] [flang][HLFIR] Relax InlineElementals to support more
 than two users

This patch updates the InlineElementals pass so that an hlfir.elemental
(for example, one producing a mask) can be inlined at its hlfir.apply site
inside the loop even when the elemental has additional users. Previously,
inlining was strictly restricted to elementals with exactly two users (one
hlfir.apply and one hlfir.destroy). The criterion is now that the elemental
has a unique hlfir.apply site and an hlfir.destroy, rather than a total user
count of exactly two.
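If other users remain after inlining, the hlfir.elemental producer is
preserved: the hlfir.apply is replaced by the inlined scalar computation and
the hlfir.destroy is erased, but the elemental itself is erased only once it
has no remaining uses.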
---
 .../HLFIR/Transforms/InlineElementals.cpp     |  27 +-
 .../Transforms/SimplifyHLFIRIntrinsics.cpp    | 225 +-----------
 .../HLFIR/inline-elemental-multi-users.fir    | 342 ++++++++++++++++++
 ...plify-hlfir-intrinsics-equality-maxloc.fir | 269 --------------
 ...plify-hlfir-intrinsics-equality-minloc.fir | 274 --------------
 5 files changed, 362 insertions(+), 775 deletions(-)
 create mode 100644 flang/test/HLFIR/inline-elemental-multi-users.fir
 delete mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
 delete mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index ff84a3cff0afb..e3ced12dc93b3 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -35,12 +35,6 @@ namespace hlfir {
 /// a destroy operation, return those two, otherwise return {}
 static std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>>
 getTwoUses(hlfir::ElementalOp elemental) {
-  mlir::Operation::user_range users = elemental->getUsers();
-  // don't inline anything with more than one use (plus hfir.destroy)
-  if (std::distance(users.begin(), users.end()) != 2) {
-    return std::nullopt;
-  }
-
   // If the ElementalOp must produce a temporary (e.g. for
   // finalization purposes), then we cannot inline it.
   if (hlfir::elementalOpMustProduceTemp(elemental))
@@ -48,12 +42,21 @@ getTwoUses(hlfir::ElementalOp elemental) {
 
   hlfir::ApplyOp apply;
   hlfir::DestroyOp destroy;
-  for (mlir::Operation *user : users)
+  unsigned applyCount = 0;
+
+  for (mlir::Operation *user : elemental->getUsers()) {
     mlir::TypeSwitch<mlir::Operation *, void>(user)
-        .Case([&](hlfir::ApplyOp op) { apply = op; })
+        .Case([&](hlfir::ApplyOp op) {
+          apply = op;
+          applyCount++;
+        })
         .Case([&](hlfir::DestroyOp op) { destroy = op; });
+  }
 
-  if (!apply || !destroy)
+  // Only inline if there is a unique 'apply' site. Other users (such as
+  // intrinsic operations) are allowed because scalarizing the elemental
+  // renders the original array result redundant.
+  if (applyCount != 1 || !destroy)
     return std::nullopt;
 
   // we can't inline if the return type of the yield doesn't match the return
@@ -80,7 +83,7 @@ class InlineElementalConversion
         getTwoUses(elemental);
     if (!maybeTuple)
       return rewriter.notifyMatchFailure(
-          elemental, "hlfir.elemental does not have two uses");
+          elemental, "hlfir.elemental is not a candidate for inlining");
 
     if (elemental.isOrdered()) {
       // We can only inline the ordered elemental into a loop-like
@@ -104,7 +107,9 @@ class InlineElementalConversion
     rewriter.replaceOp(apply, {yield.getElementValue()});
     rewriter.eraseOp(yield);
     rewriter.eraseOp(destroy);
-    rewriter.eraseOp(elemental);
+    // Only erase the elemental if that was its last use.
+    if (elemental->use_empty())
+      rewriter.eraseOp(elemental);
 
     return mlir::success();
   }
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index 9671de7c6eaa1..26c5b63cb05b6 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -32,94 +32,6 @@ namespace hlfir {
 
 #define DEBUG_TYPE "simplify-hlfir-intrinsics"
 
-namespace {
-// Check if the given mask is an equality comparison of the search array
-// against an invariant value (e.g., MASK = A == target) by traversing
-// HLFIR/FIR operations to find the underlying elemental comparison
-// and extract the invariant search targetVal.
-// It returns true if the mask is a simple equality comparison against a
-// scalar/invariant.
-bool isEqualityMask(mlir::Value mask, mlir::Value searchArray,
-                    mlir::Value &targetVal) {
-  if (!mask)
-    return false;
-
-  // Trace back HLFIR/FIR wrappers to get Elemental producer.
-  mlir::Value currentMask = mask;
-  while (auto def = currentMask.getDefiningOp()) {
-    if (!mlir::isa<hlfir::AsExprOp, fir::ConvertOp, hlfir::DeclareOp,
-                   hlfir::CopyInOp>(def))
-      break;
-    currentMask = def->getOperand(0);
-  }
-  // Ensure the mask is produced by an hlfir.elemental.
-  auto elemental = currentMask.getDefiningOp<hlfir::ElementalOp>();
-  if (!elemental)
-    return false;
-
-  // Inspect the elemental body to find the boolean result logic.
-  mlir::Block &body = elemental.getRegion().front();
-  auto yieldOp = mlir::cast<hlfir::YieldElementOp>(body.getTerminator());
-  mlir::Value val = yieldOp.getElementValue();
-  // Get core comparison, ignoring intermediate type casts.
-  while (auto conv = val.getDefiningOp<fir::ConvertOp>())
-    val = conv.getOperand();
-
-  // We currently only optimize integer equality (arith.cmpi eq).
-  auto cmpOp = val.getDefiningOp<mlir::arith::CmpIOp>();
-  if (!cmpOp || cmpOp.getPredicate() != mlir::arith::CmpIPredicate::eq)
-    return false;
-
-  // Determine if a value is invariant relative to the mask loop.
-  // Handles constants, function arguments, and values defined in outer scopes.
-  auto isInvariant = [&](mlir::Value v) {
-    if (auto arg = mlir::dyn_cast<mlir::BlockArgument>(v))
-      return arg.getOwner()->getParent() != &elemental.getRegion();
-    if (auto *op = v.getDefiningOp())
-      return !elemental.getRegion().isAncestor(op->getParentRegion());
-    return true;
-  };
-
-  // Trace the Array Side to the base buffer.
-  auto getBase = [](mlir::Value v) -> mlir::Value {
-    while (v) {
-      mlir::Operation *def = v.getDefiningOp();
-      if (!def)
-        break;
-      if (auto decl = mlir::dyn_cast<hlfir::DeclareOp>(def))
-        v = decl.getMemref();
-      else if (auto load = mlir::dyn_cast<fir::LoadOp>(def))
-        v = load.getMemref();
-      else if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(def))
-        v = apply.getExpr();
-      else if (auto des = mlir::dyn_cast<hlfir::DesignateOp>(def))
-        v = des.getMemref();
-      else if (mlir::isa<fir::ConvertOp, hlfir::AsExprOp>(def))
-        v = def->getOperand(0);
-      else
-        break;
-    }
-    return v;
-  };
-
-  mlir::Value lhs = cmpOp.getLhs(), rhs = cmpOp.getRhs();
-  bool lhsInv = isInvariant(lhs), rhsInv = isInvariant(rhs);
-  // The optimization is valid only if exactly one side is invariant (the
-  // target) and the other side is variant (the array element).
-  if (lhsInv == rhsInv)
-    return false;
-
-  targetVal = lhsInv ? lhs : rhs;
-  mlir::Value arraySide = lhsInv ? rhs : lhs;
-
-  // Verify the mask refers to the same array being searched.
-  if (getBase(arraySide) == getBase(searchArray))
-    return true;
-
-  return false;
-}
-} // end anonymous namespace
-
 static llvm::cl::opt<bool> forceMatmulAsElemental(
     "flang-inline-matmul-as-elemental",
     llvm::cl::desc("Expand hlfir.matmul as elemental operation"),
@@ -618,15 +530,6 @@ class MinMaxlocAsElementalConverter : public ReductionAsElementalConverter {
 
   void
   checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
-    mlir::Value targetVal;
-    // Check if the mask qualifies for the optimized equality mask search path.
-    if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
-                       targetVal)) {
-      // Expect coordinate indices.
-      assert(reductions.size() == getNumCoors() &&
-             "invalid number of reductions for equality mask MINLOC/MAXLOC");
-      return;
-    }
     if (!useIsFirst())
       assert(reductions.size() == getNumCoors() + 1 &&
              "invalid number of reductions for MINLOC/MAXLOC");
@@ -736,51 +639,6 @@ llvm::SmallVector<mlir::Value>
 MinMaxlocAsElementalConverter<T>::reduceOneElement(
     const llvm::SmallVectorImpl<mlir::Value> &currentValue, hlfir::Entity array,
     mlir::ValueRange oneBasedIndices) {
-  mlir::Value targetVal;
-  // The mask is an equality comparison (e.g., MASK = A == target) inline the
-  // comparison to find the first occurrence efficiently.
-  if (isEqualityMask(this->getMask(), array, targetVal)) {
-    // Directly load the array element and compare with the targetVal.
-    hlfir::Entity elementValue =
-        hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
-    mlir::Value isMatch = mlir::arith::CmpIOp::create(
-        builder, loc, mlir::arith::CmpIPredicate::eq, (mlir::Value)elementValue,
-        targetVal);
-    // currentValue contains [Coord1, ..., CoordN, FirstHitBool]
-    mlir::Value firstHitBool = currentValue.back();
-    // shouldUpdate is true only if we have a match and we haven't found one
-    // yet.
-    mlir::Value shouldUpdate =
-        mlir::arith::AndIOp::create(builder, loc, isMatch, firstHitBool);
-    // Conditional Update: Only update coordinates if a match is found.
-    auto ifOp = fir::IfOp::create(builder, loc,
-                                  mlir::ValueRange(currentValue).getTypes(),
-                                  shouldUpdate, /*withElse=*/true);
-    // If match found and it's the first one, record coordinates.
-    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-    llvm::SmallVector<mlir::Value> thenResults;
-    unsigned rank = array.getRank();
-    // Get the firstHit flag.
-    for (unsigned i = 0; i < rank; ++i) {
-      mlir::Value loopIdx = builder.createConvert(
-          loc, currentValue[i].getType(), oneBasedIndices[i]);
-      thenResults.emplace_back(loopIdx);
-    }
-
-    // Update the flag: Set to 0 (False) for all future iterations.
-    mlir::Value falseVal =
-        mlir::arith::ConstantIntOp::create(builder, loc, 0, 1);
-    thenResults.emplace_back(falseVal);
-
-    fir::ResultOp::create(builder, loc, thenResults);
-
-    // No match or already found a previous match: maintain the current state.
-    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
-    fir::ResultOp::create(builder, loc, currentValue);
-
-    builder.setInsertionPointAfter(ifOp);
-    return ifOp.getResults();
-  }
   checkReductions(currentValue);
   hlfir::Entity elementValue =
       hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
@@ -831,49 +689,6 @@ MinMaxlocAsElementalConverter<T>::reduceOneElement(
 template <typename T>
 hlfir::Entity MinMaxlocAsElementalConverter<T>::genFinalResult(
     const llvm::SmallVectorImpl<mlir::Value> &reductionResults) {
-  mlir::Value targetVal;
-  // Finalize results for the equality-mask search.
-  if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
-                     targetVal)) {
-    unsigned rank = getNumCoors();
-    mlir::Type resultElemTy =
-        hlfir::getFortranElementType(this->getResultType());
-    // MINLOC/MAXLOC returns an integer array of shape [rank].
-    // Manually build the HLFIR expression to hold the resulting coordinates.
-    llvm::SmallVector<int64_t> shapeVec{static_cast<int64_t>(rank)};
-    mlir::Type exprTy = hlfir::ExprType::get(builder.getContext(), shapeVec,
-                                             resultElemTy, false);
-    mlir::Value resRank =
-        builder.createIntegerConstant(loc, builder.getIndexType(), rank);
-    mlir::Value resShape = fir::ShapeOp::create(builder, loc, resRank);
-
-    // Create an elemental operation to map the scalar reduction results
-    // (coordinates) back into a Fortran array result.
-    auto elemental =
-        hlfir::ElementalOp::create(builder, loc, exprTy, resShape,
-                                   /*mold=*/mlir::Value{},
-                                   /*typeparams=*/mlir::ValueRange{},
-                                   /*isUnordered=*/false);
-    {
-      // Fill the elemental body.
-      mlir::OpBuilder::InsertionGuard guard(builder);
-      builder.setInsertionPointToStart(elemental.getBody());
-      // Map the 1-based elemental index, result[i] = reductionResults[i-1].
-      mlir::Value elemIdx = elemental.getIndices()[0];
-      mlir::Value resultVal = reductionResults[0];
-      for (unsigned i = 1; i < rank; ++i) {
-        mlir::Value dimConst =
-            builder.createIntegerConstant(loc, builder.getIndexType(), i + 1);
-        mlir::Value isDimMatch = mlir::arith::CmpIOp::create(
-            builder, loc, mlir::arith::CmpIPredicate::eq, elemIdx, dimConst);
-        // Select specific coordinate matching current elemental dimension.
-        resultVal = mlir::arith::SelectOp::create(
-            builder, loc, isDimMatch, reductionResults[i], resultVal);
-      }
-      hlfir::YieldElementOp::create(builder, loc, resultVal);
-    }
-    return hlfir::Entity{elemental.getResult()};
-  }
   // Identification of the final result of MINLOC/MAXLOC:
   //   * If DIM is absent, the result is rank-one array.
   //   * If DIM is present:
@@ -1370,39 +1185,9 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       extents.push_back(
           builder.createConvert(loc, builder.getIndexType(), dimExtent));
 
-    mlir::Value minMaxMask;
-    if (auto minloc = mlir::dyn_cast<hlfir::MinlocOp>(op)) {
-      minMaxMask = minloc.getMask();
-    } else if (auto maxloc = mlir::dyn_cast<hlfir::MaxlocOp>(op)) {
-      minMaxMask = maxloc.getMask();
-    }
-    mlir::Value targetVal;
-    bool isFixedSearch = false;
-    // Check if the mask allows for a simplified search optimization.
-    if (minMaxMask)
-      isFixedSearch =
-          isEqualityMask(minMaxMask, this->op->getOperand(0), targetVal);
-    llvm::SmallVector<mlir::Value, 1> reductionInitValues;
-    if (isFixedSearch) {
-      // For optimized equality searches, we skip the 'Min/Max value' reduction
-      // and only track coordinate indices and the firstHit flag.
-      unsigned rank = hlfir::Entity{array}.getRank();
-      mlir::Type resElemTy =
-          hlfir::getFortranElementType(this->getResultType());
-      mlir::Value zeroVal = builder.createIntegerConstant(loc, resElemTy, 0);
-
-      // Initialize all coordinates to 0.
-      for (unsigned i = 0; i < rank; ++i) {
-        reductionInitValues.emplace_back(zeroVal);
-      }
-      // First hit flag: [Row, Col, FirstHit=1] (Size: 3)
-      mlir::Type i1Type = builder.getI1Type();
-      mlir::Value firstHitTrue = mlir::arith::ConstantOp::create(
-          builder, loc, i1Type, builder.getBoolAttr(true));
-      reductionInitValues.emplace_back(firstHitTrue);
-    } else {
-      reductionInitValues = genReductionInitValues(inputIndices, extents);
-    }
+    // Initial value for the reduction.
+    llvm::SmallVector<mlir::Value, 1> reductionInitValues =
+        genReductionInitValues(inputIndices, extents);
 
     auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
                        mlir::ValueRange oneBasedIndices,
@@ -1424,9 +1209,7 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       llvm::transform(reductionValues, std::back_inserter(reductionTypes),
                       [](mlir::Value v) { return v.getType(); });
       fir::IfOp ifOp;
-      // Skip standard masking block in case of 'isFixedSearch', as it handles
-      // its own masking logic inside the comparison.
-      if (mask && !isFixedSearch) {
+      if (mask) {
         // Make the reduction value update conditional on the value
         // of the mask.
         if (!maskValue) {
diff --git a/flang/test/HLFIR/inline-elemental-multi-users.fir b/flang/test/HLFIR/inline-elemental-multi-users.fir
new file mode 100644
index 0000000000000..f8b195e637091
--- /dev/null
+++ b/flang/test/HLFIR/inline-elemental-multi-users.fir
@@ -0,0 +1,342 @@
+// RUN: fir-opt --inline-elementals %s | FileCheck %s
+
+// Test inlining of hlfir.elemental into its hlfir.apply site when the 
+// elemental has more than two users.
+
+// Check successful inlining where hlfir.elemental survives because the
+// 'associate' op is still using it.
+func.func @test_inlining_use_mask(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
+
+  // Elemental Mask.
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+    %load = fir.load %val : !fir.ref<i32>
+    %ref = fir.load %arg1 : !fir.ref<i32>
+    %cmp = arith.cmpi eq, %load, %ref : i32
+    %res = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %res : !fir.logical<4>
+  }
+
+  // Extra User - This keeps the elemental alive even after inlining the apply
+  // site. Total uses = 3 (associate, apply, destroy).
+  %extra:3 = hlfir.associate %mask : (!hlfir.expr<10x10x!fir.logical<4>>) -> (!fir.box<!fir.array<10x10x!fir.logical<4>>>, !fir.ref<!fir.array<10x10x!fir.logical<4>>>, i1)
+
+  // Apply inside a loop
+  fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg3 = %c1 to %c10 step %c1 {
+      %applied = hlfir.apply %mask, %arg3, %arg2 : (!hlfir.expr<10x10x!fir.logical<4>>, index, index) -> !fir.logical<4>
+      %dummy_ref = fir.alloca !fir.logical<4>
+      fir.store %applied to %dummy_ref : !fir.ref<!fir.logical<4>>
+    }
+ }
+
+  hlfir.end_associate %extra#0, %extra#2 : !fir.box<!fir.array<10x10x!fir.logical<4>>>, i1
+  hlfir.destroy %mask : !hlfir.expr<10x10x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL: func.func @test_inlining_use_mask
+// CHECK: %[[MASK:.*]] = hlfir.elemental
+// CHECK: hlfir.associate %[[MASK]]
+// CHECK: fir.do_loop
+// CHECK:   fir.do_loop
+// CHECK-NOT: hlfir.apply
+// CHECK:     %[[VAL:.*]] = hlfir.designate %arg0
+// CHECK:     %[[LOAD:.*]] = fir.load %[[VAL]]
+// CHECK:     %[[REF:.*]] = fir.load %arg1
+// CHECK:     %[[CMP:.*]] = arith.cmpi eq, %[[LOAD]], %[[REF]]
+// CHECK:     %[[RES:.*]] = fir.convert %[[CMP]]
+// CHECK:     fir.store %[[RES]]
+// CHECK-NOT: hlfir.destroy %[[MASK]]
+
+// Test elemental removal as use_count becomes zero.
+func.func @test_inlining_elemental_cleanup(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
+
+  // Elemental Mask.
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+    %load = fir.load %val : !fir.ref<i32>
+    %ref = fir.load %arg1 : !fir.ref<i32>
+    %cmp = arith.cmpi eq, %load, %ref : i32
+    %res = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %res : !fir.logical<4>
+  }
+
+  %extra = fir.convert %mask : (!hlfir.expr<10x10x!fir.logical<4>>) -> !hlfir.expr<10x10x!fir.logical<4>>
+
+  // Apply Site.
+  fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg3 = %c1 to %c10 step %c1 {
+      %applied = hlfir.apply %mask, %arg3, %arg2 : (!hlfir.expr<10x10x!fir.logical<4>>, index, index) -> !fir.logical<4>
+      %dummy_ref = fir.alloca !fir.logical<4>
+      fir.store %applied to %dummy_ref : !fir.ref<!fir.logical<4>>
+    }
+  }
+
+  hlfir.destroy %mask : !hlfir.expr<10x10x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL: func.func @test_inlining_elemental_cleanup
+// CHECK-NOT:     hlfir.elemental
+// CHECK-NOT:     fir.convert
+// CHECK:         fir.do_loop
+// CHECK:           fir.do_loop
+// CHECK-NOT:         hlfir.apply
+// CHECK:             arith.cmpi eq
+// CHECK-NOT:     hlfir.destroy
+
+// Check that inlining is blocked when there is more than one hlfir.apply
+// site for the same elemental.
+func.func @test_multi_apply_no_inlining(%arg0: !hlfir.expr<?xi32>, %target: i32, %shape: !fir.shape<1>) -> (i1, i1) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+
+  // Producer (Elemental)
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+    ^bb0(%i: index):
+      %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+      %cmp = arith.cmpi eq, %val, %target : i32
+      %log = fir.convert %cmp : (i1) -> !fir.logical<4>
+      hlfir.yield_element %log : !fir.logical<4>
+  }
+
+  // First Apply Site.
+  %apply1 = hlfir.apply %mask, %c1 : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+  %cond1 = fir.convert %apply1 : (!fir.logical<4>) -> i1
+
+  // Second Apply Site.
+  %apply2 = hlfir.apply %mask, %c2 : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+  %cond2 = fir.convert %apply2 : (!fir.logical<4>) -> i1
+
+  // Destroy.
+  hlfir.destroy %mask : !hlfir.expr<?x!fir.logical<4>>
+
+  return %cond1, %cond2 : i1, i1
+}
+// CHECK-LABEL: func.func @test_multi_apply_no_inlining(
+// CHECK-SAME:  %[[ARG0:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32, %[[SHAPE:.*]]: !fir.shape<1>)
+// CHECK:       %[[MASK:.*]] = hlfir.elemental %[[SHAPE]]
+// CHECK:       %[[A1:.*]] = hlfir.apply %[[MASK]], %{{.*}}
+// CHECK:       %[[A2:.*]] = hlfir.apply %[[MASK]], %{{.*}}
+// CHECK:       hlfir.destroy %[[MASK]]
+
+// Check inlining of an elemental into an hlfir.apply inside a loop nest
+// when the elemental is also stored to a global (fir.store is an extra user).
+// Declare a global symbol to store the intermediate mask.
+fir.global @mask_storage : !hlfir.expr<10x10x!fir.logical<4>>
+func.func @test_nested_elemental(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
+
+  // The Elemental Mask (arg0(i, j) == arg1)
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+    %load = fir.load %val : !fir.ref<i32>
+    %ref = fir.load %arg1 : !fir.ref<i32>
+    %cmp = arith.cmpi eq, %load, %ref : i32
+    %res = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %res : !fir.logical<4>
+  }
+
+  // Total users = 3 (store, apply, destroy).
+  %ptr = fir.address_of(@mask_storage) : !fir.ref<!hlfir.expr<10x10x!fir.logical<4>>>
+  fir.store %mask to %ptr : !fir.ref<!hlfir.expr<10x10x!fir.logical<4>>>
+
+  // Target loop using the mask.
+  fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg3 = %c1 to %c10 step %c1 {
+      // CHECK-NOT: hlfir.apply
+      %applied = hlfir.apply %mask, %arg3, %arg2 : (!hlfir.expr<10x10x!fir.logical<4>>, index, index) -> !fir.logical<4>
+      %dummy_ref = fir.alloca !fir.logical<4>
+      fir.store %applied to %dummy_ref : !fir.ref<!fir.logical<4>>
+    }
+  }
+
+  hlfir.destroy %mask : !hlfir.expr<10x10x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL: func.func @test_nested_elemental
+// CHECK:         hlfir.elemental
+// CHECK-NOT:       hlfir.apply
+// CHECK:           arith.cmpi eq
+// CHECK-NOT:         hlfir.destroy
+
+// Inlining into a single hlfir.apply.
+// a = (b * c)(1)
+func.func @test_scalar_apply_inline(%b: !fir.box<!fir.array<?xf32>>, %c: !fir.box<!fir.array<?xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  // Elemental (b * c)
+  %prod = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+  ^bb0(%i: index):
+    %b_ref = hlfir.designate %b (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %b_val = fir.load %b_ref : !fir.ref<f32>
+    %c_ref = hlfir.designate %c (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %c_val = fir.load %c_ref : !fir.ref<f32>
+    %mul = arith.mulf %b_val, %c_val : f32
+    hlfir.yield_element %mul : f32
+  }
+
+  // Total users = 3 (store, apply, destroy).
+  %ptr = fir.address_of(@scalar_storage) : !fir.ref<!hlfir.expr<10xf32>>
+  fir.store %prod to %ptr : !fir.ref<!hlfir.expr<10xf32>>
+
+  // Scalar apply site - a = (b * c)(1)
+  // CHECK-NOT: hlfir.apply
+  %scalar_val = hlfir.apply %prod, %c1 : (!hlfir.expr<10xf32>, index) -> f32
+  
+  // Use the scalar result.
+  %dummy_ref = fir.alloca f32
+  fir.store %scalar_val to %dummy_ref : !fir.ref<f32>
+
+  hlfir.destroy %prod : !hlfir.expr<10xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_scalar_apply_inline
+// CHECK:         %[[MASK:.*]] = hlfir.elemental
+// CHECK:         fir.store %[[MASK]] to {{.*}}
+// CHECK-NOT:     hlfir.apply
+// CHECK:         arith.mulf
+// CHECK-NOT:     hlfir.destroy
+
+// Check long chains of elementals.
+// subroutine reproducer(a)
+//   real, dimension(:) :: a
+//   a = sqrt(a * (a - 1))
+// end subroutine
+func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) {
+  %c0 = arith.constant 0 : index
+  %f1 = arith.constant 1.0 : f32
+  %0:2 = hlfir.declare %arg0 {uniq_name = "_QFreproducerEa"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
+
+  // tmp1 = a - 1
+  %tmp1 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%i: index):
+    %a_ref = hlfir.designate %0#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %a_val = fir.load %a_ref : !fir.ref<f32>
+    %sub = arith.subf %a_val, %f1 : f32
+    hlfir.yield_element %sub : f32
+  }
+
+  %dummy = hlfir.no_reassoc %tmp1 : !hlfir.expr<?xf32>
+
+  // tmp2 = a * tmp1
+  %tmp2 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%j: index):
+    %a_ref_2 = hlfir.designate %0#0 (%j) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %t1_val = hlfir.apply %tmp1, %j : (!hlfir.expr<?xf32>, index) -> f32
+    %a_val_2 = fir.load %a_ref_2 : !fir.ref<f32>
+    %mul = arith.mulf %a_val_2, %t1_val : f32
+    hlfir.yield_element %mul : f32
+  }
+
+  // tmp3 = sqrt(tmp2)
+  %tmp3 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%k: index):
+    %t2_val = hlfir.apply %tmp2, %k : (!hlfir.expr<?xf32>, index) -> f32
+    %res = math.sqrt %t2_val : f32
+    hlfir.yield_element %res : f32
+  }
+
+  // Final assignment.
+  hlfir.assign %tmp3 to %0#0 : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
+
+  hlfir.destroy %tmp3 : !hlfir.expr<?xf32>
+  hlfir.destroy %tmp2 : !hlfir.expr<?xf32>
+  hlfir.destroy %dummy: !hlfir.expr<?xf32>
+  hlfir.destroy %tmp1 : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL: func.func @_QPreproducer
+// CHECK:         %[[TMP1:.*]] = hlfir.elemental
+// CHECK:         hlfir.no_reassoc %[[TMP1]]
+// CHECK-NOT:     hlfir.apply
+// CHECK-DAG:     arith.subf
+// CHECK-DAG:     arith.mulf
+// CHECK-DAG:     math.sqrt
+// CHECK:         hlfir.assign
+// The apply site was inlined, so the elemental's lifecycle (destroy) 
+// is removed even though metadata users like no_reassoc remain.
+// CHECK-NOT:     hlfir.destroy %[[TMP1]]
+
+// Check that the ordered elemental is not inlined into another:
+// a = (b + c) + (b + c) (where b + c is ordered)
+func.func private @persistent_user(!hlfir.expr<?xf32>)
+func.func @test_noinline_ordered(%arg0: !hlfir.expr<?xf32>, %arg1: !hlfir.expr<?xf32>, %shape: !fir.shape<1>) {
+  %c1 = arith.constant 1 : index
+
+  // Producer (b + c) - ordered
+  %el_a = hlfir.elemental %shape {ordered = true} : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+    ^bb0(%i: index):
+      %0 = hlfir.apply %arg0, %i : (!hlfir.expr<?xf32>, index) -> f32
+      %1 = hlfir.apply %arg1, %i : (!hlfir.expr<?xf32>, index) -> f32
+      %sum = arith.addf %0, %1 : f32
+      hlfir.yield_element %sum : f32
+  }
+
+  fir.call @persistent_user(%el_a) : (!hlfir.expr<?xf32>) -> ()
+
+  // Consumer (el_a + el_a)
+  %el_b = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+    ^bb0(%j: index):
+      // This apply must remain because el_a is ordered.
+      %a_val = hlfir.apply %el_a, %j : (!hlfir.expr<?xf32>, index) -> f32
+      %total = arith.addf %a_val, %a_val : f32
+      hlfir.yield_element %total : f32
+  }
+
+  hlfir.destroy %el_a : !hlfir.expr<?xf32>
+  %val = hlfir.apply %el_b, %c1 : (!hlfir.expr<?xf32>, index) -> f32
+  hlfir.destroy %el_b : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_noinline_ordered
+// CHECK:         %[[PRODUCER:.*]] = hlfir.elemental %{{.*}} {ordered = true}
+// CHECK:         fir.call @persistent_user(%[[PRODUCER]])
+// CHECK:         %[[CONSUMER:.*]] = hlfir.elemental
+// CHECK:           hlfir.apply %[[PRODUCER]], %{{.*}}
+// CHECK:         hlfir.destroy %[[PRODUCER]]
+
+// Check that the elemental is not inlined, because its array result
+// must be finalized.
+func.func @test_noinline_due_to_finalization(%arg0: !fir.box<!fir.array<?x!fir.type<_QMtypesTt1{x:f32}>>>, %shape: !fir.shape<1>) {
+  %c1 = arith.constant 1 : index
+
+  // Producer - Derived-type Elemental.
+  %el = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>> {
+  ^bb0(%i: index):
+    %res = fir.alloca !fir.type<_QMtypesTt1{x:f32}>
+    %ld = fir.load %res : !fir.ref<!fir.type<_QMtypesTt1{x:f32}>>
+    hlfir.yield_element %ld : !fir.type<_QMtypesTt1{x:f32}>
+  }
+
+  fir.call @persistent_user(%el) : (!hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>>) -> ()
+
+  // This apply must remain because the elemental result requires finalization.
+  %res_ptr = fir.alloca !fir.type<_QMtypesTt1{x:f32}>
+  %apply = hlfir.apply %el, %c1 : (!hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>>, index) -> !hlfir.expr<!fir.type<_QMtypesTt1{x:f32}>>
+  hlfir.assign %apply to %res_ptr : !hlfir.expr<!fir.type<_QMtypesTt1{x:f32}>>, !fir.ref<!fir.type<_QMtypesTt1{x:f32}>>
+
+  // Destroy with 'finalize' keyword, hlfir::elementalOpMustProduceTemp becomes
+  // true.
+  hlfir.destroy %el finalize : !hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>>
+  return
+}
+// CHECK-LABEL: func.func @test_noinline_due_to_finalization
+// CHECK:         %[[EL:.*]] = hlfir.elemental
+// CHECK:         fir.call @persistent_user(%[[EL]])
+// CHECK:         %[[APPLY:.*]] = hlfir.apply %[[EL]], %{{.*}}
+// CHECK:         hlfir.destroy %[[EL]] finalize
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
deleted file mode 100644
index 31925ae41467e..0000000000000
--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
+++ /dev/null
@@ -1,269 +0,0 @@
-// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
-
-// Rank 1: Variable: A == %target
-func.func @test_maxloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_1d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK-NOT: arith.constant -2147483648 : i32
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Variable: A == %target
-func.func @test_maxloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_2d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
-// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
-
-// Rank 3: Variable: A == %target
-func.func @test_maxloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_3d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
-
-// Rank 1: Constant: A == 42
-func.func @test_maxloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_1d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-
-// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Constant: A == 42
-func.func @test_maxloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_2d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
-
-// Rank 3: Constant: A == 42
-func.func @test_maxloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_3d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-
-// No Match: Result must be 0
-func.func @test_maxloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c99 = arith.constant 99 : i32 
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c99 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_no_match(
-// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
-// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %false
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// First Match: Duplicate values
-func.func @test_maxloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_first_match(
-// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
-// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// Verify mask elemental is bypassed
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// Verify the "Locking" logic: (Match 'and' is_first)
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// Verify that once a match is found, we result in %false to lock it
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// Negative test: Mask refers to a different array (%arg1) than the search 
-// array (%arg0).
-func.func @test_maxloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    // Optimization should fail here because %arg1 != %arg0
-    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val_b, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_different_arrays(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
-// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
-// Verify the loop uses three iter_args (standard path)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-// Verify the mask is applied (Since we can't inline it safely)
-// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL_A]], %[[MAX]]
-
-// Negative Test: The target value is another array, so it is not invariant.
-func.func @test_maxloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    // Optimization should fail here because %val_target is defined inside the elemental
-    %cmp = arith.cmpi eq, %val_a, %val_target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_non_invariant_target(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
-// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
-// Verify the loop uses three iter_args (Standard path)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-// Verify the mask is still applied (because we couldn't inline the comparison)
-// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL]], %[[MAX]]
-
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
deleted file mode 100644
index 0bfa58968a2fe..0000000000000
--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
+++ /dev/null
@@ -1,274 +0,0 @@
-// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
-
-// Rank 1: Variable: A == %target
-func.func @test_minloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_1d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK-NOT: arith.constant 2147483647 : i32
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Variable: A == %target
-func.func @test_minloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_2d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
-// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
-
-// Rank 3: Variable: A == %target
-func.func @test_minloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_3d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
-
-// Rank 1: Constant: A == 42
-func.func @test_minloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_1d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-
-// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Constant: A == 42
-func.func @test_minloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_2d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
-
-// Rank 3: Constant: A == 42
-func.func @test_minloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_3d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-
-// No Match: Result must be 0
-func.func @test_minloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c99 = arith.constant 99 : i32 
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c99 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_no_match(
-// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
-// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %false
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// First Match: Duplicate values
-func.func @test_minloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_first_match(
-// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
-// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// Verify mask elemental is bypassed
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// Verify the "Locking" logic: (Match AND is_first)
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// Verify that once a match is found, we result in %false to lock it
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// Negative test: Mask refers to a different array (%arg1) than the search 
-// array (%arg0).
-func.func @test_minloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    // Optimization should fail here because %arg1 != %arg0
-    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val_b, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_different_arrays(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
-// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
-
-// 1. Verify the loop uses three iter_args (Standard path: Loc, MinVal, FirstHit)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MIN_VAL:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-
-// 2. Verify the mask IS still applied (Optimization correctly skipped)
-// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
-
-// 3. Verify the standard path's MINLOC comparison logic (slt instead of sgt)
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL_A]], %[[MIN_VAL]] : i32
-
-// Negative Test: The target value is another array, so it is not invariant.
-func.func @test_minloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    // Optimization should fail here because %val_target is defined inside the 
-    // elemental
-    %cmp = arith.cmpi eq, %val_a, %val_target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_non_invariant_target(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
-// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
-// Verify the loop uses three iter_args (standard path)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-// Verify the mask is still applied (because we couldn't inline the comparison)
-// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL]], %[[MAX]]
-

>From 5592e0ab8042652b8fa1b64a4fe5e0a195f4e7f0 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Wed, 1 Apr 2026 00:35:48 +0200
Subject: [PATCH 3/4] [flang][HLFIR] This patch adds a data-flow and
 memory-effect analysis to the InlineElementals pass to ensure semantic safety.
 It replaces the "two-use" constraint with a worklist-based traversal to
 trace elemental results through hlfir.declare and fir.convert. A new safety
 check, isSafeToInline, uses AliasAnalysis and a recursive region walk to
 detect conflicting writes between the producer and consumer, preventing
 unsafe inlining across structured control flow.

---
 .../HLFIR/Transforms/InlineElementals.cpp     | 216 +++++++++++-
 .../HLFIR/inline-elemental-multi-users.fir    | 315 +++++++++++++++---
 2 files changed, 480 insertions(+), 51 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index e3ced12dc93b3..9a6ad46309947 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -16,9 +16,11 @@
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/Analysis/AliasAnalysis.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -31,10 +33,127 @@ namespace hlfir {
 #include "flang/Optimizer/HLFIR/Passes.h.inc"
 } // namespace hlfir
 
+/// Appends to \p deps the memory values the elemental body may read from: the
+/// memrefs of designate/load ops and ref/ptr/heap/box operands from outside.
+static void getReadDependencies(hlfir::ElementalOp elemental,
+                                llvm::SmallVectorImpl<mlir::Value> &deps) {
+  elemental.getRegion().walk([&](mlir::Operation *op) {
+    if (auto designate = mlir::dyn_cast<hlfir::DesignateOp>(op))
+      deps.push_back(designate.getMemref());
+    else if (auto load = mlir::dyn_cast<fir::LoadOp>(op))
+      deps.push_back(load.getMemref());
+    // Also record memory-typed operands not defined directly in this region.
+    for (mlir::Value operand : op->getOperands()) {
+      if (operand.getParentRegion() != &elemental.getRegion())
+        if (mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
+                      fir::BoxType>(operand.getType()))
+          deps.push_back(operand);
+    }
+  });
+}
+
+/// Returns true if \p op or an op nested in its regions may write or free
+/// memory that may alias any of the elemental's read dependencies \p deps.
+static bool isConflictingWrite(mlir::Operation *op,
+                               const llvm::SmallVectorImpl<mlir::Value> &deps,
+                               mlir::AliasAnalysis &aa) {
+  // Operations explicitly marked as having no memory effects are safe.
+  if (mlir::isMemoryEffectFree(op))
+    return false;
+
+  // Explicitly allow safe HLFIR/FIR metadata/lifetime operations.
+  // While these may have internal effects (e.g. allocating a descriptor),
+  // they do not modify the user data being read by the elemental.
+  if (mlir::isa<hlfir::DeclareOp, hlfir::AssociateOp, hlfir::EndAssociateOp,
+                fir::AllocaOp, hlfir::NoReassocOp>(op))
+    return false;
+
+  // Check for explicit memory effects via the MemoryEffectOpInterface.
+  if (auto memInterface = mlir::dyn_cast<mlir::MemoryEffectOpInterface>(op)) {
+    llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
+    memInterface.getEffects(effects);
+
+    for (const auto &effect : effects) {
+      // Analyze effects that modify memory or release resources.
+      if (mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect()) ||
+          mlir::isa<mlir::MemoryEffects::Free>(effect.getEffect())) {
+
+        mlir::Value accessedValue = effect.getValue();
+        // If the effect is on an unknown resource (e.g. external call),
+        // assume a conflict.
+        if (!accessedValue)
+          return true;
+
+        // Perform alias analysis against all read dependencies.
+        for (mlir::Value dep : deps) {
+          if (!aa.alias(accessedValue, dep).isNo())
+            return true;
+        }
+      }
+    }
+  } else if (op->getNumRegions() == 0) {
+    // Conservative fallback: if an operation lacks the interface and has no
+    // regions (e.g. a fir.call to an external function), assume it may
+    // modify any memory.
+    return true;
+  }
+
+  // Recurse into the regions of structured control flow operations
+  // (e.g. fir.if, fir.do_loop) to find nested conflicting writes.
+  for (mlir::Region &region : op->getRegions()) {
+    for (mlir::Block &block : region) {
+      for (mlir::Operation &nestedOp : block) {
+        if (isConflictingWrite(&nestedOp, deps, aa))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool isSafeToInline(hlfir::ElementalOp producer, hlfir::ApplyOp applySite,
+                    mlir::AliasAnalysis &aa) {
+  mlir::DominanceInfo domInfo(producer->getParentOp());
+  if (!domInfo.properlyDominates(producer.getOperation(),
+                                 applySite.getOperation()))
+    return false;
+
+  llvm::SmallVector<mlir::Value> deps;
+  getReadDependencies(producer, deps);
+
+  mlir::Operation *func = producer->getParentOfType<mlir::func::FuncOp>();
+  bool conflict = false;
+
+  func->walk([&](mlir::Operation *op) {
+    // Skip the producer and applySite themselves.
+    if (op == producer.getOperation() || op == applySite.getOperation())
+      return mlir::WalkResult::advance();
+
+    // Skip ops that enclose the applySite; ops nested in them are still walked.
+    // NOTE(review): a write after the apply inside an enclosing loop does not
+    // dominate the apply, so it is never checked here; confirm this is safe.
+    if (op->isAncestor(applySite.getOperation()))
+      return mlir::WalkResult::advance();
+
+    // Only check ops properly dominated by the producer that dominate the use.
+    if (domInfo.properlyDominates(producer.getOperation(), op) &&
+        domInfo.dominates(op, applySite.getOperation())) {
+      if (isConflictingWrite(op, deps, aa)) {
+        conflict = true;
+        return mlir::WalkResult::interrupt();
+      }
+    }
+    return mlir::WalkResult::advance();
+  });
+
+  return !conflict;
+}
+
 /// If the elemental has only two uses and those two are an apply operation and
 /// a destroy operation, return those two, otherwise return {}
 static std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>>
-getTwoUses(hlfir::ElementalOp elemental) {
+getTwoUses(hlfir::ElementalOp elemental, mlir::AliasAnalysis &aliasAnalysis) {
   // If the ElementalOp must produce a temporary (e.g. for
   // finalization purposes), then we cannot inline it.
   if (hlfir::elementalOpMustProduceTemp(elemental))
@@ -44,13 +163,77 @@ getTwoUses(hlfir::ElementalOp elemental) {
   hlfir::DestroyOp destroy;
   unsigned applyCount = 0;
 
-  for (mlir::Operation *user : elemental->getUsers()) {
-    mlir::TypeSwitch<mlir::Operation *, void>(user)
-        .Case([&](hlfir::ApplyOp op) {
-          apply = op;
-          applyCount++;
-        })
-        .Case([&](hlfir::DestroyOp op) { destroy = op; });
+  llvm::SmallVector<mlir::Value> worklist;
+  worklist.push_back(elemental.getResult());
+  llvm::SmallPtrSet<mlir::Value, 16> visited;
+
+  while (!worklist.empty()) {
+    mlir::Value current = worklist.pop_back_val();
+    if (!current || !visited.insert(current).second)
+      continue;
+
+    for (mlir::OpOperand &use : current.getUses()) {
+      mlir::Operation *user = use.getOwner();
+
+      mlir::TypeSwitch<mlir::Operation *, void>(user)
+          .Case<hlfir::ApplyOp>([&](hlfir::ApplyOp op) {
+            apply = op;
+            applyCount++;
+          })
+          .Case<hlfir::DestroyOp>([&](hlfir::DestroyOp op) {
+            // Track the mandatory destroy operation for the elemental expr.
+            destroy = op;
+          })
+          .Case<hlfir::DeclareOp>([&](hlfir::DeclareOp op) {
+            // Follow the dataflow through variable declarations.
+            worklist.push_back(op.getBase());
+          })
+          .Case<fir::ConvertOp>([&](fir::ConvertOp op) {
+            // Follow the dataflow through type conversions.
+            worklist.push_back(op.getResult());
+          })
+          .Case<mlir::BranchOpInterface>([&](mlir::BranchOpInterface branch) {
+            for (unsigned i = 0; i < branch->getNumSuccessors(); ++i) {
+              mlir::SuccessorOperands operands = branch.getSuccessorOperands(i);
+              for (unsigned j = 0; j < operands.size(); ++j) {
+                if (operands[j] == current) {
+                  // The j-th operand of the branch maps to the j-th block
+                  // argument of the successor block.
+                  mlir::Block *successor = branch->getSuccessor(i);
+                  worklist.push_back(successor->getArgument(j));
+                }
+              }
+            }
+          })
+          .Case<fir::ResultOp>([&](fir::ResultOp op) {
+            mlir::Operation *parent = op->getParentOp();
+            if (parent) {
+              for (auto it : llvm::enumerate(op.getOperands())) {
+                if (it.value() == current) {
+                  // 'current' is being yielded. The value outside the loop is
+                  // the i-th result of the parent operation.
+                  unsigned i = it.index();
+                  if (i < parent->getNumResults()) {
+                    worklist.push_back(parent->getResult(i));
+                  }
+                }
+              }
+            }
+          })
+          .Default([&](mlir::Operation *op) {
+            // If the elemental result is used by an operation with regions
+            // (like fir.if or fir.do_loop), the apply site may be nested
+            // inside.
+            if (op->getNumRegions() > 0) {
+              op->walk([&](hlfir::ApplyOp nestedApply) {
+                if (nestedApply.getExpr() == current) {
+                  apply = nestedApply;
+                  applyCount++;
+                }
+              });
+            }
+          });
+    }
   }
 
   // Only inline if there is a unique 'apply' site. Other users (such as
@@ -59,6 +242,10 @@ getTwoUses(hlfir::ElementalOp elemental) {
   if (applyCount != 1 || !destroy)
     return std::nullopt;
 
+  // Verify memory effect and dataflow analysis.
+  if (!isSafeToInline(elemental, apply, aliasAnalysis))
+    return std::nullopt;
+
   // we can't inline if the return type of the yield doesn't match the return
   // type of the apply
   auto yield = mlir::dyn_cast_or_null<hlfir::YieldElementOp>(
@@ -75,12 +262,14 @@ class InlineElementalConversion
     : public mlir::OpRewritePattern<hlfir::ElementalOp> {
 public:
   using mlir::OpRewritePattern<hlfir::ElementalOp>::OpRewritePattern;
-
+  explicit InlineElementalConversion(mlir::MLIRContext *context,
+                                     mlir::AliasAnalysis &aa)
+      : OpRewritePattern<hlfir::ElementalOp>(context), aliasAnalysis(aa) {}
   llvm::LogicalResult
   matchAndRewrite(hlfir::ElementalOp elemental,
                   mlir::PatternRewriter &rewriter) const override {
     std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>> maybeTuple =
-        getTwoUses(elemental);
+        getTwoUses(elemental, aliasAnalysis);
     if (!maybeTuple)
       return rewriter.notifyMatchFailure(
           elemental, "hlfir.elemental is not a candidate for inlining");
@@ -113,6 +302,9 @@ class InlineElementalConversion
 
     return mlir::success();
   }
+
+private:
+  mlir::AliasAnalysis &aliasAnalysis;
 };
 
 class InlineElementalsPass
@@ -121,13 +313,15 @@ class InlineElementalsPass
   void runOnOperation() override {
     mlir::MLIRContext *context = &getContext();
 
+    // Get AliasAnalysis from the pass manager.
+    mlir::AliasAnalysis &aliasAnalysis = getAnalysis<mlir::AliasAnalysis>();
     mlir::GreedyRewriteConfig config;
     // Prevent the pattern driver from merging blocks.
     config.setRegionSimplificationLevel(
         mlir::GreedySimplifyRegionLevel::Disabled);
 
     mlir::RewritePatternSet patterns(context);
-    patterns.insert<InlineElementalConversion>(context);
+    patterns.insert<InlineElementalConversion>(context, aliasAnalysis);
 
     if (mlir::failed(mlir::applyPatternsGreedily(
             getOperation(), std::move(patterns), config))) {
diff --git a/flang/test/HLFIR/inline-elemental-multi-users.fir b/flang/test/HLFIR/inline-elemental-multi-users.fir
index f8b195e637091..32c9305320bdb 100644
--- a/flang/test/HLFIR/inline-elemental-multi-users.fir
+++ b/flang/test/HLFIR/inline-elemental-multi-users.fir
@@ -1,9 +1,92 @@
 // RUN: fir-opt --inline-elementals %s | FileCheck %s
 
-// Test inlining of hlfir.elemental into its hlfir.apply site when the 
+// Test inlining of hlfir.elemental into its hlfir.apply site when the
 // elemental has more than two users.
 
-// Check successful inlining where hlfir.elemental survives because the
+// Test successful inlining when the elemental has extra (non-apply) users.
+func.func @test_safe_loop_inlining(%arg0: !fir.ref<!fir.array<10xf32>>, %i: index) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  // Elemental producer (loads f32 elements from %arg0).
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%idx: index):
+      %addr = fir.coordinate_of %arg0, %idx : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+      %val = fir.load %addr : !fir.ref<f32>
+      hlfir.yield_element %val : f32
+  }
+
+  // User 1 - hlfir.apply inside nested loops (The target for inlining).
+  fir.do_loop %arg1 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+      %res = hlfir.apply %elem, %arg2 : (!hlfir.expr<10xf32>, index) -> f32
+      %dummy = fir.alloca f32
+      fir.store %res to %dummy : !fir.ref<f32>
+    }
+  }
+
+  // User 2 - Associate (Simulating a shared mask use-case).
+  %temp:3 = hlfir.associate %elem(%shape) : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1)
+
+  // User 3 - Destroy
+  hlfir.destroy %elem : !hlfir.expr<10xf32>
+
+  hlfir.end_associate %temp#0, %temp#2 : !fir.ref<!fir.array<10xf32>>, i1
+  return
+}
+// CHECK-LABEL: func.func @test_safe_loop_inlining
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.do_loop
+// CHECK:   fir.do_loop %[[INNER_IDX:.*]] =
+// CHECK:     %[[ADDR:.*]] = fir.coordinate_of %arg0, %[[INNER_IDX]]
+// CHECK:     %[[VAL:.*]] = fir.load %[[ADDR]]
+// CHECK-NOT: hlfir.apply
+// CHECK: hlfir.associate %[[ELEM]]
+// CHECK-NOT: hlfir.destroy %[[ELEM]]
+
+// Test blocking of incorrect inlining because of alias conflict.
+func.func @test_unsafe_loop_alias_conflict(%arg0: !fir.ref<!fir.array<10xf32>>, %new_val: f32) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  // Elemental depends on the values in %arg0 (Producer).
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%idx: index):
+      %addr = fir.coordinate_of %arg0, %idx : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+      %val = fir.load %addr : !fir.ref<f32>
+      hlfir.yield_element %val : f32
+  }
+
+  fir.do_loop %arg1 = %c1 to %c10 step %c1 {
+    // This store modifies the array that the elemental reads from.
+    // If inlined here, the elemental body would read the new value.
+    %write_addr = fir.coordinate_of %arg0, %arg1 : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+    fir.store %new_val to %write_addr : !fir.ref<f32>
+
+    // Target for inlining.
+    %res = hlfir.apply %elem, %arg1 : (!hlfir.expr<10xf32>, index) -> f32
+
+    %dummy = fir.alloca f32
+    fir.store %res to %dummy : !fir.ref<f32>
+  }
+
+  hlfir.destroy %elem : !hlfir.expr<10xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_unsafe_loop_alias_conflict
+// Elemental should not be inlined, check presence of elemental and apply.
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.do_loop
+// CHECK:   fir.store
+// CHECK:   %[[APPLIED:.*]] = hlfir.apply %[[ELEM]]
+// CHECK:   fir.store %[[APPLIED]]
+// Inlined code (coordinate_of/load) should not appear inside the loop.
+// CHECK-NOT: fir.coordinate_of %arg0, %arg1
+
+
+// Check successful inlining where 2-d hlfir.elemental survives because the
 // 'associate' op is still using it.
 func.func @test_inlining_use_mask(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
   %c1 = arith.constant 1 : index
@@ -98,7 +181,7 @@ func.func @test_multi_apply_no_inlining(%arg0: !hlfir.expr<?xi32>, %target: i32,
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
 
-  // Producer (Elemental)
+  // Producer (Elemental).
   %mask = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
     ^bb0(%i: index):
       %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
@@ -127,16 +210,14 @@ func.func @test_multi_apply_no_inlining(%arg0: !hlfir.expr<?xi32>, %target: i32,
 // CHECK:       %[[A2:.*]] = hlfir.apply %[[MASK]], %{{.*}}
 // CHECK:       hlfir.destroy %[[MASK]]
 
-// Check inlining one elemental into another.
-// a = b * c + d
-// Declare a global symbol to store the intermediate mask.
+// Check global store blocks inlining.
 fir.global @mask_storage : !hlfir.expr<10x10x!fir.logical<4>>
 func.func @test_nested_elemental(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
   %c1 = arith.constant 1 : index
   %c10 = arith.constant 10 : index
   %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
 
-  // The Elemental Mask (b * c)
+  // The Elemental Mask (b * c).
   %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
   ^bb0(%i: index, %j: index):
     %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
@@ -165,50 +246,56 @@ func.func @test_nested_elemental(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !f
   return
 }
 // CHECK-LABEL: func.func @test_nested_elemental
-// CHECK:         hlfir.elemental
-// CHECK-NOT:       hlfir.apply
-// CHECK:           arith.cmpi eq
-// CHECK-NOT:         hlfir.destroy
+// CHECK: %[[MASK:.*]] = hlfir.elemental
+// CHECK: %[[PTR:.*]] = fir.address_of(@mask_storage)
+// CHECK: fir.store %[[MASK]] to %[[PTR]]
+// Apply site not inlined.
+// CHECK: fir.do_loop
+// CHECK:   fir.do_loop
+// CHECK:     %[[VAL:.*]] = hlfir.apply %[[MASK]]
+// CHECK:     fir.store %[[VAL]]
+// The designate/load should only be inside the elemental.
+// CHECK-NOT: hlfir.designate
 
-// Inlining into a single hlfir.apply.
+// Inlining into a single hlfir.apply (relaxed inlining).
 // a = (b * c)[1]
-func.func @test_scalar_apply_inline(%b: !fir.box<!fir.array<?xf32>>, %c: !fir.box<!fir.array<?xf32>>) {
-  %c1 = arith.constant 1 : index
+func.func @test_scalar_apply_inlining_safe(%b: !fir.ref<!fir.array<10xf32>>, %c1: index) {
   %c10 = arith.constant 10 : index
   %shape = fir.shape %c10 : (index) -> !fir.shape<1>
 
-  // Elemental (b * c)
+  // Producer (1D Elemental).
   %prod = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
   ^bb0(%i: index):
-    %b_ref = hlfir.designate %b (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-    %b_val = fir.load %b_ref : !fir.ref<f32>
-    %c_ref = hlfir.designate %c (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-    %c_val = fir.load %c_ref : !fir.ref<f32>
-    %mul = arith.mulf %b_val, %c_val : f32
-    hlfir.yield_element %mul : f32
+    %b_addr = fir.coordinate_of %b, %i : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+    %b_val = fir.load %b_addr : !fir.ref<f32>
+    %res = arith.addf %b_val, %b_val : f32
+    hlfir.yield_element %res : f32
   }
 
-  // Total users = 3 (store, apply, destroy).
-  %ptr = fir.address_of(@scalar_storage) : !fir.ref<!hlfir.expr<10xf32>>
-  fir.store %prod to %ptr : !fir.ref<!hlfir.expr<10xf32>>
-
-  // Scalar apply site - a = (b * c)(1)
-  // CHECK-NOT: hlfir.apply
+  // Scalar Apply (Target for inlining).
   %scalar_val = hlfir.apply %prod, %c1 : (!hlfir.expr<10xf32>, index) -> f32
-  
-  // Use the scalar result.
-  %dummy_ref = fir.alloca f32
-  fir.store %scalar_val to %dummy_ref : !fir.ref<f32>
+
+  %temp:3 = hlfir.associate %prod(%shape) : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1)
 
   hlfir.destroy %prod : !hlfir.expr<10xf32>
+
+  // Use the result.
+  %dummy = fir.alloca f32
+  fir.store %scalar_val to %dummy : !fir.ref<f32>
+
+  hlfir.end_associate %temp#0, %temp#2 : !fir.ref<!fir.array<10xf32>>, i1
   return
 }
-// CHECK-LABEL: func.func @test_scalar_apply_inline
-// CHECK:         %[[MASK:.*]] = hlfir.elemental
-// CHECK:         fir.store %[[MASK]] to {{.*}}
-// CHECK-NOT:     hlfir.apply
-// CHECK:         arith.mulf
-// CHECK-NOT:     hlfir.destroy
+
+// CHECK-LABEL: func.func @test_scalar_apply_inlining_safe
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: hlfir.yield_element
+// CHECK: %[[ADDR:.*]] = fir.coordinate_of %arg0, %arg1
+// CHECK: %[[VAL:.*]] = fir.load %[[ADDR]]
+// CHECK: arith.addf %[[VAL]], %[[VAL]]
+// CHECK-NOT: hlfir.apply
+// CHECK: hlfir.associate %[[ELEM]]
+// CHECK-NOT: hlfir.destroy %[[ELEM]]
 
 // Check long chains of elementals.
 // subroutine reproducer(a)
@@ -222,7 +309,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
   %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
   %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
 
-  // tmp1 = a - 1
+  // tmp1 = a - 1.
   %tmp1 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
   ^bb0(%i: index):
     %a_ref = hlfir.designate %0#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
@@ -233,7 +320,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
 
   %dummy = hlfir.no_reassoc %tmp1 : !hlfir.expr<?xf32>
 
-  // tmp2 = a * tmp1
+  // tmp2 = a * tmp1.
   %tmp2 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
   ^bb0(%j: index):
     %a_ref_2 = hlfir.designate %0#0 (%j) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
@@ -243,7 +330,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
     hlfir.yield_element %mul : f32
   }
 
-  // tmp3 = sqrt(tmp2)
+  // tmp3 = sqrt(tmp2).
   %tmp3 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
   ^bb0(%k: index):
     %t2_val = hlfir.apply %tmp2, %k : (!hlfir.expr<?xf32>, index) -> f32
@@ -340,3 +427,151 @@ func.func @test_noinline_due_to_finalization(%arg0: !fir.box<!fir.array<?x!fir.t
 // CHECK:         fir.call @persistent_user(%[[EL]])
 // CHECK:         %[[APPLY:.*]] = hlfir.apply %[[EL]], %{{.*}}
 // CHECK:         hlfir.destroy %[[EL]] finalize
+
+// Test conflicting writes hidden within nested regions (like fir.if) between
+// producer and the apply site.
+func.func @test_nested_region_conflict(%arg0: !fir.ref<f32>, %cond: i1) {
+  %c1 = arith.constant 1 : index
+  %shape = fir.shape %c1 : (index)  -> !fir.shape<1>
+
+  // Producer.
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+    ^bb0(%i: index):
+      %val = fir.load %arg0 : !fir.ref<f32>
+      hlfir.yield_element %val : f32
+  }
+
+  // Nested region (fir.if) containing a store.
+  // This tests whether the analysis walk correctly sees into the 'then' block.
+  fir.if %cond {
+    %new_val = arith.constant 3.0 : f32
+    fir.store %new_val to %arg0 : !fir.ref<f32>
+  } else {
+    // Distractor op
+    fir.no_reassoc %cond : i1
+  }
+
+  // Apply Site.
+  %res = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+    ^bb0(%j: index):
+      %val = hlfir.apply %elem, %j : (!hlfir.expr<1xf32>, index) -> f32
+      hlfir.yield_element %val : f32
+  }
+
+  hlfir.destroy %elem : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_nested_region_conflict
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.if %{{.*}} {
+// CHECK:   fir.store
+// CHECK: }
+// CHECK: hlfir.elemental
+// CHECK:   hlfir.apply %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]
+
+// Checks that the elemental result is tracked through block arguments to the
+// hlfir.apply site across a branch, and that the fir.store in the intervening
+// block (^bb1) blocks inlining. The relaxed-inlining patch used to inline this.
+func.func @test_cross_block_conflict(%arg0: !fir.ref<f32>, %shape: !fir.shape<1>) {
+  // Producer in Entry Block.
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %val = fir.load %arg0 : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  // Pass value to block argument to maintain dataflow for the worklist.
+  cf.br ^bb1(%elemental : !hlfir.expr<1xf32>)
+
+^bb1(%block_arg: !hlfir.expr<1xf32>):
+  // Conflicting Write.
+  // This store sits between the producer and the apply site, across blocks.
+  %new_val = arith.constant 2.0 : f32
+  fir.store %new_val to %arg0 : !fir.ref<f32>
+
+  // Apply Site.
+  %res = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%j: index):
+    %val = hlfir.apply %block_arg, %j : (!hlfir.expr<1xf32>, index) -> f32
+    hlfir.yield_element %val : f32
+  }
+
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_cross_block_conflict
+// CHECK: %[[ELM:.*]] = hlfir.elemental
+// CHECK: cf.br ^bb1(%[[ELM]] : !hlfir.expr<1xf32>)
+// CHECK: ^bb1(%[[BARG:.*]]: !hlfir.expr<1xf32>):
+// CHECK: fir.store {{.*}} to %arg0
+// CHECK: hlfir.elemental
+// CHECK: hlfir.apply %[[BARG]]
+// CHECK: hlfir.destroy %[[ELM]]
+
+// External impure procedure that might modify %arg0.
+func.func private @impure_side_effect()
+
+func.func @test_impure_call_conflict(%arg0: !fir.ref<f32>, %shape: !fir.shape<1>) {
+  // Reads from %arg0.
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %val = fir.load %arg0 : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  // Acts as a memory barrier.
+  fir.call @impure_side_effect() : () -> ()
+
+  // Apply Site.
+  %res = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%j: index):
+    %val = hlfir.apply %elemental, %j : (!hlfir.expr<1xf32>, index) -> f32
+    hlfir.yield_element %val : f32
+  }
+
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_impure_call_conflict
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.call @impure_side_effect()
+// CHECK: hlfir.apply %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]
+
+// Check conflicting write to the same memory buffer read by the elemental
+// producer in loop body blocks inlining.
+func.func @test_memory_dependency_with_designate(%arg0: !fir.ref<!fir.array<10xf32>>, %shape: !fir.shape<1>) {
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %val = arith.constant 2.0 : f32
+
+  // Reads from the whole array %arg0.
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%i: index):
+      %addr = fir.coordinate_of %arg0, %i : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+      %load = fir.load %addr : !fir.ref<f32>
+      hlfir.yield_element %load : f32
+  }
+
+  // Modifies one element of the same array, partial write to the same base
+  // buffer.
+  %specific_addr = hlfir.designate %arg0 (%c1) : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+  fir.store %val to %specific_addr : !fir.ref<f32>
+
+  // Apply Site.
+  %res = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%j: index):
+      %applied = hlfir.apply %elem, %j : (!hlfir.expr<10xf32>, index) -> f32
+      hlfir.yield_element %applied : f32
+  }
+
+  hlfir.destroy %elem : !hlfir.expr<10xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_memory_dependency_with_designate
+// CHECK: %[[ELEM:.*]] = hlfir.elemental {{.*}} unordered
+// CHECK: fir.store {{.*}} to %{{.*}}
+// CHECK: hlfir.elemental
+// CHECK:   hlfir.apply %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]

>From 8fc3545fe21c40aba2dcdff3cb030253499eb5b0 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Thu, 23 Apr 2026 03:59:43 +0200
Subject: [PATCH 4/4] [flang][HLFIR] Enhance inlining with robust data-flow and
 memory-effect analysis as suggested in code review.
 - Updated `getReadDependencies` with a fail-safe implementation using
   `MemoryEffectOpInterface`. It now conservatively captures external
   references for unknown operations.
 - Refactored `isConflictingWrite` to recursively walk nested regions using
   Alias Analysis to ensure no memory conflicts exist between the producer
   and the consumer.
 - Expanded the data-flow worklist to handle metadata (declares/converts) and
   block boundaries, ensuring all paths to nested apply sites are tracked.
   Cases involving complex control-flow boundaries (cross-block branches and
   loop-exits) are now successfully handled.
 - Decoupled inlining from buffer destruction to ensure hlfir.destroy is
   preserved when the result is required by intrinsic operations or other
   consumers.

---
 .../HLFIR/Transforms/InlineElementals.cpp     | 212 +++++---
 .../HLFIR/inline-elemental-multi-users.fir    | 461 +++++++++++++++++-
 2 files changed, 586 insertions(+), 87 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index 9a6ad46309947..e02717913f68a 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -34,20 +34,40 @@ namespace hlfir {
 } // namespace hlfir
 
 /// Collects all memory values (buffers/references) that the elemental body
-/// reads from.
+/// reads from. Use MemoryEffectOpInterface for a fail-safe implementation.
 static void getReadDependencies(hlfir::ElementalOp elemental,
                                 llvm::SmallVectorImpl<mlir::Value> &deps) {
   elemental.getRegion().walk([&](mlir::Operation *op) {
-    if (auto designate = mlir::dyn_cast<hlfir::DesignateOp>(op))
-      deps.push_back(designate.getMemref());
-    else if (auto load = mlir::dyn_cast<fir::LoadOp>(op))
-      deps.push_back(load.getMemref());
-    // Capture any value defined outside the elemental but used inside it.
+    // Check if the operation explicitly implements memory effects.
+    if (auto memInterface = mlir::dyn_cast<mlir::MemoryEffectOpInterface>(op)) {
+      llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
+      memInterface.getEffects(effects);
+      bool hasUnspecifiedRead = false;
+      for (const auto &effect : effects) {
+        if (mlir::isa<mlir::MemoryEffects::Read>(effect.getEffect())) {
+          if (mlir::Value val = effect.getValue()) {
+            deps.push_back(val);
+          } else {
+            // Read effect on an unspecified resource (e.g., global state).
+            hasUnspecifiedRead = true;
+          }
+        }
+      }
+      // If the op has a read effect but the specific value is unknown,
+      // conservatively capture all potential reference operands.
+      if (!hasUnspecifiedRead)
+        return;
+    }
+
+    // Fail-safe Fallback: For operations without the interface or with
+    // unspecified effects, capture any external reference used inside.
     for (mlir::Value operand : op->getOperands()) {
-      if (operand.getParentRegion() != &elemental.getRegion())
+      if (operand.getParentRegion() != &elemental.getRegion()) {
         if (mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
-                      fir::BoxType>(operand.getType()))
+                      fir::BoxType>(operand.getType())) {
           deps.push_back(operand);
+        }
+      }
     }
   });
 }
@@ -57,59 +77,51 @@ static void getReadDependencies(hlfir::ElementalOp elemental,
 static bool isConflictingWrite(mlir::Operation *op,
                                const llvm::SmallVectorImpl<mlir::Value> &deps,
                                mlir::AliasAnalysis &aa) {
-  // Operations explicitly marked as having no memory effects are safe.
-  if (mlir::isMemoryEffectFree(op))
-    return false;
+  // Use walk to handle nested regions (fir.if, fir.do_loop, etc.) recursively.
+  mlir::WalkResult result = op->walk([&](mlir::Operation *nestedOp) {
+    // Operations explicitly marked as having no memory effects are safe.
+    if (mlir::isMemoryEffectFree(nestedOp))
+      return mlir::WalkResult::advance();
 
-  // Explicitly allow safe HLFIR/FIR metadata/lifetime operations.
-  // While these may have internal effects (e.g. allocating a descriptor),
-  // they do not modify the user data being read by the elemental.
-  if (mlir::isa<hlfir::DeclareOp, hlfir::AssociateOp, hlfir::EndAssociateOp,
-                fir::AllocaOp, hlfir::NoReassocOp>(op))
-    return false;
+    // Explicitly allow safe HLFIR/FIR metadata/lifetime operations.
+    if (mlir::isa<hlfir::DeclareOp, hlfir::AssociateOp, hlfir::EndAssociateOp,
+                  fir::AllocaOp, hlfir::NoReassocOp>(nestedOp))
+      return mlir::WalkResult::advance();
 
-  // Check for explicit memory effects via the MemoryEffectOpInterface.
-  if (auto memInterface = mlir::dyn_cast<mlir::MemoryEffectOpInterface>(op)) {
-    llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
-    memInterface.getEffects(effects);
-
-    for (const auto &effect : effects) {
-      // Analyze effects that modify memory or release resources.
-      if (mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect()) ||
-          mlir::isa<mlir::MemoryEffects::Free>(effect.getEffect())) {
-
-        mlir::Value accessedValue = effect.getValue();
-        // If the effect is on an unknown resource (e.g. external call),
-        // assume a conflict.
-        if (!accessedValue)
-          return true;
-
-        // Perform alias analysis against all read dependencies.
-        for (mlir::Value dep : deps) {
-          if (!aa.alias(accessedValue, dep).isNo())
-            return true;
+    // Check for explicit memory effects via the interface.
+    if (auto memInterface =
+            mlir::dyn_cast<mlir::MemoryEffectOpInterface>(nestedOp)) {
+      llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
+      memInterface.getEffects(effects);
+
+      for (const auto &effect : effects) {
+        // Analyze effects that modify memory or release resources.
+        if (mlir::isa<mlir::MemoryEffects::Write, mlir::MemoryEffects::Free>(
+                effect.getEffect())) {
+          mlir::Value accessedValue = effect.getValue();
+          // Fail-safe: Assuming conflict for Unknown resource (e.g. external
+          // call).
+          if (!accessedValue)
+            return mlir::WalkResult::interrupt();
+
+          // Perform alias analysis against all read dependencies.
+          for (mlir::Value dep : deps) {
+            if (!aa.alias(accessedValue, dep).isNo())
+              return mlir::WalkResult::interrupt();
+          }
         }
       }
+    } else if (nestedOp->getNumRegions() == 0) {
+      // Conservative Fallback: If an operation doesn't have the interface and
+      // has no regions (e.g. a fir.call), assume it can modify anything.
+      return mlir::WalkResult::interrupt();
     }
-  } else if (op->getNumRegions() == 0) {
-    // Conservative Fallback: If an operation lacks the interface and has no
-    // regions (e.g. a fir.call to an external function), assume it can
-    // potentially modifies any memory.
-    return true;
-  }
 
-  // Recursive Analysis into structured control flow regions.
-  // (e.g. fir.if, fir.do_loop) to find nested conflicting writes.
-  for (mlir::Region &region : op->getRegions()) {
-    for (mlir::Block &block : region) {
-      for (mlir::Operation &nestedOp : block) {
-        if (isConflictingWrite(&nestedOp, deps, aa))
-          return true;
-      }
-    }
-  }
+    return mlir::WalkResult::advance();
+  });
 
-  return false;
+  // Conflict found as walk interrupted.
+  return result.wasInterrupted();
 }
 
 bool isSafeToInline(hlfir::ElementalOp producer, hlfir::ApplyOp applySite,
@@ -162,10 +174,12 @@ getTwoUses(hlfir::ElementalOp elemental, mlir::AliasAnalysis &aliasAnalysis) {
   hlfir::ApplyOp apply;
   hlfir::DestroyOp destroy;
   unsigned applyCount = 0;
+  bool hasOtherUsers = false;
 
   llvm::SmallVector<mlir::Value> worklist;
   worklist.push_back(elemental.getResult());
   llvm::SmallPtrSet<mlir::Value, 16> visited;
+  llvm::SmallPtrSet<mlir::Operation *, 4> uniqueApplies;
 
   while (!worklist.empty()) {
     mlir::Value current = worklist.pop_back_val();
@@ -177,21 +191,28 @@ getTwoUses(hlfir::ElementalOp elemental, mlir::AliasAnalysis &aliasAnalysis) {
 
       mlir::TypeSwitch<mlir::Operation *, void>(user)
           .Case<hlfir::ApplyOp>([&](hlfir::ApplyOp op) {
-            apply = op;
-            applyCount++;
+            // Use raw operation pointer to ensure each apply site is
+            // counted only once.
+            if (uniqueApplies.insert(op.getOperation()).second) {
+              apply = op;
+              applyCount++;
+            }
           })
           .Case<hlfir::DestroyOp>([&](hlfir::DestroyOp op) {
             // Track the mandatory destroy operation for the elemental expr.
             destroy = op;
           })
-          .Case<hlfir::DeclareOp>([&](hlfir::DeclareOp op) {
-            // Follow the dataflow through variable declarations.
-            worklist.push_back(op.getBase());
-          })
-          .Case<fir::ConvertOp>([&](fir::ConvertOp op) {
-            // Follow the dataflow through type conversions.
-            worklist.push_back(op.getResult());
+          .Case<hlfir::DeclareOp, fir::ConvertOp>([&](mlir::Operation *op) {
+            // Follow the dataflow through all results of the operation.
+            // For hlfir.declare, this catches both the variable and base
+            // results. For fir.convert, this catches the converted result.
+            for (mlir::Value result : op->getResults()) {
+              worklist.push_back(result);
+            }
           })
+          // Buffer Consumers - These require the destroy to stay.
+          .Case<hlfir::AssociateOp, hlfir::SumOp, hlfir::AssignOp, fir::CallOp>(
+              [&](mlir::Operation *) { hasOtherUsers = true; })
           .Case<mlir::BranchOpInterface>([&](mlir::BranchOpInterface branch) {
             for (unsigned i = 0; i < branch->getNumSuccessors(); ++i) {
               mlir::SuccessorOperands operands = branch.getSuccessorOperands(i);
@@ -207,30 +228,57 @@ getTwoUses(hlfir::ElementalOp elemental, mlir::AliasAnalysis &aliasAnalysis) {
           })
           .Case<fir::ResultOp>([&](fir::ResultOp op) {
             mlir::Operation *parent = op->getParentOp();
-            if (parent) {
+            // Only forward if the parent is an op that yields values out.
+            if (parent &&
+                mlir::isa<mlir::RegionBranchOpInterface, fir::IfOp,
+                          fir::DoLoopOp, hlfir::ElementalOp>(parent)) {
               for (auto it : llvm::enumerate(op.getOperands())) {
                 if (it.value() == current) {
-                  // 'current' is being yielded. The value outside the loop is
-                  // the i-th result of the parent operation.
+                  // Map the result index to the parent's result index.
                   unsigned i = it.index();
                   if (i < parent->getNumResults()) {
                     worklist.push_back(parent->getResult(i));
                   }
                 }
               }
+            } else {
+              // If it's a terminator for an unknown op.
+              hasOtherUsers = true;
             }
           })
           .Default([&](mlir::Operation *op) {
-            // If the elemental result is used by an operation with regions
-            // (like fir.if or fir.do_loop), the apply site may be nested
-            // inside.
             if (op->getNumRegions() > 0) {
-              op->walk([&](hlfir::ApplyOp nestedApply) {
-                if (nestedApply.getExpr() == current) {
-                  apply = nestedApply;
-                  applyCount++;
+              // Follow the value through metadata ops (declare, convert, etc.)
+              // nested inside regions.
+              op->walk([&](mlir::Operation *innerOp) {
+                for (mlir::Value operand : innerOp->getOperands()) {
+                  if (operand == current) {
+                    if (auto nestedApply =
+                            mlir::dyn_cast<hlfir::ApplyOp>(innerOp)) {
+                      // Use a set to prevent double-counting if walker
+                      // and worklist hit the same apply site.
+                      if (uniqueApplies.insert(nestedApply.getOperation())
+                              .second) {
+                        apply = nestedApply;
+                        applyCount++;
+                      }
+                    } else if (mlir::isa<hlfir::DeclareOp, fir::ConvertOp>(
+                                   innerOp)) {
+                      // Feed internal metadata results back into the worklist.
+                      for (mlir::Value res : innerOp->getResults())
+                        worklist.push_back(res);
+                    } else if (!mlir::isa<hlfir::DestroyOp, fir::ResultOp,
+                                          mlir::BranchOpInterface>(innerOp)) {
+                      // If it's an intrinsic or unknown consumer, it needs the
+                      // buffer.
+                      hasOtherUsers = true;
+                    }
+                  }
                 }
               });
+            } else {
+              // Non-region op not handled by any specific Case<> above.
+              hasOtherUsers = true;
             }
           });
     }
@@ -254,7 +302,9 @@ getTwoUses(hlfir::ElementalOp elemental, mlir::AliasAnalysis &aliasAnalysis) {
   if (apply.getResult().getType() != yield.getElementValue().getType())
     return std::nullopt;
 
-  return std::pair{apply, destroy};
+  // Only return the destroy op if there's exactly one apply and no other users.
+  bool safeToDelete = (applyCount == 1 && !hasOtherUsers);
+  return std::make_pair(apply, safeToDelete ? destroy : nullptr);
 }
 
 namespace {
@@ -295,11 +345,15 @@ class InlineElementalConversion
     // remove the old elemental and all of the bookkeeping
     rewriter.replaceOp(apply, {yield.getElementValue()});
     rewriter.eraseOp(yield);
-    rewriter.eraseOp(destroy);
-    // Only erase the elemental if that was its last use.
-    if (elemental->use_empty())
-      rewriter.eraseOp(elemental);
-
+    // Only erase the destroy and elemental if the analysis shows it's safe.
+    if (hlfir::DestroyOp destroyOp = maybeTuple->second) {
+      // IR has no users left.
+      if (destroyOp->use_empty())
+        rewriter.eraseOp(destroyOp);
+
+      if (elemental.getResult().use_empty())
+        rewriter.eraseOp(elemental);
+    }
     return mlir::success();
   }
 
diff --git a/flang/test/HLFIR/inline-elemental-multi-users.fir b/flang/test/HLFIR/inline-elemental-multi-users.fir
index 32c9305320bdb..20fc862e6dba2 100644
--- a/flang/test/HLFIR/inline-elemental-multi-users.fir
+++ b/flang/test/HLFIR/inline-elemental-multi-users.fir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --inline-elementals %s | FileCheck %s
+// RUN: fir-opt -allow-unregistered-dialect --inline-elementals %s | FileCheck %s
 
 // Test inlining of hlfir.elemental into its hlfir.apply site when the
 // elemental has more than two users.
@@ -43,7 +43,10 @@ func.func @test_safe_loop_inlining(%arg0: !fir.ref<!fir.array<10xf32>>, %i: inde
 // CHECK:     %[[VAL:.*]] = fir.load %[[ADDR]]
 // CHECK-NOT: hlfir.apply
 // CHECK: hlfir.associate %[[ELEM]]
-// CHECK-NOT: hlfir.destroy %[[ELEM]]
+// This verifies that even when an elemental is inlined into an apply site,
+// the destroy operation is preserved because another user (hlfir.associate)
+// still requires the array buffer.
+// CHECK: hlfir.destroy %[[ELEM]]
 
 // Test blocking of incorrect inlining because of alias conflict.
 func.func @test_unsafe_loop_alias_conflict(%arg0: !fir.ref<!fir.array<10xf32>>, %new_val: f32) {
@@ -133,9 +136,11 @@ func.func @test_inlining_use_mask(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !
 // CHECK:     %[[CMP:.*]] = arith.cmpi eq, %[[LOAD]], %[[REF]]
 // CHECK:     %[[RES:.*]] = fir.convert %[[CMP]]
 // CHECK:     fir.store %[[RES]]
-// CHECK-NOT: hlfir.destroy %[[MASK]]
+// CHECK: hlfir.destroy %[[MASK]]
 
-// Test elemental removal as use_count becomes zero.
+// Check elemental and destroy are successfully removed even when
+// the expression is passed through metadata operations (fir.convert)
+// and used in nested loops.
 func.func @test_inlining_elemental_cleanup(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
   %c1 = arith.constant 1 : index
   %c10 = arith.constant 10 : index
@@ -295,7 +300,7 @@ func.func @test_scalar_apply_inlining_safe(%b: !fir.ref<!fir.array<10xf32>>, %c1
 // CHECK: arith.addf %[[VAL]], %[[VAL]]
 // CHECK-NOT: hlfir.apply
 // CHECK: hlfir.associate %[[ELEM]]
-// CHECK-NOT: hlfir.destroy %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]
 
 // Check long chains of elementals.
 // subroutine reproducer(a)
@@ -355,9 +360,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
 // CHECK-DAG:     arith.mulf
 // CHECK-DAG:     math.sqrt
 // CHECK:         hlfir.assign
-// The apply site was inlined, so the elemental's lifecycle (destroy) 
-// is removed even though metadata users like no_reassoc remain.
-// CHECK-NOT:     hlfir.destroy %[[TMP1]]
+// CHECK:     hlfir.destroy %[[TMP1]]
 
 // Check that the ordered elemental is not inlined into another:
 // a = b + c + d (where b + c is ordered)
@@ -575,3 +578,445 @@ func.func @test_memory_dependency_with_designate(%arg0: !fir.ref<!fir.array<10xf
 // CHECK: hlfir.elemental
 // CHECK:   hlfir.apply %[[ELEM]]
 // CHECK: hlfir.destroy %[[ELEM]]
+
+// Inlining is blocked because an unknown op uses %arg0.
+func.func @test_unknown_op_fallback(%arg0: !fir.ref<f32>) -> f32 {
+  %c1 = arith.constant 1 : index
+  %f1 = arith.constant 1.0 : f32
+  %shape = fir.shape %c1 : (index) -> !fir.shape<1>
+
+  %elemental = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    "unknown.op"(%arg0) : (!fir.ref<f32>) -> ()
+    %val = fir.load %arg0 : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  fir.store %f1 to %arg0 : !fir.ref<f32>
+
+  %apply = hlfir.apply %elemental, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_unknown_op_fallback
+// CHECK: hlfir.elemental
+// CHECK: hlfir.apply
+
+// Inlining is blocked because the walker found a dependency inside a region.
+func.func @test_nested_dependency(%arg0: !fir.ref<f32>, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %f1 = arith.constant 1.0 : f32
+  %shape = fir.shape %c1 : (index) -> !fir.shape<1>
+
+  %elemental = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %res = fir.if %cond -> f32 {
+      %val = fir.load %arg0 : !fir.ref<f32>
+      fir.result %val : f32
+    } else {
+      %c0 = arith.constant 0.0 : f32
+      fir.result %c0 : f32
+    }
+    hlfir.yield_element %res : f32
+  }
+
+  fir.store %f1 to %arg0 : !fir.ref<f32>
+
+  %apply = hlfir.apply %elemental, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_nested_dependency
+// CHECK: hlfir.elemental
+// CHECK: hlfir.apply
+
+// Check that fir.box dependencies are captured by the fail-safe logic.
+func.func @test_box_dependency(%arg0: !fir.box<!fir.array<?xf32>>) -> f32 {
+  %c1 = arith.constant 1 : index
+  %f0 = arith.constant 0.0 : f32
+  %shape = fir.shape %c1 : (index) -> !fir.shape<1>
+
+  %elemental = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    // 'unknown.op' uses the box. The fail-safe should capture %arg0 dependency.
+    "unknown.op"(%arg0) : (!fir.box<!fir.array<?xf32>>) -> ()
+
+    %addr_in = hlfir.designate %arg0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %val = fir.load %addr_in : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  // Inlining blocked as %arg0 was captured.
+  %addr_out = hlfir.designate %arg0 (%c1) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+  fir.store %f0 to %addr_out : !fir.ref<f32>
+
+  %apply = hlfir.apply %elemental, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_box_dependency
+// CHECK: hlfir.elemental
+// CHECK: hlfir.apply
+
+// Check that the .Default walker correctly enters a region (fir.if) and
+// identifies metadata (fir.convert). It then feeds the result back to the
+// worklist, allowing the analysis to find the nested hlfir.apply.
+func.func @test_nested_metadata_feedback(%arg0: !fir.shape<1>, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %c0 = arith.constant 5.0 : f32
+    hlfir.yield_element %c0 : f32
+  }
+
+  %res = fir.if %cond -> f32 {
+    %conv1 = fir.convert %el : (!hlfir.expr<1xf32>) -> !hlfir.expr<1xf32>
+    %conv2 = fir.convert %conv1 : (!hlfir.expr<1xf32>) -> !hlfir.expr<1xf32>
+    %apply = hlfir.apply %conv2, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+    fir.result %apply : f32
+  } else {
+    %f0 = arith.constant 0.0 : f32
+    fir.result %f0 : f32
+  }
+
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %res : f32
+}
+// CHECK-LABEL: func.func @test_nested_metadata_feedback
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK: %[[VAL:.*]] = arith.constant 5.000000e+00 : f32
+// CHECK: fir.if
+// CHECK:   fir.result %{{.*}} : f32
+// CHECK: else
+// CHECK:   fir.result %{{.*}} : f32
+
+// Check if analysis can find an hlfir.apply nested inside a fir.if
+// when other operations like hlfir.declare are present in the
+// same region. It ensures the .Default walker doesn't get stuck.
+func.func @test_nested_declare_results(%arg0: !fir.shape<1>, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %cst = arith.constant 1.0 : f32
+    hlfir.yield_element %cst : f32
+  }
+
+  %res = fir.if %cond -> f32 {
+    // Metadata ops present in the region.
+    %mem = fir.alloca !hlfir.expr<1xf32>
+    %v:2 = hlfir.declare %mem {uniq_name = "test_var"} : (!fir.ref<!hlfir.expr<1xf32>>) -> (!fir.ref<!hlfir.expr<1xf32>>, !fir.ref<!hlfir.expr<1xf32>>)
+
+    // The apply using the elemental.
+    %apply = hlfir.apply %el, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+    fir.result %apply : f32
+  } else {
+    %f0 = arith.constant 0.0 : f32
+    fir.result %f0 : f32
+  }
+
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %res : f32
+}
+// CHECK-LABEL: func.func @test_nested_declare_results
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK: %[[VAL:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK: fir.if
+// CHECK:   fir.result %{{.*}} : f32
+// CHECK: else
+// CHECK:   fir.result %{{.*}} : f32
+
+// Check fail-safe logic in getReadDependencies.
+// The 'test.unknown_read' operation has no memory interface, so the pass
+// should conservatively capture its operand (%mem) as a dependency.
+// No inlining as there is a fir.store to %mem before the apply.
+func.func @test_unspecified_read_fallback(%mem: !fir.ref<f32>) -> f32 {
+  %c1 = arith.constant 1 : index
+  %shape = fir.shape %c1 : (index) -> !fir.shape<1>
+  %el = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    "test.unknown_read"(%mem) : (!fir.ref<f32>) -> ()
+    %val = fir.load %mem : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+  %f1 = arith.constant 1.0 : f32
+  fir.store %f1 to %mem : !fir.ref<f32>
+  %apply = hlfir.apply %el, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_unspecified_read_fallback
+// CHECK: %[[EL:.*]] = hlfir.elemental
+// CHECK: fir.store
+// CHECK: %[[RES:.*]] = hlfir.apply %[[EL]]
+// CHECK: hlfir.destroy %[[EL]]
+// CHECK: return %[[RES]]
+
+// Checks recursive analysis of the .Default walker. It identifies an
+// hlfir.apply nested inside a fir.if block, skipping the container itself
+// and successfully inlining the elemental body into the nested site.
+func.func @test_minimal_nested(%arg0: !fir.shape<1>, %cond: i1) {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %c0 = arith.constant 0.0 : f32
+    hlfir.yield_element %c0 : f32
+  }
+
+  fir.if %cond {
+    %apply = hlfir.apply %el, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+    "test.sink"(%apply) : (f32) -> ()
+  }
+
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_minimal_nested
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK: arith.constant 0.000000e+00 : f32
+// CHECK: "test.sink"
+
+// Check that the .Default case identifies unknown region-based operations as
+// other users. The pass should inline the hlfir.apply but should preserve the
+// hlfir.elemental and hlfir.destroy because the buffer is still required by
+// the unknown op.
+func.func @test_unknown_region_consumer(%arg0: !fir.shape<1>) {
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %c0 = arith.constant 0.0 : f32
+    hlfir.yield_element %c0 : f32
+  }
+
+  // An operation with regions that can't be recognized as a metadata/apply op.
+  "unknown.region_op"(%el) ({
+    ^bb0(%arg1: !hlfir.expr<1xf32>):
+      "some.inner.op"() : () -> ()
+  }) : (!hlfir.expr<1xf32>) -> ()
+
+  %c1 = arith.constant 1 : index
+  %apply = hlfir.apply %el, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_unknown_region_consumer
+// CHECK: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK: hlfir.destroy
+
+// Check recursive metadata tracking (Walker -> Worklist -> Walker).
+// The hlfir.apply is nested within multiple layers of control flow (nested
+// fir.if), so the .Default walker and the main worklist must hand off the
+// elemental result across region boundaries several times.
+func.func @test_deep_hybrid_nesting(%arg0: !fir.shape<1>, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %c0 = arith.constant 42.0 : f32
+    hlfir.yield_element %c0 : f32
+  }
+
+  %res = fir.if %cond -> f32 {
+    // .Default walker finds this, feeds back to worklist
+    %conv = fir.convert %el : (!hlfir.expr<1xf32>) -> !hlfir.expr<1xf32>
+
+    // Worklist finds this nested if, .Default walker runs again.
+    %inner = fir.if %cond -> f32 {
+       %apply = hlfir.apply %conv, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+       fir.result %apply : f32
+    } else {
+       %f0 = arith.constant 0.0 : f32
+       fir.result %f0 : f32
+    }
+    fir.result %inner : f32
+  } else {
+    %f0 = arith.constant 0.0 : f32
+    fir.result %f0 : f32
+  }
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %res : f32
+}
+// CHECK-LABEL: func.func @test_deep_hybrid_nesting
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK: arith.constant 4.200000e+01 : f32
+
+// Check the conservative fail-safe in getReadDependencies. When an unknown
+// operation is encountered, all of its reference-type operands are captured
+// as dependencies. Inlining is blocked by a write to any of these captured
+// values.
+func.func @test_multi_operand_fallback(%mem1: !fir.ref<f32>, %mem2: !fir.ref<f32>) -> f32 {
+  %c1 = arith.constant 1 : index
+  %shape = fir.shape %c1 : (index) -> !fir.shape<1>
+
+  %el = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    // Unknown op with two reference operands.
+    "unknown.op"(%mem1, %mem2) : (!fir.ref<f32>, !fir.ref<f32>) -> ()
+    %val = fir.load %mem1 : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  // Write to the second operand (%mem2).
+  // If all operands of the unknown op were captured correctly,
+  // inlining of the elemental should be blocked due to the write to %mem2.
+  %f1 = arith.constant 1.0 : f32
+  fir.store %f1 to %mem2 : !fir.ref<f32>
+
+  %apply = hlfir.apply %el, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_multi_operand_fallback
+// CHECK: hlfir.elemental
+// CHECK: hlfir.apply
+
+// Check fir::ResultOp mapping logic in the TypeSwitch: the analysis tracks
+// the elemental result as it is yielded out of a fir.do_loop and maps the
+// iter_arg to the loop result to find the hlfir.apply site.
+func.func @test_loop_carried_result(%arg0: !fir.shape<1>, %n: index) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %cst = arith.constant 7.0 : f32
+    hlfir.yield_element %cst : f32
+  }
+
+  // The elemental result is passed through a loop result.
+  %loop_res = fir.do_loop %i = %c1 to %n step %c1 iter_args(%arg = %el) -> (!hlfir.expr<1xf32>) {
+    fir.result %arg : !hlfir.expr<1xf32>
+  }
+
+  // The apply uses the result of the loop.
+  // TypeSwitch for fir::ResultOp should map %arg back to %loop_res.
+  %apply = hlfir.apply %loop_res, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_loop_carried_result
+// CHECK-NOT: hlfir.elemental
+// CHECK: arith.constant 7.0
+
+// Check if worklist successfully traverses block boundaries.
+// BranchOpInterface logic correctly maps the elemental result to the
+// successor block's argument, allowing the pass to find hlfir.apply in
+// a separate basic block.
+func.func @test_cross_block_dataflow(%arg0: !fir.shape<1>, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %cst = arith.constant 3.14 : f32
+    hlfir.yield_element %cst : f32
+  }
+
+  // Value is passed as a block argument to bb1.
+  cf.cond_br %cond, ^bb1(%el : !hlfir.expr<1xf32>), ^bb2
+
+^bb1(%block_arg: !hlfir.expr<1xf32>):
+  // The worklist must map %el to %block_arg via BranchOpInterface
+  %apply = hlfir.apply %block_arg, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %apply : f32
+
+^bb2:
+  %f0 = arith.constant 0.0 : f32
+  return %f0 : f32
+}
+// CHECK-LABEL: func.func @test_cross_block_dataflow
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK-NOT: hlfir.destroy
+// CHECK: arith.constant 3.14
+
+// This test verifies that the elemental result is followed through a loop exit.
+// By mapping the fir.result operand to the parent operation's result,
+// the worklist can trace the value from inside the loop to an hlfir.apply
+// site located outside the loop.
+func.func @test_loop_exit_dataflow(%arg0: !fir.shape<1>, %n: index) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %cst = arith.constant 1.23 : f32
+    hlfir.yield_element %cst : f32
+  }
+
+  // The elemental result exits the loop via fir.result.
+  %loop_res = fir.do_loop %i = %c1 to %n step %c1 iter_args(%arg = %el) -> (!hlfir.expr<1xf32>) {
+    fir.result %arg : !hlfir.expr<1xf32>
+  }
+
+  // The apply site is outside the loop, using the loop's result.
+  %apply = hlfir.apply %loop_res, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_loop_exit_dataflow
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK-NOT: hlfir.destroy
+// CHECK: arith.constant 1.23
+
+// Check if worklist can follow a result through nested control flow
+// (if inside a loop) and out of the loop result itself.
+func.func @test_complex_nesting_valid(%arg0: !fir.shape<1>, %n: index, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %cst = arith.constant 100.0 : f32
+    hlfir.yield_element %cst : f32
+  }
+
+  // Value enters the loop via iter_args.
+  %loop = fir.do_loop %i = %c1 to %n step %c1 iter_args(%arg = %el) -> (!hlfir.expr<1xf32>) {
+    // Nested structured control flow.
+    %res = fir.if %cond -> !hlfir.expr<1xf32> {
+      fir.result %arg : !hlfir.expr<1xf32>
+    } else {
+      fir.result %arg : !hlfir.expr<1xf32>
+    }
+    fir.result %res : !hlfir.expr<1xf32>
+  }
+
+  // Value exits the loop and is used by apply.
+  %apply = hlfir.apply %loop, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %apply : f32
+}
+// CHECK-LABEL: func.func @test_complex_nesting_valid
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK-NOT: hlfir.destroy
+// CHECK: arith.constant 1.000000e+02 : f32
+
+// Recursive .Default walker can find an apply site deep within nested regions
+// (if inside a loop).
+func.func @test_deep_nested_discovery(%arg0: !fir.shape<1>, %n: index, %cond: i1) -> f32 {
+  %c1 = arith.constant 1 : index
+  %f0 = arith.constant 0.0 : f32
+  %el = hlfir.elemental %arg0 : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %c100 = arith.constant 100.0 : f32
+    hlfir.yield_element %c100 : f32
+  }
+
+  %res = fir.do_loop %i = %c1 to %n step %c1 iter_args(%arg = %el) -> (!hlfir.expr<1xf32>) {
+    %inner = fir.if %cond -> !hlfir.expr<1xf32> {
+      // The .Default walker on the fir.if finds this.
+      %apply = hlfir.apply %arg, %c1 : (!hlfir.expr<1xf32>, index) -> f32
+      "test.sink"(%apply) : (f32) -> ()
+      fir.result %arg : !hlfir.expr<1xf32>
+    } else {
+      fir.result %arg : !hlfir.expr<1xf32>
+    }
+    fir.result %inner : !hlfir.expr<1xf32>
+  }
+
+  hlfir.destroy %el : !hlfir.expr<1xf32>
+  return %f0 : f32
+}
+// CHECK-LABEL: func.func @test_deep_nested_discovery
+// CHECK-NOT: hlfir.elemental
+// CHECK-NOT: hlfir.apply
+// CHECK-NOT: hlfir.destroy
+// CHECK: arith.constant 1.000000e+02 : f32



More information about the flang-commits mailing list