[flang-commits] [flang] [flang][HLFIR] Optimize MINLOC/MAXLOC for equality masks (PR #186916)

via flang-commits flang-commits at lists.llvm.org
Tue Mar 31 17:56:08 PDT 2026


https://github.com/anoopkg6 updated https://github.com/llvm/llvm-project/pull/186916

>From 263b74db14a1b8bab613f8493eaf586cb7e73652 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Mon, 16 Mar 2026 15:26:39 +0100
Subject: [PATCH 1/3] [flang][HLFIR] Optimize MINLOC/MAXLOC for equality masks

This patch implements `isEqualityMask` to identify when the MASK argument is an equality comparison against an invariant value (e.g., MASK = A == X).

- This allows the SimplifyHLFIRIntrinsics pass to extract the invariant
  search target and bypass the creation of a temporary logical mask array
  by inlining the equality comparison directly into the reduction loop.
  This optimization removes the 'hlfir.apply' to the mask's hlfir.elemental,
  which gets eliminated in bufferize-hlfir pass.
- Simplifies the reduction state by removing the min/max value tracker,
  as the target value is already known.
- Implements a "first-hit" locking mechanism.

Test Coverage:
- 1D, 2D, 3D Variable/Constant equality searches - Verified optimized
- Duplicate match handling - Verified first-occurrence logic
- No-match cases - Verified zero result
- Different array/Non-invariant target - Verified safe fallback
---
 .../Transforms/SimplifyHLFIRIntrinsics.cpp    | 225 +++++++++++++-
 ...plify-hlfir-intrinsics-equality-maxloc.fir | 269 +++++++++++++++++
 ...plify-hlfir-intrinsics-equality-minloc.fir | 274 ++++++++++++++++++
 3 files changed, 764 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
 create mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index 7ff9dc61110d3..cf38bd61cfb51 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -32,6 +32,94 @@ namespace hlfir {
 
 #define DEBUG_TYPE "simplify-hlfir-intrinsics"
 
+namespace {
+// Returns true if `mask` is produced by an hlfir.elemental whose yielded value
+// is an integer equality (arith.cmpi eq) with exactly one operand defined
+// outside the elemental region (e.g., MASK = A == target), and whose other
+// operand traces back to the same base value as `searchArray`.
+// `targetVal` receives the invariant operand; note it is assigned before the
+// final base check, so it may be set even when false is returned.
+bool isEqualityMask(mlir::Value mask, mlir::Value searchArray,
+                    mlir::Value &targetVal) {
+  if (!mask)
+    return false;
+
+  // Peel as_expr/convert/declare/copy_in wrappers by following operand 0.
+  mlir::Value currentMask = mask;
+  while (auto def = currentMask.getDefiningOp()) {
+    if (!mlir::isa<hlfir::AsExprOp, fir::ConvertOp, hlfir::DeclareOp,
+                   hlfir::CopyInOp>(def))
+      break;
+    currentMask = def->getOperand(0);
+  }
+  // The peeled value must be the result of an hlfir.elemental.
+  auto elemental = currentMask.getDefiningOp<hlfir::ElementalOp>();
+  if (!elemental)
+    return false;
+
+  // Start from the value yielded by the elemental body's terminator.
+  mlir::Block &body = elemental.getRegion().front();
+  auto yieldOp = mlir::cast<hlfir::YieldElementOp>(body.getTerminator());
+  mlir::Value val = yieldOp.getElementValue();
+  // Skip fir.convert ops between the comparison and the yield.
+  while (auto conv = val.getDefiningOp<fir::ConvertOp>())
+    val = conv.getOperand();
+
+  // Only integer equality (arith.cmpi eq) is recognized.
+  auto cmpOp = val.getDefiningOp<mlir::arith::CmpIOp>();
+  if (!cmpOp || cmpOp.getPredicate() != mlir::arith::CmpIPredicate::eq)
+    return false;
+
+  // A value is invariant if it is a block argument not owned by a block
+  // directly in the elemental region, or an op result defined outside it.
+  auto isInvariant = [&](mlir::Value v) {
+    if (auto arg = mlir::dyn_cast<mlir::BlockArgument>(v))
+      return arg.getOwner()->getParent() != &elemental.getRegion();
+    if (auto *op = v.getDefiningOp())
+      return !elemental.getRegion().isAncestor(op->getParentRegion());
+    return true;
+  };
+
+  // Walk through declare/load/apply/designate/convert/as_expr to a base value.
+  auto getBase = [](mlir::Value v) -> mlir::Value {
+    while (v) {
+      mlir::Operation *def = v.getDefiningOp();
+      if (!def)
+        break;
+      if (auto decl = mlir::dyn_cast<hlfir::DeclareOp>(def))
+        v = decl.getMemref();
+      else if (auto load = mlir::dyn_cast<fir::LoadOp>(def))
+        v = load.getMemref();
+      else if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(def))
+        v = apply.getExpr();
+      else if (auto des = mlir::dyn_cast<hlfir::DesignateOp>(def))
+        v = des.getMemref();
+      else if (mlir::isa<fir::ConvertOp, hlfir::AsExprOp>(def))
+        v = def->getOperand(0);
+      else
+        break;
+    }
+    return v;
+  };
+
+  mlir::Value lhs = cmpOp.getLhs(), rhs = cmpOp.getRhs();
+  bool lhsInv = isInvariant(lhs), rhsInv = isInvariant(rhs);
+  // Require exactly one invariant side (the target); the other side is
+  // treated as the array element.
+  if (lhsInv == rhsInv)
+    return false;
+
+  targetVal = lhsInv ? lhs : rhs;
+  mlir::Value arraySide = lhsInv ? rhs : lhs;
+
+  // Both sides must trace to the same base value as the searched array.
+  if (getBase(arraySide) == getBase(searchArray))
+    return true;
+
+  return false;
+}
+} // end anonymous namespace
+
 static llvm::cl::opt<bool> forceMatmulAsElemental(
     "flang-inline-matmul-as-elemental",
     llvm::cl::desc("Expand hlfir.matmul as elemental operation"),
@@ -530,6 +618,15 @@ class MinMaxlocAsElementalConverter : public ReductionAsElementalConverter {
 
   void
   checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
+    mlir::Value targetVal;
+    // Check if the mask qualifies for the optimized equality mask search path.
+    if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
+                       targetVal)) {
+      // Expect coordinate indices.
+      assert(reductions.size() == getNumCoors() &&
+             "invalid number of reductions for equality mask MINLOC/MAXLOC");
+      return;
+    }
     if (!useIsFirst())
       assert(reductions.size() == getNumCoors() + 1 &&
              "invalid number of reductions for MINLOC/MAXLOC");
@@ -639,6 +736,51 @@ llvm::SmallVector<mlir::Value>
 MinMaxlocAsElementalConverter<T>::reduceOneElement(
     const llvm::SmallVectorImpl<mlir::Value> &currentValue, hlfir::Entity array,
     mlir::ValueRange oneBasedIndices) {
+  mlir::Value targetVal;
+  // If the mask is an equality comparison (e.g., MASK = A == target), inline
+  // the comparison to find the first occurrence efficiently.
+  if (isEqualityMask(this->getMask(), array, targetVal)) {
+    // Directly load the array element and compare with the targetVal.
+    hlfir::Entity elementValue =
+        hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
+    mlir::Value isMatch = mlir::arith::CmpIOp::create(
+        builder, loc, mlir::arith::CmpIPredicate::eq, (mlir::Value)elementValue,
+        targetVal);
+    // currentValue contains [Coord1, ..., CoordN, FirstHitBool]
+    mlir::Value firstHitBool = currentValue.back();
+    // shouldUpdate is true only if we have a match and we haven't found one
+    // yet.
+    mlir::Value shouldUpdate =
+        mlir::arith::AndIOp::create(builder, loc, isMatch, firstHitBool);
+    // Conditional Update: Only update coordinates if a match is found.
+    auto ifOp = fir::IfOp::create(builder, loc,
+                                  mlir::ValueRange(currentValue).getTypes(),
+                                  shouldUpdate, /*withElse=*/true);
+    // If match found and it's the first one, record coordinates.
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    llvm::SmallVector<mlir::Value> thenResults;
+    unsigned rank = array.getRank();
+    // Convert the current one-based indices to the coordinate result types.
+    for (unsigned i = 0; i < rank; ++i) {
+      mlir::Value loopIdx = builder.createConvert(
+          loc, currentValue[i].getType(), oneBasedIndices[i]);
+      thenResults.emplace_back(loopIdx);
+    }
+
+    // Update the flag: Set to 0 (False) for all future iterations.
+    mlir::Value falseVal =
+        mlir::arith::ConstantIntOp::create(builder, loc, 0, 1);
+    thenResults.emplace_back(falseVal);
+
+    fir::ResultOp::create(builder, loc, thenResults);
+
+    // No match or already found a previous match: maintain the current state.
+    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+    fir::ResultOp::create(builder, loc, currentValue);
+
+    builder.setInsertionPointAfter(ifOp);
+    return ifOp.getResults();
+  }
   checkReductions(currentValue);
   hlfir::Entity elementValue =
       hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
@@ -689,6 +831,49 @@ MinMaxlocAsElementalConverter<T>::reduceOneElement(
 template <typename T>
 hlfir::Entity MinMaxlocAsElementalConverter<T>::genFinalResult(
     const llvm::SmallVectorImpl<mlir::Value> &reductionResults) {
+  mlir::Value targetVal;
+  // Finalize results for the equality-mask search.
+  if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
+                     targetVal)) {
+    unsigned rank = getNumCoors();
+    mlir::Type resultElemTy =
+        hlfir::getFortranElementType(this->getResultType());
+    // MINLOC/MAXLOC returns an integer array of shape [rank].
+    // Manually build the HLFIR expression to hold the resulting coordinates.
+    llvm::SmallVector<int64_t> shapeVec{static_cast<int64_t>(rank)};
+    mlir::Type exprTy = hlfir::ExprType::get(builder.getContext(), shapeVec,
+                                             resultElemTy, false);
+    mlir::Value resRank =
+        builder.createIntegerConstant(loc, builder.getIndexType(), rank);
+    mlir::Value resShape = fir::ShapeOp::create(builder, loc, resRank);
+
+    // Create an elemental operation to map the scalar reduction results
+    // (coordinates) back into a Fortran array result.
+    auto elemental =
+        hlfir::ElementalOp::create(builder, loc, exprTy, resShape,
+                                   /*mold=*/mlir::Value{},
+                                   /*typeparams=*/mlir::ValueRange{},
+                                   /*isUnordered=*/false);
+    {
+      // Fill the elemental body.
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToStart(elemental.getBody());
+      // Map the 1-based elemental index, result[i] = reductionResults[i-1].
+      mlir::Value elemIdx = elemental.getIndices()[0];
+      mlir::Value resultVal = reductionResults[0];
+      for (unsigned i = 1; i < rank; ++i) {
+        mlir::Value dimConst =
+            builder.createIntegerConstant(loc, builder.getIndexType(), i + 1);
+        mlir::Value isDimMatch = mlir::arith::CmpIOp::create(
+            builder, loc, mlir::arith::CmpIPredicate::eq, elemIdx, dimConst);
+        // Select specific coordinate matching current elemental dimension.
+        resultVal = mlir::arith::SelectOp::create(
+            builder, loc, isDimMatch, reductionResults[i], resultVal);
+      }
+      hlfir::YieldElementOp::create(builder, loc, resultVal);
+    }
+    return hlfir::Entity{elemental.getResult()};
+  }
   // Identification of the final result of MINLOC/MAXLOC:
   //   * If DIM is absent, the result is rank-one array.
   //   * If DIM is present:
@@ -1184,9 +1369,39 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       extents.push_back(
           builder.createConvert(loc, builder.getIndexType(), dimExtent));
 
-    // Initial value for the reduction.
-    llvm::SmallVector<mlir::Value, 1> reductionInitValues =
-        genReductionInitValues(inputIndices, extents);
+    mlir::Value minMaxMask;
+    if (auto minloc = mlir::dyn_cast<hlfir::MinlocOp>(op)) {
+      minMaxMask = minloc.getMask();
+    } else if (auto maxloc = mlir::dyn_cast<hlfir::MaxlocOp>(op)) {
+      minMaxMask = maxloc.getMask();
+    }
+    mlir::Value targetVal;
+    bool isFixedSearch = false;
+    // Check if the mask allows for a simplified search optimization.
+    if (minMaxMask)
+      isFixedSearch =
+          isEqualityMask(minMaxMask, this->op->getOperand(0), targetVal);
+    llvm::SmallVector<mlir::Value, 1> reductionInitValues;
+    if (isFixedSearch) {
+      // For optimized equality searches, we skip the 'Min/Max value' reduction
+      // and only track coordinate indices and the firstHit flag.
+      unsigned rank = hlfir::Entity{array}.getRank();
+      mlir::Type resElemTy =
+          hlfir::getFortranElementType(this->getResultType());
+      mlir::Value zeroVal = builder.createIntegerConstant(loc, resElemTy, 0);
+
+      // Initialize all coordinates to 0.
+      for (unsigned i = 0; i < rank; ++i) {
+        reductionInitValues.emplace_back(zeroVal);
+      }
+      // First-hit flag, initially true: [Coord1, ..., CoordN, FirstHit=1].
+      mlir::Type i1Type = builder.getI1Type();
+      mlir::Value firstHitTrue = mlir::arith::ConstantOp::create(
+          builder, loc, i1Type, builder.getBoolAttr(true));
+      reductionInitValues.emplace_back(firstHitTrue);
+    } else {
+      reductionInitValues = genReductionInitValues(inputIndices, extents);
+    }
 
     auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
                        mlir::ValueRange oneBasedIndices,
@@ -1208,7 +1423,9 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       llvm::transform(reductionValues, std::back_inserter(reductionTypes),
                       [](mlir::Value v) { return v.getType(); });
       fir::IfOp ifOp;
-      if (mask) {
+      // Skip standard masking block in case of 'isFixedSearch', as it handles
+      // its own masking logic inside the comparison.
+      if (mask && !isFixedSearch) {
         // Make the reduction value update conditional on the value
         // of the mask.
         if (!maskValue) {
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
new file mode 100644
index 0000000000000..31925ae41467e
--- /dev/null
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
@@ -0,0 +1,269 @@
+// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
+
+// Rank 1: Variable: A == %target
+func.func @test_maxloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_1d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK-NOT: arith.constant -2147483648 : i32
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Variable: A == %target
+func.func @test_maxloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_2d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
+// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
+
+// Rank 3: Variable: A == %target
+func.func @test_maxloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_3d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
+
+// Rank 1: Constant: A == 42
+func.func @test_maxloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_1d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+
+// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Constant: A == 42
+func.func @test_maxloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_2d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
+
+// Rank 3: Constant: A == 42
+func.func @test_maxloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_3d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+
+// No Match: Result must be 0
+func.func @test_maxloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c99 = arith.constant 99 : i32 
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c99 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_no_match(
+// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
+// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %false
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// First Match: Duplicate values
+func.func @test_maxloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_first_match(
+// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
+// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// Verify mask elemental is bypassed
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// Verify the "Locking" logic: (Match 'and' is_first)
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// Verify that once a match is found, we result in %false to lock it
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// Negative test: Mask refers to a different array (%arg1) than the search 
+// array (%arg0).
+func.func @test_maxloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    // Optimization should fail here because %arg1 != %arg0
+    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val_b, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_different_arrays(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
+// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
+// Verify the loop uses three iter_args (standard path)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+// Verify the mask is applied (Since we can't inline it safely)
+// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL_A]], %[[MAX]]
+
+// Negative Test: The target value is another array, so it is not invariant.
+func.func @test_maxloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    // Optimization should fail here because %val_target is defined inside the elemental
+    %cmp = arith.cmpi eq, %val_a, %val_target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_maxloc_non_invariant_target(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
+// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
+// Verify the loop uses three iter_args (Standard path)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+// Verify the mask is still applied (because we couldn't inline the comparison)
+// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL]], %[[MAX]]
+
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
new file mode 100644
index 0000000000000..0bfa58968a2fe
--- /dev/null
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
@@ -0,0 +1,274 @@
+// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
+
+// Rank 1: Variable: A == %target
+func.func @test_minloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_1d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK-NOT: arith.constant 2147483647 : i32
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Variable: A == %target
+func.func @test_minloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_2d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
+// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
+
+// Rank 3: Variable: A == %target
+func.func @test_minloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_3d_equality_variable
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
+
+// Rank 1: Constant: A == 42
+func.func @test_minloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_1d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+
+// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
+// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
+// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
+
+// Rank 2: Constant: A == 42
+func.func @test_minloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
+  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
+  return %res : !hlfir.expr<2xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_2d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
+// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
+// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
+
+// Rank 3: Constant: A == 42
+func.func @test_minloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
+  %c42 = arith.constant 42 : i32
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
+  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index, %k: index):
+    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c42 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
+  return %res : !hlfir.expr<3xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_3d_equality_constant
+// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
+// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
+// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
+// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
+// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
+
+// No Match: Result must be 0
+func.func @test_minloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %c99 = arith.constant 99 : i32 
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %c99 : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_no_match(
+// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
+// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %false
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// First Match: Duplicate values
+func.func @test_minloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_first_match(
+// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
+// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
+// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
+// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
+// Verify mask elemental is bypassed
+// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
+// Verify the "Locking" logic: (Match AND is_first)
+// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
+// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
+// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
+// Verify that once a match is found, the first-hit flag is set to false to lock the result
+// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
+// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
+// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
+// CHECK:      } else {
+// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
+// CHECK:      }
+
+// Negative test: Mask refers to a different array (%arg1) than the search 
+// array (%arg0).
+func.func @test_minloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    // Optimization should fail here because %arg1 != %arg0
+    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %cmp = arith.cmpi eq, %val_b, %target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_different_arrays(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
+// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
+
+// 1. Verify the loop uses three iter_args (Standard path: Loc, MinVal, FirstHit)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MIN_VAL:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+
+// 2. Verify the mask IS still applied (Optimization correctly skipped)
+// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
+
+// 3. Verify the standard path's MINLOC comparison logic (slt instead of sgt)
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL_A]], %[[MIN_VAL]] : i32
+
+// Negative Test: The target value is another array, so it is not invariant.
+func.func @test_minloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
+  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
+  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%i: index):
+    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
+    // Optimization should fail here because %val_target is defined inside the 
+    // elemental
+    %cmp = arith.cmpi eq, %val_a, %val_target : i32
+    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %logical : !fir.logical<4>
+  }
+  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
+  return %res : !hlfir.expr<1xi32>
+}
+// CHECK-LABEL: func.func @test_minloc_non_invariant_target(
+// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
+// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
+// Verify the loop uses three iter_args (standard path)
+// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
+// Verify the mask is still applied (because we couldn't inline the comparison)
+// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
+// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
+// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
+// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL]], %[[MAX]]
+

>From 7220998307a734fdb174689e03b4620fd4fe5b29 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Thu, 26 Mar 2026 22:10:20 +0100
Subject: [PATCH 2/3] [flang][HLFIR] Relax InlineElementals to support more
 than two users

This patch updates the InlineElementals pass so that the hlfir.apply of a
mask hlfir.elemental can be inlined inside the loop even when the elemental
has more than two users. Previously, inlining was strictly restricted to
elementals with exactly two users (apply, destroy). Now the check is on the
uniqueness of the hlfir.apply site rather than on the total user count.
The transformation preserves the elemental producer if other users remain,
while still optimizing the scalar path within the loop.
---
 .../HLFIR/Transforms/InlineElementals.cpp     |  27 +-
 .../Transforms/SimplifyHLFIRIntrinsics.cpp    | 225 +-----------
 .../HLFIR/inline-elemental-multi-users.fir    | 342 ++++++++++++++++++
 ...plify-hlfir-intrinsics-equality-maxloc.fir | 269 --------------
 ...plify-hlfir-intrinsics-equality-minloc.fir | 274 --------------
 5 files changed, 362 insertions(+), 775 deletions(-)
 create mode 100644 flang/test/HLFIR/inline-elemental-multi-users.fir
 delete mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
 delete mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index ff84a3cff0afb..e3ced12dc93b3 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -35,12 +35,6 @@ namespace hlfir {
 /// a destroy operation, return those two, otherwise return {}
 static std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>>
 getTwoUses(hlfir::ElementalOp elemental) {
-  mlir::Operation::user_range users = elemental->getUsers();
-  // don't inline anything with more than one use (plus hfir.destroy)
-  if (std::distance(users.begin(), users.end()) != 2) {
-    return std::nullopt;
-  }
-
   // If the ElementalOp must produce a temporary (e.g. for
   // finalization purposes), then we cannot inline it.
   if (hlfir::elementalOpMustProduceTemp(elemental))
@@ -48,12 +42,21 @@ getTwoUses(hlfir::ElementalOp elemental) {
 
   hlfir::ApplyOp apply;
   hlfir::DestroyOp destroy;
-  for (mlir::Operation *user : users)
+  unsigned applyCount = 0;
+
+  for (mlir::Operation *user : elemental->getUsers()) {
     mlir::TypeSwitch<mlir::Operation *, void>(user)
-        .Case([&](hlfir::ApplyOp op) { apply = op; })
+        .Case([&](hlfir::ApplyOp op) {
+          apply = op;
+          applyCount++;
+        })
         .Case([&](hlfir::DestroyOp op) { destroy = op; });
+  }
 
-  if (!apply || !destroy)
+  // Only inline if there is a unique 'apply' site. Other users (such as
+  // intrinsic operations) are allowed because scalarizing the elemental
+  // renders the original array result redundant.
+  if (applyCount != 1 || !destroy)
     return std::nullopt;
 
   // we can't inline if the return type of the yield doesn't match the return
@@ -80,7 +83,7 @@ class InlineElementalConversion
         getTwoUses(elemental);
     if (!maybeTuple)
       return rewriter.notifyMatchFailure(
-          elemental, "hlfir.elemental does not have two uses");
+          elemental, "hlfir.elemental is not a candidate for inlining");
 
     if (elemental.isOrdered()) {
       // We can only inline the ordered elemental into a loop-like
@@ -104,7 +107,9 @@ class InlineElementalConversion
     rewriter.replaceOp(apply, {yield.getElementValue()});
     rewriter.eraseOp(yield);
     rewriter.eraseOp(destroy);
-    rewriter.eraseOp(elemental);
+    // Only erase the elemental if that was its last use.
+    if (elemental->use_empty())
+      rewriter.eraseOp(elemental);
 
     return mlir::success();
   }
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index cf38bd61cfb51..7ff9dc61110d3 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -32,94 +32,6 @@ namespace hlfir {
 
 #define DEBUG_TYPE "simplify-hlfir-intrinsics"
 
-namespace {
-// Check if the given mask is an equality comparison of the search array
-// against an invariant value (e.g., MASK = A == target) by traversing
-// HLFIR/FIR operations to find the underlying elemental comparison
-// and extract the invariant search targetVal.
-// It returns true if the mask is a simple equality comparison against a
-// scalar/invariant.
-bool isEqualityMask(mlir::Value mask, mlir::Value searchArray,
-                    mlir::Value &targetVal) {
-  if (!mask)
-    return false;
-
-  // Trace back HLFIR/FIR wrappers to get Elemental producer.
-  mlir::Value currentMask = mask;
-  while (auto def = currentMask.getDefiningOp()) {
-    if (!mlir::isa<hlfir::AsExprOp, fir::ConvertOp, hlfir::DeclareOp,
-                   hlfir::CopyInOp>(def))
-      break;
-    currentMask = def->getOperand(0);
-  }
-  // Ensure the mask is produced by an hlfir.elemental.
-  auto elemental = currentMask.getDefiningOp<hlfir::ElementalOp>();
-  if (!elemental)
-    return false;
-
-  // Inspect the elemental body to find the boolean result logic.
-  mlir::Block &body = elemental.getRegion().front();
-  auto yieldOp = mlir::cast<hlfir::YieldElementOp>(body.getTerminator());
-  mlir::Value val = yieldOp.getElementValue();
-  // Get core comparison, ignoring intermediate type casts.
-  while (auto conv = val.getDefiningOp<fir::ConvertOp>())
-    val = conv.getOperand();
-
-  // We currently only optimize integer equality (arith.cmpi eq).
-  auto cmpOp = val.getDefiningOp<mlir::arith::CmpIOp>();
-  if (!cmpOp || cmpOp.getPredicate() != mlir::arith::CmpIPredicate::eq)
-    return false;
-
-  // Determine if a value is invariant relative to the mask loop.
-  // Handles constants, function arguments, and values defined in outer scopes.
-  auto isInvariant = [&](mlir::Value v) {
-    if (auto arg = mlir::dyn_cast<mlir::BlockArgument>(v))
-      return arg.getOwner()->getParent() != &elemental.getRegion();
-    if (auto *op = v.getDefiningOp())
-      return !elemental.getRegion().isAncestor(op->getParentRegion());
-    return true;
-  };
-
-  // Trace the Array Side to the base buffer.
-  auto getBase = [](mlir::Value v) -> mlir::Value {
-    while (v) {
-      mlir::Operation *def = v.getDefiningOp();
-      if (!def)
-        break;
-      if (auto decl = mlir::dyn_cast<hlfir::DeclareOp>(def))
-        v = decl.getMemref();
-      else if (auto load = mlir::dyn_cast<fir::LoadOp>(def))
-        v = load.getMemref();
-      else if (auto apply = mlir::dyn_cast<hlfir::ApplyOp>(def))
-        v = apply.getExpr();
-      else if (auto des = mlir::dyn_cast<hlfir::DesignateOp>(def))
-        v = des.getMemref();
-      else if (mlir::isa<fir::ConvertOp, hlfir::AsExprOp>(def))
-        v = def->getOperand(0);
-      else
-        break;
-    }
-    return v;
-  };
-
-  mlir::Value lhs = cmpOp.getLhs(), rhs = cmpOp.getRhs();
-  bool lhsInv = isInvariant(lhs), rhsInv = isInvariant(rhs);
-  // The optimization is valid only if exactly one side is invariant (the
-  // target) and the other side is variant (the array element).
-  if (lhsInv == rhsInv)
-    return false;
-
-  targetVal = lhsInv ? lhs : rhs;
-  mlir::Value arraySide = lhsInv ? rhs : lhs;
-
-  // Verify the mask refers to the same array being searched.
-  if (getBase(arraySide) == getBase(searchArray))
-    return true;
-
-  return false;
-}
-} // end anonymous namespace
-
 static llvm::cl::opt<bool> forceMatmulAsElemental(
     "flang-inline-matmul-as-elemental",
     llvm::cl::desc("Expand hlfir.matmul as elemental operation"),
@@ -618,15 +530,6 @@ class MinMaxlocAsElementalConverter : public ReductionAsElementalConverter {
 
   void
   checkReductions(const llvm::SmallVectorImpl<mlir::Value> &reductions) const {
-    mlir::Value targetVal;
-    // Check if the mask qualifies for the optimized equality mask search path.
-    if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
-                       targetVal)) {
-      // Expect coordinate indices.
-      assert(reductions.size() == getNumCoors() &&
-             "invalid number of reductions for equality mask MINLOC/MAXLOC");
-      return;
-    }
     if (!useIsFirst())
       assert(reductions.size() == getNumCoors() + 1 &&
              "invalid number of reductions for MINLOC/MAXLOC");
@@ -736,51 +639,6 @@ llvm::SmallVector<mlir::Value>
 MinMaxlocAsElementalConverter<T>::reduceOneElement(
     const llvm::SmallVectorImpl<mlir::Value> &currentValue, hlfir::Entity array,
     mlir::ValueRange oneBasedIndices) {
-  mlir::Value targetVal;
-  // The mask is an equality comparison (e.g., MASK = A == target) inline the
-  // comparison to find the first occurrence efficiently.
-  if (isEqualityMask(this->getMask(), array, targetVal)) {
-    // Directly load the array element and compare with the targetVal.
-    hlfir::Entity elementValue =
-        hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
-    mlir::Value isMatch = mlir::arith::CmpIOp::create(
-        builder, loc, mlir::arith::CmpIPredicate::eq, (mlir::Value)elementValue,
-        targetVal);
-    // currentValue contains [Coord1, ..., CoordN, FirstHitBool]
-    mlir::Value firstHitBool = currentValue.back();
-    // shouldUpdate is true only if we have a match and we haven't found one
-    // yet.
-    mlir::Value shouldUpdate =
-        mlir::arith::AndIOp::create(builder, loc, isMatch, firstHitBool);
-    // Conditional Update: Only update coordinates if a match is found.
-    auto ifOp = fir::IfOp::create(builder, loc,
-                                  mlir::ValueRange(currentValue).getTypes(),
-                                  shouldUpdate, /*withElse=*/true);
-    // If match found and it's the first one, record coordinates.
-    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-    llvm::SmallVector<mlir::Value> thenResults;
-    unsigned rank = array.getRank();
-    // Get the firstHit flag.
-    for (unsigned i = 0; i < rank; ++i) {
-      mlir::Value loopIdx = builder.createConvert(
-          loc, currentValue[i].getType(), oneBasedIndices[i]);
-      thenResults.emplace_back(loopIdx);
-    }
-
-    // Update the flag: Set to 0 (False) for all future iterations.
-    mlir::Value falseVal =
-        mlir::arith::ConstantIntOp::create(builder, loc, 0, 1);
-    thenResults.emplace_back(falseVal);
-
-    fir::ResultOp::create(builder, loc, thenResults);
-
-    // No match or already found a previous match: maintain the current state.
-    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
-    fir::ResultOp::create(builder, loc, currentValue);
-
-    builder.setInsertionPointAfter(ifOp);
-    return ifOp.getResults();
-  }
   checkReductions(currentValue);
   hlfir::Entity elementValue =
       hlfir::loadElementAt(loc, builder, array, oneBasedIndices);
@@ -831,49 +689,6 @@ MinMaxlocAsElementalConverter<T>::reduceOneElement(
 template <typename T>
 hlfir::Entity MinMaxlocAsElementalConverter<T>::genFinalResult(
     const llvm::SmallVectorImpl<mlir::Value> &reductionResults) {
-  mlir::Value targetVal;
-  // Finalize results for the equality-mask search.
-  if (isEqualityMask(this->getMask(), mlir::cast<T>(this->op).getArray(),
-                     targetVal)) {
-    unsigned rank = getNumCoors();
-    mlir::Type resultElemTy =
-        hlfir::getFortranElementType(this->getResultType());
-    // MINLOC/MAXLOC returns an integer array of shape [rank].
-    // Manually build the HLFIR expression to hold the resulting coordinates.
-    llvm::SmallVector<int64_t> shapeVec{static_cast<int64_t>(rank)};
-    mlir::Type exprTy = hlfir::ExprType::get(builder.getContext(), shapeVec,
-                                             resultElemTy, false);
-    mlir::Value resRank =
-        builder.createIntegerConstant(loc, builder.getIndexType(), rank);
-    mlir::Value resShape = fir::ShapeOp::create(builder, loc, resRank);
-
-    // Create an elemental operation to map the scalar reduction results
-    // (coordinates) back into a Fortran array result.
-    auto elemental =
-        hlfir::ElementalOp::create(builder, loc, exprTy, resShape,
-                                   /*mold=*/mlir::Value{},
-                                   /*typeparams=*/mlir::ValueRange{},
-                                   /*isUnordered=*/false);
-    {
-      // Fill the elemental body.
-      mlir::OpBuilder::InsertionGuard guard(builder);
-      builder.setInsertionPointToStart(elemental.getBody());
-      // Map the 1-based elemental index, result[i] = reductionResults[i-1].
-      mlir::Value elemIdx = elemental.getIndices()[0];
-      mlir::Value resultVal = reductionResults[0];
-      for (unsigned i = 1; i < rank; ++i) {
-        mlir::Value dimConst =
-            builder.createIntegerConstant(loc, builder.getIndexType(), i + 1);
-        mlir::Value isDimMatch = mlir::arith::CmpIOp::create(
-            builder, loc, mlir::arith::CmpIPredicate::eq, elemIdx, dimConst);
-        // Select specific coordinate matching current elemental dimension.
-        resultVal = mlir::arith::SelectOp::create(
-            builder, loc, isDimMatch, reductionResults[i], resultVal);
-      }
-      hlfir::YieldElementOp::create(builder, loc, resultVal);
-    }
-    return hlfir::Entity{elemental.getResult()};
-  }
   // Identification of the final result of MINLOC/MAXLOC:
   //   * If DIM is absent, the result is rank-one array.
   //   * If DIM is present:
@@ -1369,39 +1184,9 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       extents.push_back(
           builder.createConvert(loc, builder.getIndexType(), dimExtent));
 
-    mlir::Value minMaxMask;
-    if (auto minloc = mlir::dyn_cast<hlfir::MinlocOp>(op)) {
-      minMaxMask = minloc.getMask();
-    } else if (auto maxloc = mlir::dyn_cast<hlfir::MaxlocOp>(op)) {
-      minMaxMask = maxloc.getMask();
-    }
-    mlir::Value targetVal;
-    bool isFixedSearch = false;
-    // Check if the mask allows for a simplified search optimization.
-    if (minMaxMask)
-      isFixedSearch =
-          isEqualityMask(minMaxMask, this->op->getOperand(0), targetVal);
-    llvm::SmallVector<mlir::Value, 1> reductionInitValues;
-    if (isFixedSearch) {
-      // For optimized equality searches, we skip the 'Min/Max value' reduction
-      // and only track coordinate indices and the firstHit flag.
-      unsigned rank = hlfir::Entity{array}.getRank();
-      mlir::Type resElemTy =
-          hlfir::getFortranElementType(this->getResultType());
-      mlir::Value zeroVal = builder.createIntegerConstant(loc, resElemTy, 0);
-
-      // Initialize all coordinates to 0.
-      for (unsigned i = 0; i < rank; ++i) {
-        reductionInitValues.emplace_back(zeroVal);
-      }
-      // First hit flag: [Row, Col, FirstHit=1] (Size: 3)
-      mlir::Type i1Type = builder.getI1Type();
-      mlir::Value firstHitTrue = mlir::arith::ConstantOp::create(
-          builder, loc, i1Type, builder.getBoolAttr(true));
-      reductionInitValues.emplace_back(firstHitTrue);
-    } else {
-      reductionInitValues = genReductionInitValues(inputIndices, extents);
-    }
+    // Initial value for the reduction.
+    llvm::SmallVector<mlir::Value, 1> reductionInitValues =
+        genReductionInitValues(inputIndices, extents);
 
     auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
                        mlir::ValueRange oneBasedIndices,
@@ -1423,9 +1208,7 @@ mlir::LogicalResult ReductionAsElementalConverter::convert() {
       llvm::transform(reductionValues, std::back_inserter(reductionTypes),
                       [](mlir::Value v) { return v.getType(); });
       fir::IfOp ifOp;
-      // Skip standard masking block in case of 'isFixedSearch', as it handles
-      // its own masking logic inside the comparison.
-      if (mask && !isFixedSearch) {
+      if (mask) {
         // Make the reduction value update conditional on the value
         // of the mask.
         if (!maskValue) {
diff --git a/flang/test/HLFIR/inline-elemental-multi-users.fir b/flang/test/HLFIR/inline-elemental-multi-users.fir
new file mode 100644
index 0000000000000..f8b195e637091
--- /dev/null
+++ b/flang/test/HLFIR/inline-elemental-multi-users.fir
@@ -0,0 +1,342 @@
+// RUN: fir-opt --inline-elementals %s | FileCheck %s
+
+// Test inlining of hlfir.elemental into its hlfir.apply site when the 
+// elemental has more than two users.
+
+// Check successful inlining where hlfir.elemental survives because the
+// 'associate' op is still using it.
+func.func @test_inlining_use_mask(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
+
+  // Elemental Mask.
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+    %load = fir.load %val : !fir.ref<i32>
+    %ref = fir.load %arg1 : !fir.ref<i32>
+    %cmp = arith.cmpi eq, %load, %ref : i32
+    %res = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %res : !fir.logical<4>
+  }
+
+  // Extra User - This keeps the elemental alive even after inlining the apply
+  // site. Total uses = 3 (associate, apply, destroy).
+  %extra:3 = hlfir.associate %mask : (!hlfir.expr<10x10x!fir.logical<4>>) -> (!fir.box<!fir.array<10x10x!fir.logical<4>>>, !fir.ref<!fir.array<10x10x!fir.logical<4>>>, i1)
+
+  // Apply inside a loop
+  fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg3 = %c1 to %c10 step %c1 {
+      %applied = hlfir.apply %mask, %arg3, %arg2 : (!hlfir.expr<10x10x!fir.logical<4>>, index, index) -> !fir.logical<4>
+      %dummy_ref = fir.alloca !fir.logical<4>
+      fir.store %applied to %dummy_ref : !fir.ref<!fir.logical<4>>
+    }
+ }
+
+  hlfir.end_associate %extra#0, %extra#2 : !fir.box<!fir.array<10x10x!fir.logical<4>>>, i1
+  hlfir.destroy %mask : !hlfir.expr<10x10x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL: func.func @test_inlining_use_mask
+// CHECK: %[[MASK:.*]] = hlfir.elemental
+// CHECK: hlfir.associate %[[MASK]]
+// CHECK: fir.do_loop
+// CHECK:   fir.do_loop
+// CHECK-NOT: hlfir.apply
+// CHECK:     %[[VAL:.*]] = hlfir.designate %arg0
+// CHECK:     %[[LOAD:.*]] = fir.load %[[VAL]]
+// CHECK:     %[[REF:.*]] = fir.load %arg1
+// CHECK:     %[[CMP:.*]] = arith.cmpi eq, %[[LOAD]], %[[REF]]
+// CHECK:     %[[RES:.*]] = fir.convert %[[CMP]]
+// CHECK:     fir.store %[[RES]]
+// CHECK-NOT: hlfir.destroy %[[MASK]]
+
+// Test elemental removal as use_count becomes zero.
+func.func @test_inlining_elemental_cleanup(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
+
+  // Elemental Mask.
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+    %load = fir.load %val : !fir.ref<i32>
+    %ref = fir.load %arg1 : !fir.ref<i32>
+    %cmp = arith.cmpi eq, %load, %ref : i32
+    %res = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %res : !fir.logical<4>
+  }
+
+  %extra = fir.convert %mask : (!hlfir.expr<10x10x!fir.logical<4>>) -> !hlfir.expr<10x10x!fir.logical<4>>
+
+  // Apply Site.
+  fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg3 = %c1 to %c10 step %c1 {
+      %applied = hlfir.apply %mask, %arg3, %arg2 : (!hlfir.expr<10x10x!fir.logical<4>>, index, index) -> !fir.logical<4>
+      %dummy_ref = fir.alloca !fir.logical<4>
+      fir.store %applied to %dummy_ref : !fir.ref<!fir.logical<4>>
+    }
+  }
+
+  hlfir.destroy %mask : !hlfir.expr<10x10x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL: func.func @test_inlining_elemental_cleanup
+// CHECK-NOT:     hlfir.elemental
+// CHECK-NOT:     fir.convert
+// CHECK:         fir.do_loop
+// CHECK:           fir.do_loop
+// CHECK-NOT:         hlfir.apply
+// CHECK:             arith.cmpi eq
+// CHECK-NOT:     hlfir.destroy
+
+// Check that inlining is blocked when there is more than one hlfir.apply
+// site for the same elemental.
+func.func @test_multi_apply_no_inlining(%arg0: !hlfir.expr<?xi32>, %target: i32, %shape: !fir.shape<1>) -> (i1, i1) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+
+  // Producer (Elemental)
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+    ^bb0(%i: index):
+      %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
+      %cmp = arith.cmpi eq, %val, %target : i32
+      %log = fir.convert %cmp : (i1) -> !fir.logical<4>
+      hlfir.yield_element %log : !fir.logical<4>
+  }
+
+  // First Apply Site.
+  %apply1 = hlfir.apply %mask, %c1 : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+  %cond1 = fir.convert %apply1 : (!fir.logical<4>) -> i1
+
+  // Second Apply Site.
+  %apply2 = hlfir.apply %mask, %c2 : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+  %cond2 = fir.convert %apply2 : (!fir.logical<4>) -> i1
+
+  // Destroy.
+  hlfir.destroy %mask : !hlfir.expr<?x!fir.logical<4>>
+
+  return %cond1, %cond2 : i1, i1
+}
+// CHECK-LABEL: func.func @test_multi_apply_no_inlining(
+// CHECK-SAME:  %[[ARG0:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32, %[[SHAPE:.*]]: !fir.shape<1>)
+// CHECK:       %[[MASK:.*]] = hlfir.elemental %[[SHAPE]]
+// CHECK:       %[[A1:.*]] = hlfir.apply %[[MASK]], %{{.*}}
+// CHECK:       %[[A2:.*]] = hlfir.apply %[[MASK]], %{{.*}}
+// CHECK:       hlfir.destroy %[[MASK]]
+
+// Check inlining an elemental mask into an hlfir.apply inside a loop nest
+// while a fir.store user keeps the elemental alive (mask = arg0 == arg1).
+// Declare a global symbol to store the intermediate mask.
+fir.global @mask_storage : !hlfir.expr<10x10x!fir.logical<4>>
+func.func @test_nested_elemental(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
+
+  // The Elemental Mask (arg0(i, j) == arg1)
+  %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
+  ^bb0(%i: index, %j: index):
+    %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+    %load = fir.load %val : !fir.ref<i32>
+    %ref = fir.load %arg1 : !fir.ref<i32>
+    %cmp = arith.cmpi eq, %load, %ref : i32
+    %res = fir.convert %cmp : (i1) -> !fir.logical<4>
+    hlfir.yield_element %res : !fir.logical<4>
+  }
+
+  // Total users - 1. fir.store, 2. hlfir.apply, 3. hlfir.destroy.
+  %ptr = fir.address_of(@mask_storage) : !fir.ref<!hlfir.expr<10x10x!fir.logical<4>>>
+  fir.store %mask to %ptr : !fir.ref<!hlfir.expr<10x10x!fir.logical<4>>>
+
+  // Target loop using the mask.
+  fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg3 = %c1 to %c10 step %c1 {
+      // CHECK-NOT: hlfir.apply
+      %applied = hlfir.apply %mask, %arg3, %arg2 : (!hlfir.expr<10x10x!fir.logical<4>>, index, index) -> !fir.logical<4>
+      %dummy_ref = fir.alloca !fir.logical<4>
+      fir.store %applied to %dummy_ref : !fir.ref<!fir.logical<4>>
+    }
+  }
+
+  hlfir.destroy %mask : !hlfir.expr<10x10x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL: func.func @test_nested_elemental
+// CHECK:         hlfir.elemental
+// CHECK-NOT:       hlfir.apply
+// CHECK:           arith.cmpi eq
+// CHECK-NOT:         hlfir.destroy
+
+// Inlining into a single hlfir.apply.
+// a = (b * c)(1)
+func.func @test_scalar_apply_inline(%b: !fir.box<!fir.array<?xf32>>, %c: !fir.box<!fir.array<?xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  // Elemental (b * c)
+  %prod = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+  ^bb0(%i: index):
+    %b_ref = hlfir.designate %b (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %b_val = fir.load %b_ref : !fir.ref<f32>
+    %c_ref = hlfir.designate %c (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %c_val = fir.load %c_ref : !fir.ref<f32>
+    %mul = arith.mulf %b_val, %c_val : f32
+    hlfir.yield_element %mul : f32
+  }
+
+  // Total users = 3 (store, apply, destroy).
+  %ptr = fir.address_of(@scalar_storage) : !fir.ref<!hlfir.expr<10xf32>>
+  fir.store %prod to %ptr : !fir.ref<!hlfir.expr<10xf32>>
+
+  // Scalar apply site - a = (b * c)(1)
+  // CHECK-NOT: hlfir.apply
+  %scalar_val = hlfir.apply %prod, %c1 : (!hlfir.expr<10xf32>, index) -> f32
+  
+  // Use the scalar result.
+  %dummy_ref = fir.alloca f32
+  fir.store %scalar_val to %dummy_ref : !fir.ref<f32>
+
+  hlfir.destroy %prod : !hlfir.expr<10xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_scalar_apply_inline
+// CHECK:         %[[MASK:.*]] = hlfir.elemental
+// CHECK:         fir.store %[[MASK]] to {{.*}}
+// CHECK-NOT:     hlfir.apply
+// CHECK:         arith.mulf
+// CHECK-NOT:     hlfir.destroy
+
+// Check long chains of elementals.
+// subroutine reproducer(a)
+//   real, dimension(:) :: a
+//   a = sqrt(a * (a - 1))
+// end subroutine
+func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) {
+  %c0 = arith.constant 0 : index
+  %f1 = arith.constant 1.0 : f32
+  %0:2 = hlfir.declare %arg0 {uniq_name = "_QFreproducerEa"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
+
+  // tmp1 = a - 1
+  %tmp1 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%i: index):
+    %a_ref = hlfir.designate %0#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %a_val = fir.load %a_ref : !fir.ref<f32>
+    %sub = arith.subf %a_val, %f1 : f32
+    hlfir.yield_element %sub : f32
+  }
+
+  %dummy = hlfir.no_reassoc %tmp1 : !hlfir.expr<?xf32>
+
+  // tmp2 = a * tmp1
+  %tmp2 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%j: index):
+    %a_ref_2 = hlfir.designate %0#0 (%j) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %t1_val = hlfir.apply %tmp1, %j : (!hlfir.expr<?xf32>, index) -> f32
+    %a_val_2 = fir.load %a_ref_2 : !fir.ref<f32>
+    %mul = arith.mulf %a_val_2, %t1_val : f32
+    hlfir.yield_element %mul : f32
+  }
+
+  // tmp3 = sqrt(tmp2)
+  %tmp3 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%k: index):
+    %t2_val = hlfir.apply %tmp2, %k : (!hlfir.expr<?xf32>, index) -> f32
+    %res = math.sqrt %t2_val : f32
+    hlfir.yield_element %res : f32
+  }
+
+  // Final assignment.
+  hlfir.assign %tmp3 to %0#0 : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
+
+  hlfir.destroy %tmp3 : !hlfir.expr<?xf32>
+  hlfir.destroy %tmp2 : !hlfir.expr<?xf32>
+  hlfir.destroy %dummy: !hlfir.expr<?xf32>
+  hlfir.destroy %tmp1 : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL: func.func @_QPreproducer
+// CHECK:         %[[TMP1:.*]] = hlfir.elemental
+// CHECK:         hlfir.no_reassoc %[[TMP1]]
+// CHECK-NOT:     hlfir.apply
+// CHECK-DAG:     arith.subf
+// CHECK-DAG:     arith.mulf
+// CHECK-DAG:     math.sqrt
+// CHECK:         hlfir.assign
+// The apply site was inlined, so the elemental's lifecycle (destroy) 
+// is removed even though metadata users like no_reassoc remain.
+// CHECK-NOT:     hlfir.destroy %[[TMP1]]
+
+// Check that the ordered elemental is not inlined into another:
+// a = (b + c) + (b + c) (where b + c is ordered)
+func.func private @persistent_user(!hlfir.expr<?xf32>)
+func.func @test_noinline_ordered(%arg0: !hlfir.expr<?xf32>, %arg1: !hlfir.expr<?xf32>, %shape: !fir.shape<1>) {
+  %c1 = arith.constant 1 : index
+
+  // Producer (b + c) - ordered
+  %el_a = hlfir.elemental %shape {ordered = true} : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+    ^bb0(%i: index):
+      %0 = hlfir.apply %arg0, %i : (!hlfir.expr<?xf32>, index) -> f32
+      %1 = hlfir.apply %arg1, %i : (!hlfir.expr<?xf32>, index) -> f32
+      %sum = arith.addf %0, %1 : f32
+      hlfir.yield_element %sum : f32
+  }
+
+  fir.call @persistent_user(%el_a) : (!hlfir.expr<?xf32>) -> ()
+
+  // Consumer (el_a + el_a)
+  %el_b = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+    ^bb0(%j: index):
+      // This apply must remain because el_a is ordered.
+      %a_val = hlfir.apply %el_a, %j : (!hlfir.expr<?xf32>, index) -> f32
+      %total = arith.addf %a_val, %a_val : f32
+      hlfir.yield_element %total : f32
+  }
+
+  hlfir.destroy %el_a : !hlfir.expr<?xf32>
+  %val = hlfir.apply %el_b, %c1 : (!hlfir.expr<?xf32>, index) -> f32
+  hlfir.destroy %el_b : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_noinline_ordered
+// CHECK:         %[[PRODUCER:.*]] = hlfir.elemental %{{.*}} {ordered = true}
+// CHECK:         fir.call @persistent_user(%[[PRODUCER]])
+// CHECK:         %[[CONSUMER:.*]] = hlfir.elemental
+// CHECK:           hlfir.apply %[[PRODUCER]], %{{.*}}
+// CHECK:         hlfir.destroy %[[PRODUCER]]
+
+// Check that the elemental is not inlined, because its array result
+// must be finalized.
+func.func @test_noinline_due_to_finalization(%arg0: !fir.box<!fir.array<?x!fir.type<_QMtypesTt1{x:f32}>>>, %shape: !fir.shape<1>) {
+  %c1 = arith.constant 1 : index
+
+  // Producer - Derived-type Elemental.
+  %el = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>> {
+  ^bb0(%i: index):
+    %res = fir.alloca !fir.type<_QMtypesTt1{x:f32}>
+    %ld = fir.load %res : !fir.ref<!fir.type<_QMtypesTt1{x:f32}>>
+    hlfir.yield_element %ld : !fir.type<_QMtypesTt1{x:f32}>
+  }
+
+  fir.call @persistent_user(%el) : (!hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>>) -> ()
+
+  // This apply must remain because the elemental result requires finalization.
+  %res_ptr = fir.alloca !fir.type<_QMtypesTt1{x:f32}>
+  %apply = hlfir.apply %el, %c1 : (!hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>>, index) -> !hlfir.expr<!fir.type<_QMtypesTt1{x:f32}>>
+  hlfir.assign %apply to %res_ptr : !hlfir.expr<!fir.type<_QMtypesTt1{x:f32}>>, !fir.ref<!fir.type<_QMtypesTt1{x:f32}>>
+
+  // Destroy with 'finalize' keyword, hlfir::elementalOpMustProduceTemp becomes
+  // true.
+  hlfir.destroy %el finalize : !hlfir.expr<?x!fir.type<_QMtypesTt1{x:f32}>>
+  return
+}
+// CHECK-LABEL: func.func @test_noinline_due_to_finalization
+// CHECK:         %[[EL:.*]] = hlfir.elemental
+// CHECK:         fir.call @persistent_user(%[[EL]])
+// CHECK:         %[[APPLY:.*]] = hlfir.apply %[[EL]], %{{.*}}
+// CHECK:         hlfir.destroy %[[EL]] finalize
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
deleted file mode 100644
index 31925ae41467e..0000000000000
--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-maxloc.fir
+++ /dev/null
@@ -1,269 +0,0 @@
-// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
-
-// Rank 1: Variable: A == %target
-func.func @test_maxloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_1d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK-NOT: arith.constant -2147483648 : i32
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Variable: A == %target
-func.func @test_maxloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_2d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
-// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
-
-// Rank 3: Variable: A == %target
-func.func @test_maxloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_3d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
-
-// Rank 1: Constant: A == 42
-func.func @test_maxloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_1d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-
-// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Constant: A == 42
-func.func @test_maxloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_2d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
-
-// Rank 3: Constant: A == 42
-func.func @test_maxloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_3d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-
-// No Match: Result must be 0
-func.func @test_maxloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c99 = arith.constant 99 : i32 
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c99 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_no_match(
-// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
-// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %false
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// First Match: Duplicate values
-func.func @test_maxloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_first_match(
-// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
-// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// Verify mask elemental is bypassed
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// Verify the "Locking" logic: (Match 'and' is_first)
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// Verify that once a match is found, we result in %false to lock it
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// Negative test: Mask refers to a different array (%arg1) than the search 
-// array (%arg0).
-func.func @test_maxloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    // Optimization should fail here because %arg1 != %arg0
-    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val_b, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_different_arrays(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
-// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
-// Verify the loop uses three iter_args (standard path)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-// Verify the mask is applied (Since we can't inline it safely)
-// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL_A]], %[[MAX]]
-
-// Negative Test: The target value is another array, so it is not invariant.
-func.func @test_maxloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    // Optimization should fail here because %val_target is defined inside the elemental
-    %cmp = arith.cmpi eq, %val_a, %val_target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.maxloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_maxloc_non_invariant_target(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
-// CHECK: %[[SENTINEL:.*]] = arith.constant -2147483648 : i32
-// Verify the loop uses three iter_args (Standard path)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-// Verify the mask is still applied (because we couldn't inline the comparison)
-// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi sgt, %[[VAL]], %[[MAX]]
-
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
deleted file mode 100644
index 0bfa58968a2fe..0000000000000
--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-equality-minloc.fir
+++ /dev/null
@@ -1,274 +0,0 @@
-// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
-
-// Rank 1: Variable: A == %target
-func.func @test_minloc_1d_equality_variable(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_1d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK-NOT: arith.constant 2147483647 : i32
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %{{.*}}
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Variable: A == %target
-func.func @test_minloc_2d_equality_variable(%arg0: !hlfir.expr<?x?xi32>, %target: i32) -> !hlfir.expr<2xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_2d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[RES_OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[OUT1:.*]] = %[[C0]], %[[OUT2:.*]] = %[[C0]], %[[OUT3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[RES_INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[IN1:.*]] = %[[OUT1]], %[[IN2:.*]] = %[[OUT2]], %[[IN3:.*]] = %[[OUT3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[IN3]]
-// CHECK:     %[[IF:.*]]:3 = fir.if %[[COND]] -> (i32, i32, i1)
-
-// Rank 3: Variable: A == %target
-func.func @test_minloc_3d_equality_variable(%arg0: !hlfir.expr<?x?x?xi32>, %target: i32) -> !hlfir.expr<3xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_3d_equality_variable
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:4 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MIDDLE:.*]]:4 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INNER:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, {{.*}}
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-// CHECK:       %[[IF:.*]]:4 = fir.if %[[COND]] -> (i32, i32, i32, i1)
-
-// Rank 1: Constant: A == 42
-func.func @test_minloc_1d_equality_constant(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_1d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-
-// CHECK: %[[RES:.*]]:2 = fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:   %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV]]
-// CHECK:   %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:   %[[COND:.*]] = arith.andi %[[EQ]], %[[FIRST]]
-// CHECK:   %[[IF:.*]]:2 = fir.if %[[COND]] -> (i32, i1)
-
-// Rank 2: Constant: A == 42
-func.func @test_minloc_2d_equality_constant(%arg0: !hlfir.expr<?x?xi32>) -> !hlfir.expr<2xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?xi32>) -> !fir.shape<2>
-  %mask = hlfir.elemental %shape : (!fir.shape<2>) -> !hlfir.expr<?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index):
-    %val = hlfir.apply %arg0, %i, %j : (!hlfir.expr<?x?xi32>, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?xi32>, !hlfir.expr<?x?x!fir.logical<4>>) -> !hlfir.expr<2xi32>
-  return %res : !hlfir.expr<2xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_2d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUTER:.*]]:3 = fir.do_loop %[[IV1:.*]] = {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[TRUE]]) -> (i32, i32, i1)
-// CHECK:   %[[INNER:.*]]:3 = fir.do_loop %[[IV2:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[O1]], %[[I2:.*]] = %[[O2]], %[[I3:.*]] = %[[O3]]) -> (i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:     %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV2]], %[[IV1]]
-// CHECK:     %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:     %[[COND:.*]] = arith.andi %[[EQ]], %[[I3]]
-
-// Rank 3: Constant: A == 42
-func.func @test_minloc_3d_equality_constant(%arg0: !hlfir.expr<?x?x?xi32>) -> !hlfir.expr<3xi32> {
-  %c42 = arith.constant 42 : i32
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?x?x?xi32>) -> !fir.shape<3>
-  %mask = hlfir.elemental %shape : (!fir.shape<3>) -> !hlfir.expr<?x?x?x!fir.logical<4>> {
-  ^bb0(%i: index, %j: index, %k: index):
-    %val = hlfir.apply %arg0, %i, %j, %k : (!hlfir.expr<?x?x?xi32>, index, index, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c42 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?x?x?xi32>, !hlfir.expr<?x?x?x!fir.logical<4>>) -> !hlfir.expr<3xi32>
-  return %res : !hlfir.expr<3xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_3d_equality_constant
-// CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
-// CHECK: %[[OUT:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[O1:.*]] = %[[C0]], %[[O2:.*]] = %[[C0]], %[[O3:.*]] = %[[C0]], %[[O4:.*]] = %[[TRUE]]) -> (i32, i32, i32, i1)
-// CHECK:   %[[MID:.*]]:4 = fir.do_loop {{.*}} iter_args(%[[M1:.*]] = %[[O1]], %[[M2:.*]] = %[[O2]], %[[M3:.*]] = %[[O3]], %[[M4:.*]] = %[[O4]]) -> (i32, i32, i32, i1)
-// CHECK:     %[[INN:.*]]:4 = fir.do_loop %[[IV3:.*]] = {{.*}} iter_args(%[[I1:.*]] = %[[M1]], %[[I2:.*]] = %[[M2]], %[[I3:.*]] = %[[M3]], %[[I4:.*]] = %[[M4]]) -> (i32, i32, i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:       %[[VAL:.*]] = hlfir.apply %{{.*}}, %[[IV3]], %{{.*}}, %{{.*}}
-// CHECK:       %[[EQ:.*]] = arith.cmpi eq, %[[VAL]], %[[C42]]
-// CHECK:       %[[COND:.*]] = arith.andi %[[EQ]], %[[I4]]
-
-// No Match: Result must be 0
-func.func @test_minloc_no_match(%arg0: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %c99 = arith.constant 99 : i32 
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %c99 : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_no_match(
-// CHECK-SAME: %[[ARRAY_NM:.*]]: !hlfir.expr<?xi32>)
-// CHECK-DAG:    %[[C99:.*]] = arith.constant 99 : i32
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_NM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[C99]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %false
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// First Match: Duplicate values
-func.func @test_minloc_first_match(%arg0: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_first_match(
-// CHECK-SAME: %[[ARRAY_FM:.*]]: !hlfir.expr<?xi32>, %[[TARGET_FM:.*]]: i32)
-// CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : i32
-// CHECK-DAG:    %[[TRUE:.*]] = arith.constant true
-// CHECK-DAG:    %[[FALSE:.*]] = arith.constant false
-// Verify loop has only 2 iter_args (Coord, FirstHitFlag)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = %[[C0]], %[[FIRST:.*]] = %[[TRUE]]) -> (i32, i1)
-// Verify mask elemental is bypassed
-// CHECK-NOT: hlfir.apply {{.*}} !fir.logical<4>
-// Verify the "Locking" logic: (Match AND is_first)
-// CHECK:      %[[VAL:.*]] = hlfir.apply %[[ARRAY_FM]], %[[IV]]
-// CHECK:      %[[MATCH:.*]] = arith.cmpi eq, %[[VAL]], %[[TARGET_FM]] : i32
-// CHECK:      %[[COND:.*]] = arith.andi %[[MATCH]], %[[FIRST]] : i1
-// Verify that once a match is found, we result in %false to lock it
-// CHECK:      %[[IF_RES:.*]]:2 = fir.if %[[COND]] -> (i32, i1) {
-// CHECK:        %[[CONV:.*]] = fir.convert %[[IV]]
-// CHECK:        fir.result %[[CONV]], %[[FALSE]] : i32, i1
-// CHECK:      } else {
-// CHECK:        fir.result %[[LOC]], %[[FIRST]] : i32, i1
-// CHECK:      }
-
-// Negative test: Mask refers to a different array (%arg1) than the search 
-// array (%arg0).
-func.func @test_minloc_different_arrays(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>, %target: i32) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    // Optimization should fail here because %arg1 != %arg0
-    %val_b = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %cmp = arith.cmpi eq, %val_b, %target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_different_arrays(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>, %[[TARGET:.*]]: i32)
-// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
-
-// 1. Verify the loop uses three iter_args (Standard path: Loc, MinVal, FirstHit)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MIN_VAL:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-
-// 2. Verify the mask IS still applied (Optimization correctly skipped)
-// CHECK: %[[MASK_VAL:.*]] = hlfir.apply {{.*}} : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_VAL]] : (!fir.logical<4>) -> i1
-
-// 3. Verify the standard path's MINLOC comparison logic (slt instead of sgt)
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL_A:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL_A]], %[[MIN_VAL]] : i32
-
-// Negative Test: The target value is another array, so it is not invariant.
-func.func @test_minloc_non_invariant_target(%arg0: !hlfir.expr<?xi32>, %arg1: !hlfir.expr<?xi32>) -> !hlfir.expr<1xi32> {
-  %shape = hlfir.shape_of %arg0 : (!hlfir.expr<?xi32>) -> !fir.shape<1>
-  %mask = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
-  ^bb0(%i: index):
-    %val_a = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
-    %val_target = hlfir.apply %arg1, %i : (!hlfir.expr<?xi32>, index) -> i32
-    // Optimization should fail here because %val_target is defined inside the 
-    // elemental
-    %cmp = arith.cmpi eq, %val_a, %val_target : i32
-    %logical = fir.convert %cmp : (i1) -> !fir.logical<4>
-    hlfir.yield_element %logical : !fir.logical<4>
-  }
-  %res = hlfir.minloc %arg0 mask %mask : (!hlfir.expr<?xi32>, !hlfir.expr<?x!fir.logical<4>>) -> !hlfir.expr<1xi32>
-  return %res : !hlfir.expr<1xi32>
-}
-// CHECK-LABEL: func.func @test_minloc_non_invariant_target(
-// CHECK-SAME: %[[ARRAY_A:.*]]: !hlfir.expr<?xi32>, %[[ARRAY_B:.*]]: !hlfir.expr<?xi32>)
-// CHECK: %[[SENTINEL:.*]] = arith.constant 2147483647 : i32
-// Verify the loop uses three iter_args (standard path)
-// CHECK: fir.do_loop %[[IV:.*]] = {{.*}} iter_args(%[[LOC:.*]] = {{.*}}, %[[MAX:.*]] = %[[SENTINEL]], %[[FIRST:.*]] = {{.*}}) -> (i32, i32, i1)
-// Verify the mask is still applied (because we couldn't inline the comparison)
-// CHECK: %[[MASK_BIT:.*]] = hlfir.apply %{{.*}}, %[[IV]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
-// CHECK: %[[MASK_I1:.*]] = fir.convert %[[MASK_BIT]] : (!fir.logical<4>) -> i1
-// CHECK: fir.if %[[MASK_I1]] -> (i32, i32, i1) {
-// CHECK:   %[[VAL:.*]] = hlfir.apply %[[ARRAY_A]], %[[IV]]
-// CHECK:   %[[CMP:.*]] = arith.cmpi slt, %[[VAL]], %[[MAX]]
-

>From 0792d7eb062391599904b6993fc03ac3f4b59b74 Mon Sep 17 00:00:00 2001
From: "anoop.kumar6 at ibm.com" <anoopk at b35lp63.lnxne.boe>
Date: Wed, 1 Apr 2026 00:35:48 +0200
Subject: [PATCH 3/3] [flang][HLFIR] This patch adds a data-flow and
 memory-effect analysis to the InlineElementals pass to ensure semantic safety.
 It replaces the "two-use"   constraint with a worklist-based traversal to
 trace elemental results   through hlfir.declare and fir.convert. A new safety
 check, isSafeToInline,   uses AliasAnalysis and a recursive region walk to
 detect conflicting   writes between the producer and consumer, preventing
 unsafe inlining   across structured control flow.

---
 .../HLFIR/Transforms/InlineElementals.cpp     | 216 +++++++++++-
 .../HLFIR/inline-elemental-multi-users.fir    | 315 +++++++++++++++---
 2 files changed, 480 insertions(+), 51 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index e3ced12dc93b3..9a6ad46309947 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -16,9 +16,11 @@
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/Analysis/AliasAnalysis.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -31,10 +33,127 @@ namespace hlfir {
 #include "flang/Optimizer/HLFIR/Passes.h.inc"
 } // namespace hlfir
 
+/// Appends to `deps` the memrefs of every hlfir.designate / fir.load in the
+/// elemental body, plus reference/pointer/heap/box operands from other regions.
+static void getReadDependencies(hlfir::ElementalOp elemental,
+                                llvm::SmallVectorImpl<mlir::Value> &deps) {
+  elemental.getRegion().walk([&](mlir::Operation *op) {
+    if (auto designate = mlir::dyn_cast<hlfir::DesignateOp>(op))
+      deps.push_back(designate.getMemref());
+    else if (auto load = mlir::dyn_cast<fir::LoadOp>(op))
+      deps.push_back(load.getMemref());
+    // Ref/pointer/heap/box operands whose parent region is not the elemental's.
+    for (mlir::Value operand : op->getOperands()) {
+      if (operand.getParentRegion() != &elemental.getRegion())
+        if (mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
+                      fir::BoxType>(operand.getType()))
+          deps.push_back(operand);
+    }
+  });
+}
+
+/// Returns true if `op` (or any op nested in its regions) may write or free
+/// memory that may alias one of the elemental's read dependencies in `deps`.
+static bool isConflictingWrite(mlir::Operation *op,
+                               const llvm::SmallVectorImpl<mlir::Value> &deps,
+                               mlir::AliasAnalysis &aa) {
+  // Operations explicitly marked as having no memory effects are safe.
+  if (mlir::isMemoryEffectFree(op))
+    return false;
+
+  // Explicitly allow safe HLFIR/FIR metadata/lifetime operations.
+  // While these may have internal effects (e.g. allocating a descriptor),
+  // they do not modify the user data being read by the elemental.
+  if (mlir::isa<hlfir::DeclareOp, hlfir::AssociateOp, hlfir::EndAssociateOp,
+                fir::AllocaOp, hlfir::NoReassocOp>(op))
+    return false;
+
+  // Check for explicit memory effects via the MemoryEffectOpInterface.
+  if (auto memInterface = mlir::dyn_cast<mlir::MemoryEffectOpInterface>(op)) {
+    llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
+    memInterface.getEffects(effects);
+
+    for (const auto &effect : effects) {
+      // Only Write and Free effects are analyzed; other effects are ignored.
+      if (mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect()) ||
+          mlir::isa<mlir::MemoryEffects::Free>(effect.getEffect())) {
+
+        mlir::Value accessedValue = effect.getValue();
+        // If the effect is not attached to a specific value (e.g. external
+        // call), assume a conflict.
+        if (!accessedValue)
+          return true;
+
+        // Anything other than a definite "no alias" answer is a conflict.
+        for (mlir::Value dep : deps) {
+          if (!aa.alias(accessedValue, dep).isNo())
+            return true;
+        }
+      }
+    }
+  } else if (op->getNumRegions() == 0) {
+    // Conservative Fallback: If an operation lacks the interface and has no
+    // regions (e.g. a fir.call to an external function), assume it can
+    // potentially modify any memory.
+    return true;
+  }
+
+  // Recurse into structured control flow regions (e.g. fir.if, fir.do_loop)
+  // to find nested conflicting writes.
+  for (mlir::Region &region : op->getRegions()) {
+    for (mlir::Block &block : region) {
+      for (mlir::Operation &nestedOp : block) {
+        if (isConflictingWrite(&nestedOp, deps, aa))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+/// Checks dominance and scans for writes aliasing what `producer` reads.
+bool isSafeToInline(hlfir::ElementalOp producer, hlfir::ApplyOp applySite,
+                    mlir::AliasAnalysis &aa) {
+  mlir::DominanceInfo domInfo(producer->getParentOp());
+  if (!domInfo.properlyDominates(producer.getOperation(),
+                                 applySite.getOperation()))
+    return false;
+
+  llvm::SmallVector<mlir::Value> deps;
+  getReadDependencies(producer, deps);
+  // NOTE(review): `func` is dereferenced unchecked; null outside a func.func.
+  mlir::Operation *func = producer->getParentOfType<mlir::func::FuncOp>();
+  bool conflict = false;
+
+  func->walk([&](mlir::Operation *op) {
+    // Skip the producer and applySite themselves.
+    if (op == producer.getOperation() || op == applySite.getOperation())
+      return mlir::WalkResult::advance();
+
+    // Skip ancestors of the applySite (its enclosing ops, e.g. loops); their
+    // nested ops are still visited individually by this walk, so only ops
+    // positioned before the apply end up being checked.
+    if (op->isAncestor(applySite.getOperation()))
+      return mlir::WalkResult::advance();
+    // Check only ops properly dominated by `producer` that dominate the apply.
+    // NOTE(review): a write after the apply in the same loop is not seen.
+    if (domInfo.properlyDominates(producer.getOperation(), op) &&
+        domInfo.dominates(op, applySite.getOperation())) {
+      if (isConflictingWrite(op, deps, aa)) {
+        conflict = true;
+        return mlir::WalkResult::interrupt();
+      }
+    }
+    return mlir::WalkResult::advance();
+  });
+
+  return !conflict;
+}
+
 /// If the elemental has only two uses and those two are an apply operation and
 /// a destroy operation, return those two, otherwise return {}
 static std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>>
-getTwoUses(hlfir::ElementalOp elemental) {
+getTwoUses(hlfir::ElementalOp elemental, mlir::AliasAnalysis &aliasAnalysis) {
   // If the ElementalOp must produce a temporary (e.g. for
   // finalization purposes), then we cannot inline it.
   if (hlfir::elementalOpMustProduceTemp(elemental))
@@ -44,13 +163,77 @@ getTwoUses(hlfir::ElementalOp elemental) {
   hlfir::DestroyOp destroy;
   unsigned applyCount = 0;
 
-  for (mlir::Operation *user : elemental->getUsers()) {
-    mlir::TypeSwitch<mlir::Operation *, void>(user)
-        .Case([&](hlfir::ApplyOp op) {
-          apply = op;
-          applyCount++;
-        })
-        .Case([&](hlfir::DestroyOp op) { destroy = op; });
+  llvm::SmallVector<mlir::Value> worklist;
+  worklist.push_back(elemental.getResult());
+  llvm::SmallPtrSet<mlir::Value, 16> visited;
+
+  while (!worklist.empty()) {
+    mlir::Value current = worklist.pop_back_val();
+    if (!current || !visited.insert(current).second)
+      continue;
+
+    for (mlir::OpOperand &use : current.getUses()) {
+      mlir::Operation *user = use.getOwner();
+
+      mlir::TypeSwitch<mlir::Operation *, void>(user)
+          .Case<hlfir::ApplyOp>([&](hlfir::ApplyOp op) {
+            apply = op;
+            applyCount++;
+          })
+          .Case<hlfir::DestroyOp>([&](hlfir::DestroyOp op) {
+            // Track the mandatory destroy operation for the elemental expr.
+            destroy = op;
+          })
+          .Case<hlfir::DeclareOp>([&](hlfir::DeclareOp op) {
+            // Follow the dataflow through variable declarations.
+            worklist.push_back(op.getBase());
+          })
+          .Case<fir::ConvertOp>([&](fir::ConvertOp op) {
+            // Follow the dataflow through type conversions.
+            worklist.push_back(op.getResult());
+          })
+          .Case<mlir::BranchOpInterface>([&](mlir::BranchOpInterface branch) {
+            for (unsigned i = 0; i < branch->getNumSuccessors(); ++i) {
+              mlir::SuccessorOperands operands = branch.getSuccessorOperands(i);
+              for (unsigned j = 0; j < operands.size(); ++j) {
+                if (operands[j] == current) {
+                  // The j-th operand of the branch maps to the j-th block
+                  // argument of the successor block.
+                  mlir::Block *successor = branch->getSuccessor(i);
+                  worklist.push_back(successor->getArgument(j));
+                }
+              }
+            }
+          })
+          .Case<fir::ResultOp>([&](fir::ResultOp op) {
+            mlir::Operation *parent = op->getParentOp();
+            if (parent) {
+              for (auto it : llvm::enumerate(op.getOperands())) {
+                if (it.value() == current) {
+                  // 'current' is being yielded. The value outside the region
+                  // is the i-th result of the parent operation.
+                  unsigned i = it.index();
+                  if (i < parent->getNumResults()) {
+                    worklist.push_back(parent->getResult(i));
+                  }
+                }
+              }
+            }
+          })
+          .Default([&](mlir::Operation *op) {
+            // If the elemental result is used by an operation with regions
+            // (like fir.if or fir.do_loop), the apply site may be nested
+            // inside.
+            if (op->getNumRegions() > 0) {
+              op->walk([&](hlfir::ApplyOp nestedApply) {
+                if (nestedApply.getExpr() == current) {
+                  apply = nestedApply;
+                  applyCount++;
+                }
+              });
+            }
+          });
+    }
   }
 
   // Only inline if there is a unique 'apply' site. Other users (such as
@@ -59,6 +242,10 @@ getTwoUses(hlfir::ElementalOp elemental) {
   if (applyCount != 1 || !destroy)
     return std::nullopt;
 
+  // Verify memory effect and dataflow analysis.
+  if (!isSafeToInline(elemental, apply, aliasAnalysis))
+    return std::nullopt;
+
   // we can't inline if the return type of the yield doesn't match the return
   // type of the apply
   auto yield = mlir::dyn_cast_or_null<hlfir::YieldElementOp>(
@@ -75,12 +262,14 @@ class InlineElementalConversion
     : public mlir::OpRewritePattern<hlfir::ElementalOp> {
 public:
   using mlir::OpRewritePattern<hlfir::ElementalOp>::OpRewritePattern;
-
+  explicit InlineElementalConversion(mlir::MLIRContext *context,
+                                     mlir::AliasAnalysis &aa)
+      : OpRewritePattern<hlfir::ElementalOp>(context), aliasAnalysis(aa) {}
   llvm::LogicalResult
   matchAndRewrite(hlfir::ElementalOp elemental,
                   mlir::PatternRewriter &rewriter) const override {
     std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>> maybeTuple =
-        getTwoUses(elemental);
+        getTwoUses(elemental, aliasAnalysis);
     if (!maybeTuple)
       return rewriter.notifyMatchFailure(
           elemental, "hlfir.elemental is not a candidate for inlining");
@@ -113,6 +302,9 @@ class InlineElementalConversion
 
     return mlir::success();
   }
+
+private:
+  mlir::AliasAnalysis &aliasAnalysis;
 };
 
 class InlineElementalsPass
@@ -121,13 +313,15 @@ class InlineElementalsPass
   void runOnOperation() override {
     mlir::MLIRContext *context = &getContext();
 
+    // Get AliasAnalysis from the pass manager.
+    mlir::AliasAnalysis &aliasAnalysis = getAnalysis<mlir::AliasAnalysis>();
     mlir::GreedyRewriteConfig config;
     // Prevent the pattern driver from merging blocks.
     config.setRegionSimplificationLevel(
         mlir::GreedySimplifyRegionLevel::Disabled);
 
     mlir::RewritePatternSet patterns(context);
-    patterns.insert<InlineElementalConversion>(context);
+    patterns.insert<InlineElementalConversion>(context, aliasAnalysis);
 
     if (mlir::failed(mlir::applyPatternsGreedily(
             getOperation(), std::move(patterns), config))) {
diff --git a/flang/test/HLFIR/inline-elemental-multi-users.fir b/flang/test/HLFIR/inline-elemental-multi-users.fir
index f8b195e637091..32c9305320bdb 100644
--- a/flang/test/HLFIR/inline-elemental-multi-users.fir
+++ b/flang/test/HLFIR/inline-elemental-multi-users.fir
@@ -1,9 +1,92 @@
 // RUN: fir-opt --inline-elementals %s | FileCheck %s
 
-// Test inlining of hlfir.elemental into its hlfir.apply site when the 
+// Test inlining of hlfir.elemental into its hlfir.apply site when the
 // elemental has more than two users.
 
-// Check successful inlining where hlfir.elemental survives because the
+// Test successful inlining into an apply nested in loops with other users.
+func.func @test_safe_loop_inlining(%arg0: !fir.ref<!fir.array<10xf32>>, %i: index) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  // Elemental producer: reads %arg0 element-wise.
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%idx: index):
+      %addr = fir.coordinate_of %arg0, %idx : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+      %val = fir.load %addr : !fir.ref<f32>
+      hlfir.yield_element %val : f32
+  }
+
+  // User 1 - hlfir.apply inside nested loops (The target for inlining).
+  fir.do_loop %arg1 = %c1 to %c10 step %c1 {
+    fir.do_loop %arg2 = %c1 to %c10 step %c1 {
+      %res = hlfir.apply %elem, %arg2 : (!hlfir.expr<10xf32>, index) -> f32
+      %dummy = fir.alloca f32
+      fir.store %res to %dummy : !fir.ref<f32>
+    }
+  }
+
+  // User 2 - Associate (Simulating a shared mask use-case).
+  %temp:3 = hlfir.associate %elem(%shape) : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1)
+
+  // User 3 - Destroy
+  hlfir.destroy %elem : !hlfir.expr<10xf32>
+
+  hlfir.end_associate %temp#0, %temp#2 : !fir.ref<!fir.array<10xf32>>, i1
+  return
+}
+// CHECK-LABEL: func.func @test_safe_loop_inlining
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.do_loop
+// CHECK:   fir.do_loop %[[INNER_IDX:.*]] =
+// CHECK:     %[[ADDR:.*]] = fir.coordinate_of %arg0, %[[INNER_IDX]]
+// CHECK:     %[[VAL:.*]] = fir.load %[[ADDR]]
+// CHECK-NOT: hlfir.apply
+// CHECK: hlfir.associate %[[ELEM]]
+// CHECK-NOT: hlfir.destroy %[[ELEM]]
+
+// Test blocking of incorrect inlining because of alias conflict.
+func.func @test_unsafe_loop_alias_conflict(%arg0: !fir.ref<!fir.array<10xf32>>, %new_val: f32) {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+
+  // Elemental depends on the values in %arg0 (Producer).
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%idx: index):
+      %addr = fir.coordinate_of %arg0, %idx : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+      %val = fir.load %addr : !fir.ref<f32>
+      hlfir.yield_element %val : f32
+  }
+
+  fir.do_loop %arg1 = %c1 to %c10 step %c1 {
+    // We modify the array that the elemental needs to read from.
+    // Inlining the elemental here would see the new value.
+    %write_addr = fir.coordinate_of %arg0, %arg1 : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+    fir.store %new_val to %write_addr : !fir.ref<f32>
+
+    // Target for inlining.
+    %res = hlfir.apply %elem, %arg1 : (!hlfir.expr<10xf32>, index) -> f32
+
+    %dummy = fir.alloca f32
+    fir.store %res to %dummy : !fir.ref<f32>
+  }
+
+  hlfir.destroy %elem : !hlfir.expr<10xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_unsafe_loop_alias_conflict
+// Elemental should not be inlined, check presence of elemental and apply.
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.do_loop
+// CHECK:   fir.store
+// CHECK:   %[[APPLIED:.*]] = hlfir.apply %[[ELEM]]
+// CHECK:   fir.store %[[APPLIED]]
+// Inlined code (coordinate_of/load) should not appear inside the loop.
+// CHECK-NOT: fir.coordinate_of %arg0, %arg1
+
+
+// Check successful inlining where 2-d hlfir.elemental survives because the
 // 'associate' op is still using it.
 func.func @test_inlining_use_mask(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
   %c1 = arith.constant 1 : index
@@ -98,7 +181,7 @@ func.func @test_multi_apply_no_inlining(%arg0: !hlfir.expr<?xi32>, %target: i32,
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
 
-  // Producer (Elemental)
+  // Producer (Elemental).
   %mask = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
     ^bb0(%i: index):
       %val = hlfir.apply %arg0, %i : (!hlfir.expr<?xi32>, index) -> i32
@@ -127,16 +210,14 @@ func.func @test_multi_apply_no_inlining(%arg0: !hlfir.expr<?xi32>, %target: i32,
 // CHECK:       %[[A2:.*]] = hlfir.apply %[[MASK]], %{{.*}}
 // CHECK:       hlfir.destroy %[[MASK]]
 
-// Check inlining one elemental into another.
-// a = b * c + d
-// Declare a global symbol to store the intermediate mask.
+// Check that storing the elemental to a global blocks inlining.
 fir.global @mask_storage : !hlfir.expr<10x10x!fir.logical<4>>
 func.func @test_nested_elemental(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) {
   %c1 = arith.constant 1 : index
   %c10 = arith.constant 10 : index
   %shape = fir.shape %c10, %c10 : (index, index) -> !fir.shape<2>
 
-  // The Elemental Mask (b * c)
+  // The Elemental Mask (b * c).
   %mask = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<10x10x!fir.logical<4>> {
   ^bb0(%i: index, %j: index):
     %val = hlfir.designate %arg0 (%i, %j) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
@@ -165,50 +246,56 @@ func.func @test_nested_elemental(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !f
   return
 }
 // CHECK-LABEL: func.func @test_nested_elemental
-// CHECK:         hlfir.elemental
-// CHECK-NOT:       hlfir.apply
-// CHECK:           arith.cmpi eq
-// CHECK-NOT:         hlfir.destroy
+// CHECK: %[[MASK:.*]] = hlfir.elemental
+// CHECK: %[[PTR:.*]] = fir.address_of(@mask_storage)
+// CHECK: fir.store %[[MASK]] to %[[PTR]]
+// Apply site not inlined.
+// CHECK: fir.do_loop
+// CHECK:   fir.do_loop
+// CHECK:     %[[VAL:.*]] = hlfir.apply %[[MASK]]
+// CHECK:     fir.store %[[VAL]]
+// The designate/load should only be inside the elemental.
+// CHECK-NOT: hlfir.designate
 
-// Inlining into a single hlfir.apply.
+// Inlining into a single hlfir.apply (relaxed inlining).
 // a = (b * c)[1]
-func.func @test_scalar_apply_inline(%b: !fir.box<!fir.array<?xf32>>, %c: !fir.box<!fir.array<?xf32>>) {
-  %c1 = arith.constant 1 : index
+func.func @test_scalar_apply_inlining_safe(%b: !fir.ref<!fir.array<10xf32>>, %c1: index) {
   %c10 = arith.constant 10 : index
   %shape = fir.shape %c10 : (index) -> !fir.shape<1>
 
-  // Elemental (b * c)
+  // Producer (1D Elemental).
   %prod = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
   ^bb0(%i: index):
-    %b_ref = hlfir.designate %b (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-    %b_val = fir.load %b_ref : !fir.ref<f32>
-    %c_ref = hlfir.designate %c (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-    %c_val = fir.load %c_ref : !fir.ref<f32>
-    %mul = arith.mulf %b_val, %c_val : f32
-    hlfir.yield_element %mul : f32
+    %b_addr = fir.coordinate_of %b, %i : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+    %b_val = fir.load %b_addr : !fir.ref<f32>
+    %res = arith.addf %b_val, %b_val : f32
+    hlfir.yield_element %res : f32
   }
 
-  // Total users = 3 (store, apply, destroy).
-  %ptr = fir.address_of(@scalar_storage) : !fir.ref<!hlfir.expr<10xf32>>
-  fir.store %prod to %ptr : !fir.ref<!hlfir.expr<10xf32>>
-
-  // Scalar apply site - a = (b * c)(1)
-  // CHECK-NOT: hlfir.apply
+  // Scalar Apply (Target for inlining).
   %scalar_val = hlfir.apply %prod, %c1 : (!hlfir.expr<10xf32>, index) -> f32
-  
-  // Use the scalar result.
-  %dummy_ref = fir.alloca f32
-  fir.store %scalar_val to %dummy_ref : !fir.ref<f32>
+
+  %temp:3 = hlfir.associate %prod(%shape) : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1)
 
   hlfir.destroy %prod : !hlfir.expr<10xf32>
+
+  // Use the result.
+  %dummy = fir.alloca f32
+  fir.store %scalar_val to %dummy : !fir.ref<f32>
+
+  hlfir.end_associate %temp#0, %temp#2 : !fir.ref<!fir.array<10xf32>>, i1
   return
 }
-// CHECK-LABEL: func.func @test_scalar_apply_inline
-// CHECK:         %[[MASK:.*]] = hlfir.elemental
-// CHECK:         fir.store %[[MASK]] to {{.*}}
-// CHECK-NOT:     hlfir.apply
-// CHECK:         arith.mulf
-// CHECK-NOT:     hlfir.destroy
+
+// CHECK-LABEL: func.func @test_scalar_apply_inlining_safe
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: hlfir.yield_element
+// CHECK: %[[ADDR:.*]] = fir.coordinate_of %arg0, %arg1
+// CHECK: %[[VAL:.*]] = fir.load %[[ADDR]]
+// CHECK: arith.addf %[[VAL]], %[[VAL]]
+// CHECK-NOT: hlfir.apply
+// CHECK: hlfir.associate %[[ELEM]]
+// CHECK-NOT: hlfir.destroy %[[ELEM]]
 
 // Check long chains of elementals.
 // subroutine reproducer(a)
@@ -222,7 +309,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
   %1:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
   %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
 
-  // tmp1 = a - 1
+  // tmp1 = a - 1.
   %tmp1 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
   ^bb0(%i: index):
     %a_ref = hlfir.designate %0#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
@@ -233,7 +320,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
 
   %dummy = hlfir.no_reassoc %tmp1 : !hlfir.expr<?xf32>
 
-  // tmp2 = a * tmp1
+  // tmp2 = a * tmp1.
   %tmp2 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
   ^bb0(%j: index):
     %a_ref_2 = hlfir.designate %0#0 (%j) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
@@ -243,7 +330,7 @@ func.func @_QPreproducer(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
     hlfir.yield_element %mul : f32
   }
 
-  // tmp3 = sqrt(tmp2)
+  // tmp3 = sqrt(tmp2).
   %tmp3 = hlfir.elemental %2 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
   ^bb0(%k: index):
     %t2_val = hlfir.apply %tmp2, %k : (!hlfir.expr<?xf32>, index) -> f32
@@ -340,3 +427,151 @@ func.func @test_noinline_due_to_finalization(%arg0: !fir.box<!fir.array<?x!fir.t
 // CHECK:         fir.call @persistent_user(%[[EL]])
 // CHECK:         %[[APPLY:.*]] = hlfir.apply %[[EL]], %{{.*}}
 // CHECK:         hlfir.destroy %[[EL]] finalize
+
+// Test conflicting writes hidden within nested regions (like fir.if) between
+// producer and the apply site.
+func.func @test_nested_region_conflict(%arg0: !fir.ref<f32>, %cond: i1) {
+  %c1 = arith.constant 1 : index
+  %shape = fir.shape %c1 : (index)  -> !fir.shape<1>
+
+  // Producer.
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+    ^bb0(%i: index):
+      %val = fir.load %arg0 : !fir.ref<f32>
+      hlfir.yield_element %val : f32
+  }
+
+  // Nested region (fir.if) containing a store.
+  // This tests that the conflict check correctly sees into the 'then' block.
+  fir.if %cond {
+    %new_val = arith.constant 3.0 : f32
+    fir.store %new_val to %arg0 : !fir.ref<f32>
+  } else {
+    // Distractor op.
+    fir.no_reassoc %cond : i1
+  }
+
+  // Apply Site.
+  %res = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+    ^bb0(%j: index):
+      %val = hlfir.apply %elem, %j : (!hlfir.expr<1xf32>, index) -> f32
+      hlfir.yield_element %val : f32
+  }
+
+  hlfir.destroy %elem : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_nested_region_conflict
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.if %{{.*}} {
+// CHECK:   fir.store
+// CHECK: }
+// CHECK: hlfir.elemental
+// CHECK:   hlfir.apply %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]
+
+// Checks tracking the elemental result through block arguments to find the
+// hlfir.apply site across a branch. The fir.store in the intervening block
+// (^bb1) blocks the inlining, which the relaxed inlining patch used to allow.
+func.func @test_cross_block_conflict(%arg0: !fir.ref<f32>, %shape: !fir.shape<1>) {
+  // Producer in Entry Block.
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %val = fir.load %arg0 : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  // Pass value to block argument to maintain dataflow for the worklist.
+  cf.br ^bb1(%elemental : !hlfir.expr<1xf32>)
+
+^bb1(%block_arg: !hlfir.expr<1xf32>):
+  // Conflicting Write.
+  // This store sits between the producer and the apply site, across blocks.
+  %new_val = arith.constant 2.0 : f32
+  fir.store %new_val to %arg0 : !fir.ref<f32>
+
+  // Apply Site.
+  %res = hlfir.elemental %shape : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%j: index):
+    %val = hlfir.apply %block_arg, %j : (!hlfir.expr<1xf32>, index) -> f32
+    hlfir.yield_element %val : f32
+  }
+
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_cross_block_conflict
+// CHECK: %[[ELM:.*]] = hlfir.elemental
+// CHECK: cf.br ^bb1(%[[ELM]] : !hlfir.expr<1xf32>)
+// CHECK: ^bb1(%[[BARG:.*]]: !hlfir.expr<1xf32>):
+// CHECK: fir.store {{.*}} to %arg0
+// CHECK: hlfir.elemental
+// CHECK: hlfir.apply %[[BARG]]
+// CHECK: hlfir.destroy %[[ELM]]
+
+// External impure procedure that might modify %arg0.
+func.func private @impure_side_effect()
+
+func.func @test_impure_call_conflict(%arg0: !fir.ref<f32>, %shape: !fir.shape<1>) {
+  // Reads from %arg0.
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%i: index):
+    %val = fir.load %arg0 : !fir.ref<f32>
+    hlfir.yield_element %val : f32
+  }
+
+  // Acts as a memory barrier.
+  fir.call @impure_side_effect() : () -> ()
+
+  // Apply Site.
+  %res = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<1xf32> {
+  ^bb0(%j: index):
+    %val = hlfir.apply %elemental, %j : (!hlfir.expr<1xf32>, index) -> f32
+    hlfir.yield_element %val : f32
+  }
+
+  hlfir.destroy %elemental : !hlfir.expr<1xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_impure_call_conflict
+// CHECK: %[[ELEM:.*]] = hlfir.elemental
+// CHECK: fir.call @impure_side_effect()
+// CHECK: hlfir.apply %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]
+
+// Check that a conflicting write to the same memory buffer read by the
+// elemental producer blocks inlining into the consuming elemental.
+func.func @test_memory_dependency_with_designate(%arg0: !fir.ref<!fir.array<10xf32>>, %shape: !fir.shape<1>) {
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %val = arith.constant 2.0 : f32
+
+  // Reads from the whole array %arg0.
+  %elem = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%i: index):
+      %addr = fir.coordinate_of %arg0, %i : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+      %load = fir.load %addr : !fir.ref<f32>
+      hlfir.yield_element %load : f32
+  }
+
+  // Modifies one element of the same array, partial write to the same base
+  // buffer.
+  %specific_addr = hlfir.designate %arg0 (%c1) : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
+  fir.store %val to %specific_addr : !fir.ref<f32>
+
+  // Apply Site.
+  %res = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+    ^bb0(%j: index):
+      %applied = hlfir.apply %elem, %j : (!hlfir.expr<10xf32>, index) -> f32
+      hlfir.yield_element %applied : f32
+  }
+
+  hlfir.destroy %elem : !hlfir.expr<10xf32>
+  return
+}
+// CHECK-LABEL: func.func @test_memory_dependency_with_designate
+// CHECK: %[[ELEM:.*]] = hlfir.elemental {{.*}} unordered
+// CHECK: fir.store {{.*}} to %{{.*}}
+// CHECK: hlfir.elemental
+// CHECK:   hlfir.apply %[[ELEM]]
+// CHECK: hlfir.destroy %[[ELEM]]



More information about the flang-commits mailing list