[flang-commits] [flang] [flang][hlfir] Resolve shape_of users when bufferizing eval_in_mem (PR #201214)

Wed Jun 3 02:18:51 PDT 2026

https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/201214

>From 39e966b014fd36f21185856ed5dcd1eeede711b2 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 2 Jun 2026 15:01:43 -0700
Subject: [PATCH 1/3] [flang][hlfir] Reuse eval_in_mem shape in
 SeparateAllocatableAssign

When the RHS of an allocatable assignment is an hlfir.eval_in_mem, reuse
its shape operand instead of emitting hlfir.shape_of. A shape_of adds an
extra use of the eval_in_mem result, which prevents
EvaluateIntoMemoryAssignBufferization from evaluating the expression in
place and triggers a use-after-erase assertion in OptimizedBufferization.
---
 .../Transforms/SeparateAllocatableAssign.cpp  |  9 +++++-
 .../HLFIR/separate-allocatable-assign.fir     | 30 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
index 0160ff7d75f76..69ca483642df0 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
@@ -111,7 +111,14 @@ class SeparateAllocatableAssignConversion
     LLVM_DEBUG(llvm::dbgs() << "SeparateAllocatableAssign: splitting realloc "
                                "from assign\n");
 
-    mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+    // Reuse the evaluate_in_memory shape operand instead of emitting a
+    // shape_of, which would add an extra use and block in-place bufferization.
+    mlir::Value rhsShape;
+    if (auto evalInMem =
+            assign.getRhs().getDefiningOp<hlfir::EvaluateInMemoryOp>())
+      rhsShape = evalInMem.getShape();
+    if (!rhsShape)
+      rhsShape = hlfir::genShape(loc, builder, rhs);
     llvm::SmallVector<mlir::Value> rhsExtents =
         hlfir::getIndexExtents(loc, builder, rhsShape);
 
diff --git a/flang/test/HLFIR/separate-allocatable-assign.fir b/flang/test/HLFIR/separate-allocatable-assign.fir
index 97c664c38a94f..406de58724c71 100644
--- a/flang/test/HLFIR/separate-allocatable-assign.fir
+++ b/flang/test/HLFIR/separate-allocatable-assign.fir
@@ -179,3 +179,33 @@ func.func @test_lower_bounds(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32
 // Lower bound 10 should appear in the embox/store of the new allocation.
 // CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !fir.box<!fir.array<3xi32>>, !fir.box<!fir.heap<!fir.array<?xi32>>>
+
+// Test: allocatable = hlfir.eval_in_mem (e.g. b = matmul(...)). The pass must
+// reuse the eval_in_mem shape operand and must NOT emit an hlfir.shape_of of
+// the eval_in_mem result: an extra use would prevent
+// EvaluateIntoMemoryAssignBufferization from rewriting the evaluation in place.
+func.func private @sink(!fir.ref<!fir.array<?xf32>>)
+func.func @test_eval_in_mem_rhs(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %n: index) {
+  %shape = fir.shape %n : (index) -> !fir.shape<1>
+
+  %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+
+  %expr = hlfir.eval_in_mem shape %shape : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%mem: !fir.ref<!fir.array<?xf32>>):
+    fir.call @sink(%mem) : (!fir.ref<!fir.array<?xf32>>) -> ()
+  }
+
+  hlfir.assign %expr to %a#0 realloc : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+  hlfir.destroy %expr : !hlfir.expr<?xf32>
+  return
+}
+
+// CHECK-LABEL: func.func @test_eval_in_mem_rhs
+// The eval_in_mem and its shape operand are reused; no shape_of is emitted.
+// CHECK: hlfir.eval_in_mem
+// CHECK-NOT: hlfir.shape_of
+// CHECK-NOT: hlfir.assign{{.*}}realloc
+// CHECK: fir.if
+// CHECK: fir.allocmem
+// CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !hlfir.expr<?xf32>, !fir.box<!fir.heap<!fir.array<?xf32>>>

>From fc7fa65ee56db838440431f6d29659254a40ff19 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 2 Jun 2026 17:31:40 -0700
Subject: [PATCH 2/3] [flang][hlfir] Resolve shape_of users when bufferizing
 eval_in_mem

The RHS of an allocatable assignment is still a transformational intrinsic
(e.g. hlfir.matmul) when SeparateAllocatableAssign runs, so genShape emits
an hlfir.shape_of to size the reallocation. After the intrinsic is lowered
to hlfir.eval_in_mem, that shape_of is an extra user that
EvaluateIntoMemoryAssignBufferization did not expect, causing a
use-after-erase assertion at -O2.

Handle it generally in OptimizedBufferization: a shape_of user of an
eval_in_mem only needs the shape, which is already an operand, so redirect
it to that operand before erasing the eval_in_mem. This keeps the in-place
evaluation for any transformational intrinsic assigned to an allocatable.

This supersedes the earlier eval_in_mem shape-operand reuse in
SeparateAllocatableAssign, which is reverted here because the RHS has not
yet been lowered to an eval_in_mem when that pass runs.
---
 .../Transforms/OptimizedBufferization.cpp     | 24 +++++++++++----
 .../Transforms/SeparateAllocatableAssign.cpp  |  9 +-----
 .../HLFIR/opt-bufferization-eval_in_mem.fir   | 30 +++++++++++++++++++
 .../HLFIR/separate-allocatable-assign.fir     | 30 -------------------
 4 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 51af673406b4a..34a0ade751dfa 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -638,16 +638,26 @@ tryUsingAssignLhsDirectly(hlfir::EvaluateInMemoryOp evalInMem,
   mlir::Location loc = evalInMem.getLoc();
   hlfir::DestroyOp destroy;
   hlfir::AssignOp assign;
-  for (auto user : llvm::enumerate(evalInMem->getUsers())) {
-    if (user.index() > 2)
+  // A hlfir.shape_of of the result only needs the shape, which the
+  // eval_in_mem already carries as an operand, so it can be redirected to that
+  // operand and does not prevent the in-place rewrite below. Any other user
+  // would dangle when the eval_in_mem is erased, so bail out on it.
+  llvm::SmallVector<hlfir::ShapeOfOp> shapeOfs;
+  for (mlir::Operation *user : evalInMem->getUsers()) {
+    if (auto op = mlir::dyn_cast<hlfir::AssignOp>(user))
+      assign = op;
+    else if (auto op = mlir::dyn_cast<hlfir::DestroyOp>(user))
+      destroy = op;
+    else if (auto op = mlir::dyn_cast<hlfir::ShapeOfOp>(user))
+      shapeOfs.push_back(op);
+    else
       return mlir::failure();
-    mlir::TypeSwitch<mlir::Operation *, void>(user.value())
-        .Case([&](hlfir::AssignOp op) { assign = op; })
-        .Case([&](hlfir::DestroyOp op) { destroy = op; });
   }
   if (!assign || !destroy || destroy.mustFinalizeExpr() ||
       assign.isAllocatableAssignment())
     return mlir::failure();
+  if (!shapeOfs.empty() && !evalInMem.getShape())
+    return mlir::failure();
 
   hlfir::Entity lhs{assign.getLhs()};
   // EvaluateInMemoryOp memory is contiguous, so in general, it can only be
@@ -690,6 +700,10 @@ tryUsingAssignLhsDirectly(hlfir::EvaluateInMemoryOp evalInMem,
   fir::FirOpBuilder builder(rewriter, evalInMem.getOperation());
   mlir::Value rawLhs = hlfir::genVariableRawAddress(loc, builder, lhs);
   hlfir::computeEvaluateOpIn(loc, builder, evalInMem, rawLhs);
+  // Redirect shape_of users to the shape operand so the eval_in_mem can be
+  // erased without leaving dangling uses.
+  for (hlfir::ShapeOfOp shapeOf : shapeOfs)
+    rewriter.replaceOp(shapeOf, evalInMem.getShape());
   rewriter.eraseOp(assign);
   rewriter.eraseOp(destroy);
   rewriter.eraseOp(evalInMem);
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
index 69ca483642df0..0160ff7d75f76 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SeparateAllocatableAssign.cpp
@@ -111,14 +111,7 @@ class SeparateAllocatableAssignConversion
     LLVM_DEBUG(llvm::dbgs() << "SeparateAllocatableAssign: splitting realloc "
                                "from assign\n");
 
-    // Reuse the evaluate_in_memory shape operand instead of emitting a
-    // shape_of, which would add an extra use and block in-place bufferization.
-    mlir::Value rhsShape;
-    if (auto evalInMem =
-            assign.getRhs().getDefiningOp<hlfir::EvaluateInMemoryOp>())
-      rhsShape = evalInMem.getShape();
-    if (!rhsShape)
-      rhsShape = hlfir::genShape(loc, builder, rhs);
+    mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
     llvm::SmallVector<mlir::Value> rhsExtents =
         hlfir::getIndexExtents(loc, builder, rhsShape);
 
diff --git a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir
index fdfb2ce6979dc..9ef8115284514 100644
--- a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir
+++ b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir
@@ -60,3 +60,33 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref<!fir.array<10xf32>> {fir.b
 // CHECK:         hlfir.destroy %[[VAL_10]] : !hlfir.expr<10xf32>
 // CHECK:         return
 // CHECK:       }
+
+// An hlfir.shape_of user of the eval_in_mem (such as the one left behind by
+// SeparateAllocatableAssign when sizing a reallocation) must not block the
+// in-place rewrite: it is redirected to the eval_in_mem shape operand.
+func.func @_QPtest_shape_of_user(%arg0: !fir.ref<!fir.array<10xf32>> {fir.bindc_name = "x"}, %arg1: !fir.ref<index>) {
+  %c10 = arith.constant 10 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+  %2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {uniq_name = "_QFtestEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+  %3 = hlfir.eval_in_mem shape %1 : (!fir.shape<1>) -> !hlfir.expr<10xf32> {
+  ^bb0(%arg2: !fir.ref<!fir.array<10xf32>>):
+    %4 = fir.call @_QPfoo() fastmath<contract> : () -> !fir.array<10xf32>
+    fir.save_result %4 to %arg2(%1) : !fir.array<10xf32>, !fir.ref<!fir.array<10xf32>>, !fir.shape<1>
+  }
+  %shp = hlfir.shape_of %3 : (!hlfir.expr<10xf32>) -> !fir.shape<1>
+  %ext = hlfir.get_extent %shp {dim = 0 : index} : (!fir.shape<1>) -> index
+  hlfir.assign %3 to %2#0 : !hlfir.expr<10xf32>, !fir.ref<!fir.array<10xf32>>
+  hlfir.destroy %3 : !hlfir.expr<10xf32>
+  fir.store %ext to %arg1 : !fir.ref<index>
+  return
+}
+// CHECK-LABEL: func.func @_QPtest_shape_of_user(
+// CHECK:         %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+// The shape_of is redirected to the eval_in_mem shape operand...
+// CHECK:         hlfir.get_extent %[[SHAPE]]
+// ...and the result is still evaluated directly into the LHS (no temporary).
+// CHECK:         %[[CALL:.*]] = fir.call @_QPfoo() fastmath<contract> : () -> !fir.array<10xf32>
+// CHECK:         fir.save_result %[[CALL]] to %{{.*}}#0(%[[SHAPE]])
+// CHECK-NOT:     hlfir.eval_in_mem
+// CHECK-NOT:     hlfir.shape_of
diff --git a/flang/test/HLFIR/separate-allocatable-assign.fir b/flang/test/HLFIR/separate-allocatable-assign.fir
index 406de58724c71..97c664c38a94f 100644
--- a/flang/test/HLFIR/separate-allocatable-assign.fir
+++ b/flang/test/HLFIR/separate-allocatable-assign.fir
@@ -179,33 +179,3 @@ func.func @test_lower_bounds(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32
 // Lower bound 10 should appear in the embox/store of the new allocation.
 // CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !fir.box<!fir.array<3xi32>>, !fir.box<!fir.heap<!fir.array<?xi32>>>
-
-// Test: allocatable = hlfir.eval_in_mem (e.g. b = matmul(...)). The pass must
-// reuse the eval_in_mem shape operand and must NOT emit an hlfir.shape_of of
-// the eval_in_mem result: an extra use would prevent
-// EvaluateIntoMemoryAssignBufferization from rewriting the evaluation in place.
-func.func private @sink(!fir.ref<!fir.array<?xf32>>)
-func.func @test_eval_in_mem_rhs(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %n: index) {
-  %shape = fir.shape %n : (index) -> !fir.shape<1>
-
-  %a:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-
-  %expr = hlfir.eval_in_mem shape %shape : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-  ^bb0(%mem: !fir.ref<!fir.array<?xf32>>):
-    fir.call @sink(%mem) : (!fir.ref<!fir.array<?xf32>>) -> ()
-  }
-
-  hlfir.assign %expr to %a#0 realloc : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-  hlfir.destroy %expr : !hlfir.expr<?xf32>
-  return
-}
-
-// CHECK-LABEL: func.func @test_eval_in_mem_rhs
-// The eval_in_mem and its shape operand are reused; no shape_of is emitted.
-// CHECK: hlfir.eval_in_mem
-// CHECK-NOT: hlfir.shape_of
-// CHECK-NOT: hlfir.assign{{.*}}realloc
-// CHECK: fir.if
-// CHECK: fir.allocmem
-// CHECK: %[[BOX:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-// CHECK: hlfir.assign %{{.*}} to %[[BOX]] : !hlfir.expr<?xf32>, !fir.box<!fir.heap<!fir.array<?xf32>>>

>From c43e21ce004601e390dc53b1242d8643c94df5f2 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 3 Jun 2026 02:17:43 -0700
Subject: [PATCH 3/3] [flang][hlfir] Address review comments

Rephrase the comment in tryUsingAssignLhsDirectly and bail out if the
eval_in_mem result has more than one hlfir.assign or hlfir.destroy user,
since the in-place rewrite only handles a single assign/destroy.
---
 .../Transforms/OptimizedBufferization.cpp     | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 34a0ade751dfa..d717b39479380 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -638,20 +638,24 @@ tryUsingAssignLhsDirectly(hlfir::EvaluateInMemoryOp evalInMem,
   mlir::Location loc = evalInMem.getLoc();
   hlfir::DestroyOp destroy;
   hlfir::AssignOp assign;
-  // A hlfir.shape_of of the result only needs the shape, which the
-  // eval_in_mem already carries as an operand, so it can be redirected to that
-  // operand and does not prevent the in-place rewrite below. Any other user
-  // would dangle when the eval_in_mem is erased, so bail out on it.
+  // To evaluate the hlfir.eval_in_mem directly into the LHS, its result must
+  // only be used in the assignment, in a destroy, and in hlfir.shape_of (which
+  // can be replaced by a direct use of the shape operand).
   llvm::SmallVector<hlfir::ShapeOfOp> shapeOfs;
   for (mlir::Operation *user : evalInMem->getUsers()) {
-    if (auto op = mlir::dyn_cast<hlfir::AssignOp>(user))
+    if (auto op = mlir::dyn_cast<hlfir::AssignOp>(user)) {
+      if (assign)
+        return mlir::failure();
       assign = op;
-    else if (auto op = mlir::dyn_cast<hlfir::DestroyOp>(user))
+    } else if (auto op = mlir::dyn_cast<hlfir::DestroyOp>(user)) {
+      if (destroy)
+        return mlir::failure();
       destroy = op;
-    else if (auto op = mlir::dyn_cast<hlfir::ShapeOfOp>(user))
+    } else if (auto op = mlir::dyn_cast<hlfir::ShapeOfOp>(user)) {
       shapeOfs.push_back(op);
-    else
+    } else {
       return mlir::failure();
+    }
   }
   if (!assign || !destroy || destroy.mustFinalizeExpr() ||
       assign.isAllocatableAssignment())