[flang-commits] [flang] [flang] handle alloca outside of entry blocks in MemoryAllocation (PR #98457)

Mon Jul 15 06:52:13 PDT 2024

================
@@ -0,0 +1,220 @@
+//===- MemoryUtils.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Transforms/MemoryUtils.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Dominance.h"
+#include "llvm/ADT/STLExtras.h"
+
+namespace {
+/// Bundles the state used by replace(): the callback producing the
+/// replacement for an alloca, the callback generating deallocations, and the
+/// dominance information used to compare an alloca with its candidate
+/// deallocation points.
+class AllocaReplaceImpl {
+public:
+  /// Keep a copy of both callbacks. The dominance information is
+  /// default-constructed.
+  AllocaReplaceImpl(fir::AllocaRewriterCallBack allocaRewriter,
+                    fir::DeallocCallBack deallocGenerator)
+      : allocaRewriter{allocaRewriter}, deallocGenerator{deallocGenerator} {}
+
+  /// Entry point; defined out of line.
+  bool replace(mlir::RewriterBase &, fir::AllocaOp);
+
+private:
+  /// Collect the deallocation points of \p alloca into
+  /// \p deallocationPoints and return the region owning the alloca, or
+  /// nullptr when none can be determined. Defined out of line.
+  mlir::Region *findDeallocationPointsAndOwner(
+      fir::AllocaOp alloca,
+      llvm::SmallVectorImpl<mlir::Operation *> &deallocationPoints);
+
+  /// Return true when \p alloca properly dominates every operation listed in
+  /// \p deallocationPoints (trivially true for an empty list).
+  bool
+  allocDominatesDealloc(fir::AllocaOp alloca,
+                        llvm::ArrayRef<mlir::Operation *> deallocationPoints) {
+    mlir::Operation *allocaOp = alloca.getOperation();
+    for (mlir::Operation *point : deallocationPoints)
+      if (!dominanceInfo.properlyDominates(allocaOp, point))
+        return false;
+    return true;
+  }
+
+  /// Defined out of line.
+  void
+  genIndirectDeallocation(mlir::RewriterBase &, fir::AllocaOp,
+                          llvm::ArrayRef<mlir::Operation *> deallocationPoints,
+                          mlir::Value replacement, mlir::Region &owningRegion);
+
+  // Data members. The declaration order is significant: it is the
+  // initialization order.
+  fir::AllocaRewriterCallBack allocaRewriter;
+  fir::DeallocCallBack deallocGenerator;
+  mlir::DominanceInfo dominanceInfo;
+};
+} // namespace
+
+/// Return true if at least one value yielded by \p terminator has a type
+/// accepted by fir::conformsWithPassByRef.
+///
+/// The values a terminator yields are its operands. A region terminator
+/// normally defines no results, so looking at the results would make this
+/// predicate return false for every terminator.
+static bool terminatorYieldsMemory(mlir::Operation &terminator) {
+  return llvm::any_of(terminator.getOperands(), [](mlir::Value operand) {
+    return fir::conformsWithPassByRef(operand.getType());
+  });
+}
+
+/// Return true when \p terminator has no block successor, which is the
+/// criterion used here to recognize an exit point of a region.
+///
+/// The ReturnLike trait is not used on purpose: some region terminators that
+/// matter do not carry it (e.g. omp::TerminatorOp, which has no results).
+///
+/// This may return true in two benign situations:
+///  - dead code: not a correctness issue, and it can be removed by running
+///    region simplification before this utility is used;
+///  - unreachable-like terminators (e.g. after an abort call related to
+///    Fortran STOP): the deallocation inserted there is simply never reached.
+///    They are kept because the rest of this file is simpler when there is
+///    always at least one deallocation point.
+static bool isRegionTerminator(mlir::Operation &terminator) {
+  return terminator.getNumSuccessors() == 0;
+}
+
+/// Find the region owning \p alloca and append to \p deallocationPoints the
+/// operations selected as deallocation points for it.
+///
+/// Returns the owning region, or nullptr when the alloca has no owner region,
+/// when the owner is an OpenACC/OpenMP recipe whose exit yields memory, or
+/// when no exit point is found. \p deallocationPoints may already have been
+/// partially filled when nullptr is returned.
+mlir::Region *AllocaReplaceImpl::findDeallocationPointsAndOwner(
+    fir::AllocaOp alloca,
+    llvm::SmallVectorImpl<mlir::Operation *> &deallocationPoints) {
+  // Step 1: find the region that owns the alloca and the operation holding
+  // that region.
+  mlir::Region *ownerRegion = alloca.getOwnerRegion();
+  if (!ownerRegion)
+    return nullptr;
+  mlir::Operation *ownerOp = ownerRegion->getParentOp();
+  assert(ownerOp && "region expected to be owned");
+
+  // Step 2: the exit points of the owner region are the default deallocation
+  // points. TODO: detect and use lifetime markers to get earlier
+  // deallocation points.
+  const bool ownerIsRecipe = mlir::isa<mlir::accomp::RecipeInterface>(ownerOp);
+  for (mlir::Block &block : ownerRegion->getBlocks()) {
+    mlir::Operation *exitCandidate = block.getTerminator();
+    if (!isRegionTerminator(*exitCandidate))
+      continue;
+    // FIXME: OpenACC and OpenMP privatization recipes are stand-alone
+    // operations meant to be "inlined" later, and the value they return may
+    // be the address of a local alloca. Inserting a deallocation before the
+    // terminator would be incorrect (use after free once the recipe is
+    // inlined). This probably requires a redesign or special handling on the
+    // OpenACC/OpenMP side.
+    if (ownerIsRecipe && terminatorYieldsMemory(*exitCandidate))
+      return nullptr;
+    deallocationPoints.push_back(exitCandidate);
+  }
+
+  // No block of the owner region ends with a terminator without successors:
+  // there is nowhere to deallocate, so give up. FIR, OpenACC, OpenMP, CUF and
+  // SCF operations with IsIsolatedFromAbove, AutomaticAllocationScope, or
+  // LoopLikeOpInterface are all expected to have such terminators, so no
+  // untested fallback is attempted for now.
+  if (deallocationPoints.empty())
+    return nullptr;
+
+  // Step 3: account for block-based loops between the alloca and the exits.
+  // If the alloca sits in such a loop and the exits are after it, the exits
+  // are not dominated by the alloca, so dominance is used as a conservative
+  // loop detector (an alloca inside if-like CFG is a false positive).
+  // In that case the alloca itself is registered as a deallocation point for
+  // any previous execution of it. Since the alloca does not properly dominate
+  // itself, the deallocation inserted there will be conditional. Bringing
+  // lifetime markers into the picture would reduce the false positives.
+  const bool allocaDominatesAllExits =
+      allocDominatesDealloc(alloca, deallocationPoints);
+  if (!allocaDominatesAllExits)
+    deallocationPoints.push_back(alloca.getOperation());
+  return ownerRegion;
+}
----------------
jeanPerier wrote:

There are two things that are being described here: how block loops are detected, and how they are dealt with.
 
1. If the alloca is inside a block-based loop and the deallocation points are after the loop, the deallocation points will not be dominated by the alloca. So we can use dominance to detect block-based loops, with some false positives (an alloca in "if-like" block CFG will also create cases where the deallocation points are not dominated by the alloca).

2. If the alloca is inside a block-based loop and the deallocation points are after the loop, an extra deallocation must be added on the back edge to avoid memory leaks. The easiest solution is to insert a conditional deallocation right before the alloca: if the alloca has been executed before, the C pointer variable will be set, and the memory it points to is deallocated right before executing the alloca again. This does not require any analysis to find the back edge and can be done in the "false positive" cases with no correctness consequences (in fact, it could always be done).

Here is an illustration:
```
subroutine test(n)
  integer :: n
  real :: res
  interface
    function create_array(i)
      real :: create_array(i)
    end function
  end interface
  res = 0
  do i=1,100
    res = res + sum(create_array(i))
    if (i>n) goto 1
  end do
1 call do_something(res)
end subroutine
```

Because of the goto, the do loop is lowered to block CFG (blocks + `cf.cond_br`). The alloca is contained inside a block for the loop body. The alloca owner is the func.func, since there is no region-based operation for the loop. So the only deallocation point found so far is the func.return. The func.return is not dominated by the block containing the alloca because of the loop structure: its block (`^bb5`) can be reached from the loop entry (`^bb1`) without going through the loop body (`^bb2`). Hence, a conditional deallocation will be inserted before the new allocmem.

```
  func.func @_QPtest(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
    cf.br ^bb1

  ^bb1:  // 2 preds: ^bb0, ^bb4
    // ... loop entry
    cf.cond_br %8, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    %17 = fir.alloca !fir.array<?xf32>, %15 {bindc_name = ".result"}
    // ... SUM
    // ... cf.cond_br for the IF () GOTO 1
    cf.cond_br %38, ^bb3, ^bb4
  ^bb3:  // pred: ^bb2
    cf.br ^bb5
  ^bb4:  // pred: ^bb2
    // ... index increment and cf.br back to the loop entry
    cf.br ^bb1
  ^bb5:  // 2 preds: ^bb1, ^bb3
    fir.call @_QPdo_something(%6) fastmath<contract> : (!fir.ref<f32>) -> ()
    return
  }
```

Will be turned into:


```
  func.func @_QPtest(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
    // Init pointer variable to nullptr
    %0 = fir.alloca !fir.heap<!fir.array<?xf32>>
    %1 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
    fir.store %1 to %0 : !fir.ref<!fir.heap<!fir.array<?xf32>>>
    cf.br ^bb1
  ^bb1:  // 2 preds: ^bb0, ^bb4
    // loop entry
    cf.cond_br %8, ^bb2, ^bb5
  ^bb2:  // pred: ^bb1
    // Conditionally dealloc the memory if pointer variable is set.
    %19 = fir.load %0 : !fir.ref<!fir.heap<!fir.array<?xf32>>>
    %20 = fir.convert %19 : (!fir.heap<!fir.array<?xf32>>) -> i64
    %21 = arith.cmpi ne, %20, %c0_i64 : i64
    fir.if %21 {
      fir.freemem %19 : !fir.heap<!fir.array<?xf32>>
    }
    // Allocate memory and set pointer variable to it.
    %22 = fir.allocmem !fir.array<?xf32>, %17 {bindc_name = ".result", uniq_name = ""}
    %23 = fir.convert %22 : (!fir.heap<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
    fir.store %22 to %0 : !fir.ref<!fir.heap<!fir.array<?xf32>>>
    //  ...
    cf.cond_br %38, ^bb3, ^bb4
  ^bb3:  // pred: ^bb2
    cf.br ^bb5
  ^bb4:  // pred: ^bb2
    cf.br ^bb1
  ^bb5:  // 2 preds: ^bb1, ^bb3
    fir.call @_QPdo_something(%6) fastmath<contract> : (!fir.ref<f32>) -> ()
    // Conditionally dealloc the memory if pointer variable is set
    // ...
    fir.if %51 {
      fir.freemem %49 : !fir.heap<!fir.array<?xf32>>
    }
    return
  }
```

I rephrased the comment to make it a bit clearer.

https://github.com/llvm/llvm-project/pull/98457