[flang-commits] [flang] [Flang][OpenMP] Provide option to use heap allocation for private adjustable arrays (PR #186795)

Mon Mar 16 07:06:27 PDT 2026

https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/186795

>From d3e5b56115d2030dae403e9aab4777e3f21884b5 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 16 Mar 2026 07:59:06 -0500
Subject: [PATCH 1/2] [Flang][OpenMP] Provide option to use heap allocation for
 private adjustable arrays

The size of adjustable Fortran arrays is not known at compilation time.
Using limited GPU stack memory may cause hard-to-debug errors. On the other hand,
switching to heap memory allocation may lead to missed optimization opportunities
and significantly increased kernel execution time.

Adding the option `-mmlir --enable-gpu-heap-alloc` allows the user to generate
valid code for adjustable Fortran arrays. The flag is off by default,
so there is no efficiency penalty for code that does not use adjustable arrays.
---
 .../Lower/Support/PrivateReductionUtils.cpp   | 30 +++++--
 .../target-private-adjustable-array.f90       | 81 +++++++++++++++++++
 2 files changed, 104 insertions(+), 7 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d1a965d288cad..63614913cdcd0 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -29,6 +29,12 @@
 #include "flang/Semantics/symbol.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Location.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> enableGPUHeapAlloc(
+    "enable-gpu-heap-alloc",
+    llvm::cl::desc("Allow to use heap alloc for adjustable arrays on GPU"),
+    llvm::cl::init(false));
 
 static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
   if (sym.has<Fortran::semantics::ObjectEntityDetails>())
@@ -383,7 +389,7 @@ class PopulateInitAndCleanupRegionsHelper {
     return loadedMoldArg;
   }
 
-  bool shouldAllocateTempOnStack() const;
+  bool shouldAllocateTempOnStack(fir::BaseBoxType boxTy) const;
 };
 
 } // namespace
@@ -446,7 +452,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
     builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
   }
 
-  bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+  bool shouldAllocateOnStack = shouldAllocateTempOnStack(boxTy);
   mlir::Value valAlloc =
       (shouldAllocateOnStack)
           ? builder.createTemporary(loc, innerTy, /*name=*/{},
@@ -477,12 +483,22 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   createYield(allocatedPrivVarArg);
 }
 
-bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
-  // On the GPU, always allocate on the stack since heap allocatins are very
-  // expensive.
+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack(
+    fir::BaseBoxType boxTy) const {
   auto offloadMod =
       llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
-  return offloadMod && offloadMod.getIsGPU();
+  // On the GPU, always allocate on the stack unless the user explicitly
+  // specifies otherwise since heap allocations are very expensive.
+  bool isGPU = offloadMod && offloadMod.getIsGPU();
+  if (isGPU && enableGPUHeapAlloc) {
+    // Check if it is adjustable array
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy())) {
+      if (seqTy.hasUnknownShape() || seqTy.hasDynamicExtents()) {
+        return false;
+      }
+    }
+  }
+  return isGPU;
 }
 
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
@@ -527,7 +543,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
   auto temp = [&]() {
-    if (shouldAllocateTempOnStack())
+    if (shouldAllocateTempOnStack(boxTy))
       return createStackTempFromMold(loc, builder, source);
 
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
new file mode 100644
index 0000000000000..eac6580c18b99
--- /dev/null
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
@@ -0,0 +1,81 @@
+! Tests delayed privatization for `target ... private(..)` for adjustable arrays.
+! Tests the different allocation strategies (heap vs. stack) on CPU and GPU.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
+! RUN: -mmlir --enable-gpu-heap-alloc  -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN:   %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir  \
+! RUN:     -fopenmp -fopenmp-is-target-device \
+! RUN:     -mmlir --enable-delayed-privatization-staging \
+! RUN:     -mmlir --enable-gpu-heap-alloc \
+! RUN:     -o - %s 2>&1 | \
+! RUN:   FileCheck %s --check-prefix=GPU-HEAP \
+! RUN: %}
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN:   %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir  \
+! RUN:     -fopenmp -fopenmp-is-target-device \
+! RUN:     -mmlir --enable-delayed-privatization-staging \
+! RUN:     -o - %s 2>&1 | \
+! RUN:   FileCheck %s --check-prefix=GPU-STACK  \
+! RUN: %}
+
+subroutine target_adjustable_array(n_size)
+  implicit none
+  integer, intent(in) :: n_size
+  integer  :: alloc_var(n_size)
+
+  !$omp target private(alloc_var)
+    alloc_var = 1
+  !$omp end target
+end subroutine target_adjustable_array
+
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! CPU-NEXT:  ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! CPU-NEXT:  %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! CPU-NEXT:  %[[C0:.*]] = arith.constant 0 : index 
+! CPU-NEXT:  %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! CPU-NEXT:  %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! CPU-NEXT:  %[[PRIVATE_MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1
+! CPU-NEXT:  %{{.*}}:2 = hlfir.declare %[[PRIVATE_MEM]](%[[SHAPE]]) {[[NAME_ATTR:.*]]} : (![[HEAP_ARRAY_TYPE:.*]], !fir.shape<1>) -> (![[DESC_TYPE]], ![[HEAP_ARRAY_TYPE]])
+! CPU:      omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE]]):
+! CPU-NEXT:  %[[PRIV_ARG_VAL1:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! CPU-NEXT:  %[[ALLOC_ADDR:.*]] = fir.box_addr %[[PRIV_ARG_VAL1]] : (![[DESC_TYPE]]) -> ![[REF_ARRAY_TYPE:.*]]
+! CPU:       %[[CONV:.*]] = fir.convert %[[ALLOC_ADDR]] : (![[REF_ARRAY_TYPE]]) -> ![[HEAP_ARRAY_TYPE]]
+! CPU-NEXT:  fir.freemem %[[CONV]] : ![[HEAP_ARRAY_TYPE]]
+! CPU:      omp.yield
+! CPU-NEXT: }
+
+! GPU-HEAP-LABEL: omp.private {type = private}
+! GPU-HEAP-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! GPU-HEAP-NEXT:  ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! GPU-HEAP-NEXT:  %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-HEAP-NEXT:  %[[C0:.*]] = arith.constant 0 : index 
+! GPU-HEAP-NEXT:  %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! GPU-HEAP-NEXT:  %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! GPU-HEAP-NEXT:  %[[PRIVATE_MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1
+! GPU-HEAP-NEXT:  %{{.*}}:2 = hlfir.declare %[[PRIVATE_MEM]](%[[SHAPE]]) {[[NAME_ATTR:.*]]} : (![[HEAP_ARRAY_TYPE:.*]], !fir.shape<1>) -> (![[DESC_TYPE]], ![[HEAP_ARRAY_TYPE]])
+! GPU-HEAP:      omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+! GPU-HEAP-NEXT: } dealloc {
+! GPU-HEAP-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE]]):
+! GPU-HEAP-NEXT:  %[[PRIV_ARG_VAL1:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-HEAP-NEXT:  %[[ALLOC_ADDR:.*]] = fir.box_addr %[[PRIV_ARG_VAL1]] : (![[DESC_TYPE]]) -> ![[REF_ARRAY_TYPE:.*]]
+! GPU-HEAP:       %[[CONV:.*]] = fir.convert %[[ALLOC_ADDR]] : (![[REF_ARRAY_TYPE]]) -> ![[HEAP_ARRAY_TYPE]]
+! GPU-HEAP-NEXT:  fir.freemem %[[CONV]] : ![[HEAP_ARRAY_TYPE]]
+! GPU-HEAP:      omp.yield
+! GPU-HEAP-NEXT: }
+
+! GPU-STACK-LABEL: omp.private {type = private}
+! GPU-STACK-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! GPU-STACK-NEXT:  ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! GPU-STACK-NEXT:  %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-STACK-NEXT:  %[[C0:.*]] = arith.constant 0 : index 
+! GPU-STACK-NEXT:  %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! GPU-STACK-NOT:   %[[PRIVATE_MEM:.*]] = fir.allocmem
+! GPU-STACK:       %[[ALLOCA_ADDR:.*]] = fir.alloca !fir.array<?xi32>, %[[BOX_DIMS]]#1 {[[NAME_ATTR:.*]]}
+! GPU-STACK:      omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+

>From a1018396f95b6e36971922f82aa1b14011afc7ce Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 16 Mar 2026 09:05:12 -0500
Subject: [PATCH 2/2] Applied remarks

---
 flang/lib/Lower/Support/PrivateReductionUtils.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 63614913cdcd0..f63fb6ecfe43f 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -33,7 +33,8 @@
 
 static llvm::cl::opt<bool> enableGPUHeapAlloc(
     "enable-gpu-heap-alloc",
-    llvm::cl::desc("Allow to use heap alloc for adjustable arrays on GPU"),
+    llvm::cl::desc(
+        "Allow the use of heap allocation for dynamically sized arrays on GPU"),
     llvm::cl::init(false));
 
 static bool hasFinalization(const Fortran::semantics::Symbol &sym) {