[flang-commits] [flang] [Flang][OpenMP] Provide option to use heap allocation for private adjustable arrays (PR #186795)
via flang-commits
flang-commits at lists.llvm.org
Mon Mar 16 06:27:17 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Dominik Adamski (DominikAdamski)
<details>
<summary>Changes</summary>
The size of adjustable Fortran arrays is not known at compilation time. Using limited GPU stack memory may cause hard-to-debug errors. On the other hand, switching to heap memory allocation may lead to missed optimization opportunities and significantly increased kernel execution time.
Adding the option `-mmlir --enable-gpu-heap-alloc` allows the user to generate valid code for adjustable Fortran arrays. The flag is off by default, so there is no efficiency penalty for code that does not use adjustable arrays.
---
Full diff: https://github.com/llvm/llvm-project/pull/186795.diff
2 Files Affected:
- (modified) flang/lib/Lower/Support/PrivateReductionUtils.cpp (+23-7)
- (added) flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90 (+81)
``````````diff
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d1a965d288cad..63614913cdcd0 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -29,6 +29,12 @@
#include "flang/Semantics/symbol.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Location.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> enableGPUHeapAlloc(
+ "enable-gpu-heap-alloc",
+ llvm::cl::desc("Allow to use heap alloc for adjustable arrays on GPU"),
+ llvm::cl::init(false));
static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
if (sym.has<Fortran::semantics::ObjectEntityDetails>())
@@ -383,7 +389,7 @@ class PopulateInitAndCleanupRegionsHelper {
return loadedMoldArg;
}
- bool shouldAllocateTempOnStack() const;
+ bool shouldAllocateTempOnStack(fir::BaseBoxType boxTy) const;
};
} // namespace
@@ -446,7 +452,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
}
- bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+ bool shouldAllocateOnStack = shouldAllocateTempOnStack(boxTy);
mlir::Value valAlloc =
(shouldAllocateOnStack)
? builder.createTemporary(loc, innerTy, /*name=*/{},
@@ -477,12 +483,22 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
createYield(allocatedPrivVarArg);
}
-bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
- // On the GPU, always allocate on the stack since heap allocatins are very
- // expensive.
+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack(
+ fir::BaseBoxType boxTy) const {
auto offloadMod =
llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
- return offloadMod && offloadMod.getIsGPU();
+ // On the GPU, always allocate on the stack unless the user explicitly
+ // specifies otherwise since heap allocations are very expensive.
+ bool isGPU = offloadMod && offloadMod.getIsGPU();
+ if (isGPU && enableGPUHeapAlloc) {
+ // Check whether it is an adjustable array
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy())) {
+ if (seqTy.hasUnknownShape() || seqTy.hasDynamicExtents()) {
+ return false;
+ }
+ }
+ }
+ return isGPU;
}
void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
@@ -527,7 +543,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
// Allocating on the heap in case the whole reduction/privatization is nested
// inside of a loop
auto temp = [&]() {
- if (shouldAllocateTempOnStack())
+ if (shouldAllocateTempOnStack(boxTy))
return createStackTempFromMold(loc, builder, source);
auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
new file mode 100644
index 0000000000000..eac6580c18b99
--- /dev/null
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
@@ -0,0 +1,81 @@
+! Tests delayed privatization for `target ... private(..)` for adjustable arrays.
+! Tests the different allocation strategies: heap on CPU, heap on GPU (opt-in), and stack on GPU (default).
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
+! RUN: -mmlir --enable-gpu-heap-alloc -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN: -fopenmp -fopenmp-is-target-device \
+! RUN: -mmlir --enable-delayed-privatization-staging \
+! RUN: -mmlir --enable-gpu-heap-alloc \
+! RUN: -o - %s 2>&1 | \
+! RUN: FileCheck %s --check-prefix=GPU-HEAP \
+! RUN: %}
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN: -fopenmp -fopenmp-is-target-device \
+! RUN: -mmlir --enable-delayed-privatization-staging \
+! RUN: -o - %s 2>&1 | \
+! RUN: FileCheck %s --check-prefix=GPU-STACK \
+! RUN: %}
+
+subroutine target_adjustable_array(n_size)
+ implicit none
+ integer, intent(in) :: n_size
+ integer :: alloc_var(n_size)
+
+ !$omp target private(alloc_var)
+ alloc_var = 1
+ !$omp end target
+end subroutine target_adjustable_array
+
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! CPU-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! CPU-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! CPU-NEXT: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! CPU-NEXT: %[[PRIVATE_MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1
+! CPU-NEXT: %4:2 = hlfir.declare %3(%2) {[[NAME_ATTR:.*]]} : (![[HEAP_ARRAY_TYPE:.*]], !fir.shape<1>) -> (![[DESC_TYPE]], ![[HEAP_ARRAY_TYPE]])
+! CPU: omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE]]):
+! CPU-NEXT: %[[PRIV_ARG_VAL1:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! CPU-NEXT: %[[ALLOC_ADDR:.*]] = fir.box_addr %[[PRIV_ARG_VAL1]] : (![[DESC_TYPE]]) -> ![[REF_ARRAY_TYPE:.*]]
+! CPU: %[[CONV:.*]] = fir.convert %[[ALLOC_ADDR]] : (![[REF_ARRAY_TYPE]]) -> ![[HEAP_ARRAY_TYPE]]
+! CPU-NEXT: fir.freemem %[[CONV]] : ![[HEAP_ARRAY_TYPE]]
+! CPU: omp.yield
+! CPU-NEXT: }
+
+! GPU-HEAP-LABEL: omp.private {type = private}
+! GPU-HEAP-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! GPU-HEAP-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! GPU-HEAP-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-HEAP-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! GPU-HEAP-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! GPU-HEAP-NEXT: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! GPU-HEAP-NEXT: %[[PRIVATE_MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1
+! GPU-HEAP-NEXT: %4:2 = hlfir.declare %3(%2) {[[NAME_ATTR:.*]]} : (![[HEAP_ARRAY_TYPE:.*]], !fir.shape<1>) -> (![[DESC_TYPE]], ![[HEAP_ARRAY_TYPE]])
+! GPU-HEAP: omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+! GPU-HEAP-NEXT: } dealloc {
+! GPU-HEAP-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE]]):
+! GPU-HEAP-NEXT: %[[PRIV_ARG_VAL1:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-HEAP-NEXT: %[[ALLOC_ADDR:.*]] = fir.box_addr %[[PRIV_ARG_VAL1]] : (![[DESC_TYPE]]) -> ![[REF_ARRAY_TYPE:.*]]
+! GPU-HEAP: %[[CONV:.*]] = fir.convert %[[ALLOC_ADDR]] : (![[REF_ARRAY_TYPE]]) -> ![[HEAP_ARRAY_TYPE]]
+! GPU-HEAP-NEXT: fir.freemem %[[CONV]] : ![[HEAP_ARRAY_TYPE]]
+! GPU-HEAP: omp.yield
+! GPU-HEAP-NEXT: }
+
+! GPU-STACK-LABEL: omp.private {type = private}
+! GPU-STACK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! GPU-STACK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! GPU-STACK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-STACK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! GPU-STACK-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! GPU-STACK-NOT: %[[PRIVATE_MEM:.*]] = fir.allocmem
+! GPU-STACK: %[[ALLOCA_ADDR:.*]] = fir.alloca !fir.array<?xi32>, %[[BOX_DIMS]]#1 {[[NAME_ATTR:.*]]}
+! GPU-STACK: omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+
``````````
</details>
https://github.com/llvm/llvm-project/pull/186795
More information about the flang-commits
mailing list