[flang-commits] [flang] [Flang][OpenMP] Heap-allocate GPU dynamic private arrays in distribute parallel do (PR #200841)

Mon Jun 1 08:03:00 PDT 2026

https://github.com/TIFitis created https://github.com/llvm/llvm-project/pull/200841

Fixes GPU offload crashes for Fortran automatic arrays privatized in `target teams distribute parallel do` constructs.

For delayed privatization on GPU, dynamically sized boxed array privates are now routed through the existing heap-allocation path, with matching cleanup emitted in the privatizer dealloc region. This avoids lowering such arrays to runtime-sized scratch allocas whose descriptors can be captured across the distribute callback boundary.

Co-authored-by: Codex <codex at openai.com>

From 391921279843133c3b4ca8578ae43aec6fbf7b41 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Mon, 1 Jun 2026 15:30:46 +0100
Subject: [PATCH] [Flang][OpenMP] Heap-allocate GPU dynamic private arrays in
 distribute parallel do

Fixes GPU offload crashes for Fortran automatic arrays privatized in target teams distribute parallel do.

For delayed privatization on GPU, dynamically sized boxed array privates are now routed through the existing heap-allocation path, with matching cleanup emitted in the privatizer dealloc region. This avoids lowering such arrays to runtime-sized scratch allocas whose descriptors can be captured across the distribute callback boundary.
---
 .../Lower/Support/PrivateReductionUtils.h     |  3 +-
 flang/include/flang/Lower/Support/Utils.h     |  3 +-
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp |  3 +-
 flang/lib/Lower/OpenMP/DataSharingProcessor.h |  5 ++
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  1 +
 .../Lower/Support/PrivateReductionUtils.cpp   | 12 +++--
 flang/lib/Lower/Support/Utils.cpp             | 39 +++++++++++---
 ...ms-distribute-private-adjustable-array.f90 | 52 +++++++++++++++++++
 8 files changed, 104 insertions(+), 14 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-distribute-private-adjustable-array.f90

diff --git a/flang/include/flang/Lower/Support/PrivateReductionUtils.h b/flang/include/flang/Lower/Support/PrivateReductionUtils.h
index 7c0f11a24afd8..8957d1ed0d1cd 100644
--- a/flang/include/flang/Lower/Support/PrivateReductionUtils.h
+++ b/flang/include/flang/Lower/Support/PrivateReductionUtils.h
@@ -58,7 +58,8 @@ void populateByRefInitAndCleanupRegions(
     mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
     mlir::Region &cleanupRegion, DeclOperationKind kind,
     const Fortran::semantics::Symbol *sym = nullptr,
-    bool cannotHaveNonDefaultLowerBounds = false, bool isDoConcurrent = false);
+    bool cannotHaveNonDefaultLowerBounds = false, bool isDoConcurrent = false,
+    bool forceHeapAllocation = false);
 
 /// Generate a fir::ShapeShift op describing the provided boxed array.
 /// `cannotHaveNonDefaultLowerBounds` should be set if `box` is known to have
diff --git a/flang/include/flang/Lower/Support/Utils.h b/flang/include/flang/Lower/Support/Utils.h
index 4e83a0e3bfec7..e12233f2a21a2 100644
--- a/flang/include/flang/Lower/Support/Utils.h
+++ b/flang/include/flang/Lower/Support/Utils.h
@@ -120,7 +120,8 @@ void privatizeSymbol(
     llvm::SetVector<const semantics::Symbol *> &allPrivatizedSymbols,
     llvm::SmallPtrSet<const semantics::Symbol *, 16> &mightHaveReadHostSym,
     const semantics::Symbol *symToPrivatize, OperandsStructType *clauseOps,
-    std::optional<llvm::omp::Directive> dir = std::nullopt);
+    std::optional<llvm::omp::Directive> dir = std::nullopt,
+    bool forceHeapAllocationForPrivateDynamicArrays = false);
 
 } // end namespace Fortran::lower
 
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index e392497d30de7..34765f6f253e5 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -704,7 +704,8 @@ void DataSharingProcessor::privatizeSymbol(
   Fortran::lower::privatizeSymbol<mlir::omp::PrivateClauseOp,
                                   mlir::omp::PrivateClauseOps>(
       converter, firOpBuilder, symTable, allPrivatizedSymbols,
-      mightHaveReadHostSym, symToPrivatize, clauseOps, dir);
+      mightHaveReadHostSym, symToPrivatize, clauseOps, dir,
+      forceHeapAllocationForPrivateDynamicArrays);
 }
 } // namespace omp
 } // namespace lower
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 5dd564d4bbb61..a9b57cada9d92 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -105,6 +105,7 @@ class DataSharingProcessor {
   lower::pft::Evaluation &eval;
   bool shouldCollectPreDeterminedSymbols;
   bool useDelayedPrivatization;
+  bool forceHeapAllocationForPrivateDynamicArrays = false;
   llvm::SmallPtrSet<const semantics::Symbol *, 16> mightHaveReadHostSym;
   lower::SymMap &symTable;
   bool isTargetPrivatization;
@@ -179,6 +180,10 @@ class DataSharingProcessor {
 
   void pushLoopIV(mlir::Value iv) { loopIVs.push_back(iv); }
 
+  void setForceHeapAllocationForPrivateDynamicArrays(bool value = true) {
+    forceHeapAllocationForPrivateDynamicArrays = value;
+  }
+
   const llvm::SetVector<const semantics::Symbol *> &
   getAllSymbolsToPrivatize() const {
     return allPrivatizedSymbols;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 67911e9044ef1..f8c0b5ac9a3a6 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3429,6 +3429,7 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDo(
   DataSharingProcessor dsp(converter, semaCtx, doItem->clauses, eval,
                            /*shouldCollectPreDeterminedSymbols=*/true,
                            /*useDelayedPrivatization=*/true, symTable);
+  dsp.setForceHeapAllocationForPrivateDynamicArrays();
   dsp.processStep1(&parallelClauseOps);
 
   EntryBlockArgs parallelArgs;
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index aaf2069ec34bd..a87b69f8f1069 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -323,13 +323,14 @@ class PopulateInitAndCleanupRegionsHelper {
       mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
       mlir::Block *initBlock, mlir::Region &cleanupRegion,
       DeclOperationKind kind, const Fortran::semantics::Symbol *sym,
-      bool cannotHaveLowerBounds, bool isDoConcurrent)
+      bool cannotHaveLowerBounds, bool isDoConcurrent, bool forceHeapAllocation)
       : converter{converter}, builder{converter.getFirOpBuilder()}, loc{loc},
         argType{argType}, scalarInitValue{scalarInitValue},
         allocatedPrivVarArg{allocatedPrivVarArg}, moldArg{moldArg},
         initBlock{initBlock}, cleanupRegion{cleanupRegion}, kind{kind},
         sym{sym}, cannotHaveNonDefaultLowerBounds{cannotHaveLowerBounds},
-        isDoConcurrent{isDoConcurrent} {
+        isDoConcurrent{isDoConcurrent},
+        forceHeapAllocation{forceHeapAllocation} {
     valType = fir::unwrapRefType(argType);
   }
 
@@ -376,6 +377,7 @@ class PopulateInitAndCleanupRegionsHelper {
   bool cannotHaveNonDefaultLowerBounds;
 
   bool isDoConcurrent;
+  bool forceHeapAllocation;
 
   void createYield(mlir::Value ret) {
     if (isDoConcurrent)
@@ -520,6 +522,8 @@ bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack(
   // On the GPU, always allocate on the stack unless the user explicitly
   // specifies otherwise since heap allocatins are very expensive.
   bool isGPU = offloadMod && offloadMod.getIsGPU();
+  if (isGPU && forceHeapAllocation)
+    return false;
   if (isGPU && enableGPUHeapAlloc) {
     // Check if it is adjustable array
     if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy())) {
@@ -777,11 +781,11 @@ void Fortran::lower::populateByRefInitAndCleanupRegions(
     mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
     mlir::Region &cleanupRegion, DeclOperationKind kind,
     const Fortran::semantics::Symbol *sym, bool cannotHaveLowerBounds,
-    bool isDoConcurrent) {
+    bool isDoConcurrent, bool forceHeapAllocation) {
   PopulateInitAndCleanupRegionsHelper helper(
       converter, loc, argType, scalarInitValue, allocatedPrivVarArg, moldArg,
       initBlock, cleanupRegion, kind, sym, cannotHaveLowerBounds,
-      isDoConcurrent);
+      isDoConcurrent, forceHeapAllocation);
   helper.populateByRefInitAndCleanupRegions();
 
   // Often we load moldArg to check something (e.g. length parameters, shape)
diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp
index 280968975ea96..722e92016eb2a 100644
--- a/flang/lib/Lower/Support/Utils.cpp
+++ b/flang/lib/Lower/Support/Utils.cpp
@@ -20,6 +20,7 @@
 #include "flang/Lower/Support/PrivateReductionUtils.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Semantics/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
@@ -678,6 +679,20 @@ void copyFirstPrivateSymbol(lower::AbstractConverter &converter,
     converter.copyHostAssociateVar(*sym, copyAssignIP);
 }
 
+static bool isDynamicallySizedBoxArrayType(mlir::Type type) {
+  auto boxType = mlir::dyn_cast<fir::BaseBoxType>(type);
+  if (!boxType)
+    return false;
+  auto seqType = mlir::dyn_cast<fir::SequenceType>(boxType.getEleTy());
+  return seqType && (seqType.hasUnknownShape() || seqType.hasDynamicExtents());
+}
+
+static bool isGpuModule(fir::FirOpBuilder &builder) {
+  auto offloadMod =
+      llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
+  return offloadMod && offloadMod.getIsGPU();
+}
+
 template <typename OpType, typename OperandsStructType>
 void privatizeSymbol(
     lower::AbstractConverter &converter, fir::FirOpBuilder &firOpBuilder,
@@ -685,7 +700,8 @@ void privatizeSymbol(
     llvm::SetVector<const semantics::Symbol *> &allPrivatizedSymbols,
     llvm::SmallPtrSet<const semantics::Symbol *, 16> &mightHaveReadHostSym,
     const semantics::Symbol *symToPrivatize, OperandsStructType *clauseOps,
-    std::optional<llvm::omp::Directive> dir) {
+    std::optional<llvm::omp::Directive> dir,
+    bool forceHeapAllocationForPrivateDynamicArrays) {
   constexpr bool isDoConcurrent =
       std::is_same_v<OpType, fir::LocalitySpecifierOp>;
   mlir::OpBuilder::InsertPoint dcIP;
@@ -757,13 +773,19 @@ void privatizeSymbol(
   }
 
   mlir::Type argType = privVal.getType();
+  bool forceHeapAllocation = forceHeapAllocationForPrivateDynamicArrays &&
+                             isGpuModule(firOpBuilder) &&
+                             isDynamicallySizedBoxArrayType(allocType);
 
   OpType privatizerOp = [&]() {
     auto moduleOp = firOpBuilder.getModule();
-    auto uniquePrivatizerName = fir::getTypeAsString(
-        allocType, converter.getKindMap(),
+    std::string privatizerPrefix =
         converter.mangleName(*sym) +
-            (emitCopyRegion ? "_firstprivate" : "_private"));
+        (emitCopyRegion ? "_firstprivate" : "_private");
+    if (forceHeapAllocation)
+      privatizerPrefix += "_heap";
+    auto uniquePrivatizerName = fir::getTypeAsString(
+        allocType, converter.getKindMap(), privatizerPrefix);
 
     if (auto existingPrivatizer =
             moduleOp.lookupSymbol<OpType>(uniquePrivatizerName))
@@ -825,7 +847,8 @@ void privatizeSymbol(
           result.getDeallocRegion(),
           emitCopyRegion ? DeclOperationKind::FirstPrivateOrLocalInit
                          : DeclOperationKind::PrivateOrLocal,
-          symToPrivatize, cannotHaveNonDefaultLowerBounds, isDoConcurrent);
+          symToPrivatize, cannotHaveNonDefaultLowerBounds, isDoConcurrent,
+          forceHeapAllocation);
       // TODO: currently there are false positives from dead uses of the mold
       // arg
       if (result.initReadsFromMold())
@@ -896,7 +919,8 @@ privatizeSymbol<mlir::omp::PrivateClauseOp, mlir::omp::PrivateClauseOps>(
     llvm::SmallPtrSet<const semantics::Symbol *, 16> &mightHaveReadHostSym,
     const semantics::Symbol *symToPrivatize,
     mlir::omp::PrivateClauseOps *clauseOps,
-    std::optional<llvm::omp::Directive> dir);
+    std::optional<llvm::omp::Directive> dir,
+    bool forceHeapAllocationForPrivateDynamicArrays);
 
 template void
 privatizeSymbol<fir::LocalitySpecifierOp, fir::LocalitySpecifierOperands>(
@@ -906,6 +930,7 @@ privatizeSymbol<fir::LocalitySpecifierOp, fir::LocalitySpecifierOperands>(
     llvm::SmallPtrSet<const semantics::Symbol *, 16> &mightHaveReadHostSym,
     const semantics::Symbol *symToPrivatize,
     fir::LocalitySpecifierOperands *clauseOps,
-    std::optional<llvm::omp::Directive> dir);
+    std::optional<llvm::omp::Directive> dir,
+    bool forceHeapAllocationForPrivateDynamicArrays);
 
 } // end namespace Fortran::lower
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-distribute-private-adjustable-array.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-distribute-private-adjustable-array.f90
new file mode 100644
index 0000000000000..7e2e411e1e96a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-distribute-private-adjustable-array.f90
@@ -0,0 +1,52 @@
+! Test that GPU delayed privatization allocates dynamic private arrays for
+! target teams distribute parallel do on the heap and emits cleanup.
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN:   %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN:     -fopenmp -fopenmp-is-target-device \
+! RUN:     -mmlir --enable-delayed-privatization-staging \
+! RUN:     -o - %s 2>&1 | FileCheck %s \
+! RUN: %}
+
+subroutine dynamic_private_tmp(b)
+  real, dimension(:,:), intent(inout) :: b
+  real, dimension(size(b, 1)) :: tmp
+  integer :: i, j, k
+
+  !$omp target teams distribute parallel do collapse(2) private(tmp)
+  do j = 1, 1
+    do i = 1, 1
+      do k = 1, size(b, 1)
+        tmp(k) = 1.0
+      end do
+      b(i,j) = tmp(1)
+    end do
+  end do
+end subroutine
+
+subroutine static_private_tmp(b)
+  real, dimension(:,:), intent(inout) :: b
+  real, dimension(64) :: tmp
+  integer :: i, j, k
+
+  !$omp target teams distribute parallel do collapse(2) private(tmp)
+  do j = 1, 1
+    do i = 1, 1
+      do k = 1, 64
+        tmp(k) = 1.0
+      end do
+      b(i,j) = tmp(1)
+    end do
+  end do
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private} @{{.*}}Etmp_private_box_64xf32 : !fir.box<!fir.array<64xf32>> init {
+! CHECK-NOT:     fir.allocmem
+! CHECK:         fir.alloca !fir.array<64xf32>
+! CHECK:         omp.yield
+
+! CHECK-LABEL: omp.private {type = private} @{{.*}}Etmp_private_heap_box_Uxf32 : !fir.box<!fir.array<?xf32>> init {
+! CHECK:         %[[DIMS:.*]]:3 = fir.box_dims
+! CHECK:         fir.allocmem !fir.array<?xf32>, %[[DIMS]]#1
+! CHECK:       } dealloc {
+! CHECK:         fir.freemem