[flang-commits] [flang] [flang][OpenMP] Allocate `allocatable` init temps on the stack for GPUs (PR #164761)

Kareem Ergawy via flang-commits flang-commits at lists.llvm.org
Wed Oct 22 23:40:44 PDT 2025


https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/164761

Temps needed for the init regions of allocatable reductions and privatizations are currently always allocated on the heap. However, this is a performance killer on GPUs, where malloc calls are prohibitively expensive. Therefore, do these allocations on the stack when targeting GPUs.
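
To make the effect concrete, here is a rough sketch of the privatizer's init region before and after this change, reconstructed from the FileCheck lines in the updated test (the SSA names %heap, %stack, and %priv_alloc are illustrative, not taken from the patch):

  // CPU path: heap temporary; a matching dealloc region frees it later.
  %heap = fir.allocmem i32
  %heap_box = fir.embox %heap : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
  fir.store %heap_box to %priv_alloc : !fir.ref<!fir.box<!fir.heap<i32>>>

  // GPU path: stack temporary; no cleanup/dealloc region is emitted.
  %stack = fir.alloca i32
  %stack_box = fir.embox %stack
  fir.store %stack_box to %priv_alloc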

>From 63eaca27f3ad0bbad3b0ae805caa0a22ccf1e011 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 17 Oct 2025 03:49:40 -0500
Subject: [PATCH] [flang][OpenMP] Allocate `allocatable` init temps on the
 stack for GPUs

Temps needed for the init regions of allocatable reductions and
privatizations are currently always allocated on the heap. However,
this is a performance killer on GPUs, where malloc calls are
prohibitively expensive. Therefore, do these allocations on the stack
when targeting GPUs.
---
 .../Lower/Support/PrivateReductionUtils.cpp   |  35 +++---
 .../target-private-allocatable.f90            | 107 +++++++++++-------
 2 files changed, 88 insertions(+), 54 deletions(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d433ce367d259..c6c428860bca1 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -376,6 +376,8 @@ class PopulateInitAndCleanupRegionsHelper {
     loadedMoldArg = builder.loadIfRef(loc, moldArg);
     return loadedMoldArg;
   }
+
+  bool shouldAllocateTempOnStack() const;
 };
 
 } // namespace
@@ -438,8 +440,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
     builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
   }
 
-  mlir::Value valAlloc = builder.createHeapTemporary(loc, innerTy, /*name=*/{},
-                                                     /*shape=*/{}, lenParams);
+  bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+  mlir::Value valAlloc =
+      (shouldAllocateOnStack)
+          ? builder.createTemporary(loc, innerTy, /*name=*/{},
+                                    /*shape=*/{}, lenParams)
+          : builder.createHeapTemporary(loc, innerTy, /*name=*/{},
+                                        /*shape=*/{}, lenParams);
+
   if (scalarInitValue)
     builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
   mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc,
@@ -451,8 +459,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   fir::StoreOp lastOp =
       fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg);
 
-  createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                      isDoConcurrent);
+  if (!shouldAllocateOnStack)
+    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                        isDoConcurrent);
 
   if (ifUnallocated)
     builder.setInsertionPointAfter(ifUnallocated);
@@ -462,6 +471,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   createYield(allocatedPrivVarArg);
 }
 
+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
+  // On the GPU, always allocate on the stack since heap allocations are very
+  // expensive.
+  auto offloadMod =
+      llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
+  return offloadMod && offloadMod.getIsGPU();
+}
+
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     fir::BaseBoxType boxTy, bool needsInitialization) {
   bool isAllocatableOrPointer =
@@ -504,15 +521,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
   auto temp = [&]() {
-    bool shouldAllocateOnStack = false;
-
-    // On the GPU, always allocate on the stack since heap allocatins are very
-    // expensive.
-    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
-            *builder.getModule()))
-      shouldAllocateOnStack = offloadMod.getIsGPU();
-
-    if (shouldAllocateOnStack)
+    if (shouldAllocateTempOnStack())
       return createStackTempFromMold(loc, builder, source);
 
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
index 3d93fbc6e446e..272f34fc0fd1a 100644
--- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
@@ -1,9 +1,22 @@
 ! Tests delayed privatization for `targets ... private(..)` for allocatables.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
-! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN:   -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
 ! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging -o - %s 2>&1 \
-! RUN:   | FileCheck %s
+! RUN:   | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN:   %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir  \
+! RUN:     -fopenmp -fopenmp-is-target-device \
+! RUN:     -mmlir --enable-delayed-privatization-staging \
+! RUN:     -o - %s 2>&1 | \
+! RUN:   FileCheck %s --check-prefix=GPU  \
+! RUN: %}
+
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging \
+! RUN:    -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=GPU
 
 subroutine target_allocatable
   implicit none
@@ -14,53 +27,65 @@ subroutine target_allocatable
   !$omp end target
 end subroutine target_allocatable
 
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] :
-! CHECK-SAME:      [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
-! CHECK:  ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] :
+! CPU-SAME:      [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
+! CPU:  ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+
+! CPU-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CPU-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
+! CPU-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CPU-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
 
-! CHECK-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
-! CHECK-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
-! CHECK-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
-! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CPU-NEXT:   fir.if %[[ALLOC_COND]] {
+! CPU-NEXT:     %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT:     fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT:   } else {
+! CPU-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
+! CPU-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT:   }
 
-! CHECK-NEXT:   fir.if %[[ALLOC_COND]] {
-! CHECK-NEXT:     %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT:     fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT:   } else {
-! CHECK-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
-! CHECK-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT:   }
+! CPU-NEXT:   omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
 
-! CHECK-NEXT:   omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
 
-! CHECK-NEXT: } dealloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CPU-NEXT:   %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CPU-NEXT:   %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
+! CPU-NEXT:   %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
+! CPU-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT:   %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
 
-! CHECK-NEXT:   %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
-! CHECK-NEXT:   %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
-! CHECK-NEXT:   %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
-! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT:   %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
+! CPU-NEXT:   fir.if %[[PRIV_NULL_COND]] {
+! CPU-NEXT:     fir.freemem %[[PRIV_ADDR]]
+! CPU-NEXT:   }
 
-! CHECK-NEXT:   fir.if %[[PRIV_NULL_COND]] {
-! CHECK-NEXT:     fir.freemem %[[PRIV_ADDR]]
-! CHECK-NEXT:   }
+! CPU-NEXT:   omp.yield
+! CPU-NEXT: }
 
-! CHECK-NEXT:   omp.yield
-! CHECK-NEXT: }
 
+! CPU-LABEL: func.func @_QPtarget_allocatable() {
 
-! CHECK-LABEL: func.func @_QPtarget_allocatable() {
+! CPU:  %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
+! CPU-SAME: {bindc_name = "alloc_var", {{.*}}}
+! CPU:  %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
+! CPU:  %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
+! CPU:  %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
+! CPU:  %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
 
-! CHECK:  %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
-! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}}
-! CHECK:  %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
-! CHECK:  %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
-! CHECK:  %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
-! CHECK:  %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+! CPU:  omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
 
-! CHECK:  omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
-! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
+! GPU-LABEL: omp.private {type = private} {{.*}} init {
+! GPU:         fir.if %{{.*}} {
+! GPU-NEXT:    %[[ZERO_BOX:.*]] = fir.embox %{{.*}}
+! GPU-NEXT:     fir.store %[[ZERO_BOX]] to %{{.*}}
+! GPU-NEXT:   } else {
+! GPU-NOT:      fir.allocmem i32
+! GPU-NEXT:     %[[PRIV_ALLOC:.*]] = fir.alloca i32
+! GPU-NEXT:     %[[PRIV_ALLOC_BOX:.*]] = fir.embox %[[PRIV_ALLOC]]
+! GPU-NEXT:     fir.store %[[PRIV_ALLOC_BOX]] to %{{.*}}
+! GPU-NEXT:   }
+! GPU-NEXT:   omp.yield(%{{.*}})


