[flang-commits] [flang] 585b6e2 - [flang][OpenMP] Allocate `allocatable` init temps on the stack for GPUs (#164761)

via flang-commits flang-commits at lists.llvm.org
Mon Oct 27 09:56:10 PDT 2025


Author: Kareem Ergawy
Date: 2025-10-27T17:56:05+01:00
New Revision: 585b6e2d449e767d41a813e285a8a8d38fb77ea6

URL: https://github.com/llvm/llvm-project/commit/585b6e2d449e767d41a813e285a8a8d38fb77ea6
DIFF: https://github.com/llvm/llvm-project/commit/585b6e2d449e767d41a813e285a8a8d38fb77ea6.diff

LOG: [flang][OpenMP] Allocate `allocatable` init temps on the stack for GPUs (#164761)

Temps needed for the allocatable reduction/privatization init regions
are currently allocated on the heap unconditionally. However, this is a
performance killer on GPUs, since malloc calls there are prohibitively
expensive. Therefore, we should do these allocations on the stack for
GPU reductions.

This is similar to what we already do for arrays. Additionally, I am
working on getting reductions-by-ref to work on GPUs, which is a bit of
a challenge given the many steps involved (e.g. intra-warp and
inter-warp reductions, shuffling data from remote lanes, ...); this
patch is a prerequisite step for that work.
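
For reference, this is the kind of Fortran pattern the change affects (a
minimal sketch mirroring the modified test below; the body of the target
region is illustrative):

  subroutine target_allocatable
    implicit none
    integer, allocatable :: alloc_var

    !$omp target private(alloc_var)
      alloc_var = 42  ! illustrative body
    !$omp end target
  end subroutine target_allocatable

When compiling for a GPU target device, the privatizer's init region now
materializes the scalar temporary with `fir.alloca` instead of
`fir.allocmem`, and the corresponding heap-cleanup region is no longer
generated for it.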

Added: 
    

Modified: 
    flang/lib/Lower/Support/PrivateReductionUtils.cpp
    flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90

Removed: 
    


################################################################################
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d433ce367d259..c6c428860bca1 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -376,6 +376,8 @@ class PopulateInitAndCleanupRegionsHelper {
     loadedMoldArg = builder.loadIfRef(loc, moldArg);
     return loadedMoldArg;
   }
+
+  bool shouldAllocateTempOnStack() const;
 };
 
 } // namespace
@@ -438,8 +440,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
     builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
   }
 
-  mlir::Value valAlloc = builder.createHeapTemporary(loc, innerTy, /*name=*/{},
-                                                     /*shape=*/{}, lenParams);
+  bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+  mlir::Value valAlloc =
+      (shouldAllocateOnStack)
+          ? builder.createTemporary(loc, innerTy, /*name=*/{},
+                                    /*shape=*/{}, lenParams)
+          : builder.createHeapTemporary(loc, innerTy, /*name=*/{},
+                                        /*shape=*/{}, lenParams);
+
   if (scalarInitValue)
     builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
   mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc,
@@ -451,8 +459,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   fir::StoreOp lastOp =
       fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg);
 
-  createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                      isDoConcurrent);
+  if (!shouldAllocateOnStack)
+    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                        isDoConcurrent);
 
   if (ifUnallocated)
     builder.setInsertionPointAfter(ifUnallocated);
@@ -462,6 +471,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   createYield(allocatedPrivVarArg);
 }
 
+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
+  // On the GPU, always allocate on the stack since heap allocations are very
+  // expensive.
+  auto offloadMod =
+      llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
+  return offloadMod && offloadMod.getIsGPU();
+}
+
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     fir::BaseBoxType boxTy, bool needsInitialization) {
   bool isAllocatableOrPointer =
@@ -504,15 +521,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
   auto temp = [&]() {
-    bool shouldAllocateOnStack = false;
-
-    // On the GPU, always allocate on the stack since heap allocatins are very
-    // expensive.
-    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
-            *builder.getModule()))
-      shouldAllocateOnStack = offloadMod.getIsGPU();
-
-    if (shouldAllocateOnStack)
+    if (shouldAllocateTempOnStack())
       return createStackTempFromMold(loc, builder, source);
 
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);

diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
index 3d93fbc6e446e..272f34fc0fd1a 100644
--- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
@@ -1,9 +1,22 @@
 ! Tests delayed privatization for `targets ... private(..)` for allocatables.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
-! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN:   -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
 ! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging -o - %s 2>&1 \
-! RUN:   | FileCheck %s
+! RUN:   | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN:   %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir  \
+! RUN:     -fopenmp -fopenmp-is-target-device \
+! RUN:     -mmlir --enable-delayed-privatization-staging \
+! RUN:     -o - %s 2>&1 | \
+! RUN:   FileCheck %s --check-prefix=GPU  \
+! RUN: %}
+
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging \
+! RUN:    -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=GPU
 
 subroutine target_allocatable
   implicit none
@@ -14,53 +27,65 @@ subroutine target_allocatable
   !$omp end target
 end subroutine target_allocatable
 
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] :
-! CHECK-SAME:      [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
-! CHECK:  ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME:    @[[VAR_PRIVATIZER_SYM:.*]] :
+! CPU-SAME:      [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
+! CPU:  ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+
+! CPU-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CPU-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
+! CPU-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CPU-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
 
-! CHECK-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
-! CHECK-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
-! CHECK-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
-! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CPU-NEXT:   fir.if %[[ALLOC_COND]] {
+! CPU-NEXT:     %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT:     fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT:   } else {
+! CPU-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
+! CPU-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT:   }
 
-! CHECK-NEXT:   fir.if %[[ALLOC_COND]] {
-! CHECK-NEXT:     %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT:     fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT:   } else {
-! CHECK-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
-! CHECK-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT:   }
+! CPU-NEXT:   omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
 
-! CHECK-NEXT:   omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
 
-! CHECK-NEXT: } dealloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CPU-NEXT:   %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CPU-NEXT:   %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
+! CPU-NEXT:   %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
+! CPU-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT:   %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
 
-! CHECK-NEXT:   %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
-! CHECK-NEXT:   %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
-! CHECK-NEXT:   %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
-! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT:   %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
+! CPU-NEXT:   fir.if %[[PRIV_NULL_COND]] {
+! CPU-NEXT:     fir.freemem %[[PRIV_ADDR]]
+! CPU-NEXT:   }
 
-! CHECK-NEXT:   fir.if %[[PRIV_NULL_COND]] {
-! CHECK-NEXT:     fir.freemem %[[PRIV_ADDR]]
-! CHECK-NEXT:   }
+! CPU-NEXT:   omp.yield
+! CPU-NEXT: }
 
-! CHECK-NEXT:   omp.yield
-! CHECK-NEXT: }
 
+! CPU-LABEL: func.func @_QPtarget_allocatable() {
 
-! CHECK-LABEL: func.func @_QPtarget_allocatable() {
+! CPU:  %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
+! CPU-SAME: {bindc_name = "alloc_var", {{.*}}}
+! CPU:  %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
+! CPU:  %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
+! CPU:  %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
+! CPU:  %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
 
-! CHECK:  %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
-! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}}
-! CHECK:  %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
-! CHECK:  %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
-! CHECK:  %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
-! CHECK:  %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+! CPU:  omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
 
-! CHECK:  omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
-! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
+! GPU-LABEL: omp.private {type = private} {{.*}} init {
+! GPU:         fir.if %{{.*}} {
+! GPU-NEXT:    %[[ZERO_BOX:.*]] = fir.embox %{{.*}}
+! GPU-NEXT:     fir.store %[[ZERO_BOX]] to %{{.*}}
+! GPU-NEXT:   } else {
+! GPU-NOT:      fir.allocmem i32
+! GPU-NEXT:     %[[PRIV_ALLOC:.*]] = fir.alloca i32
+! GPU-NEXT:     %[[PRIV_ALLOC_BOX:.*]] = fir.embox %[[PRIV_ALLOC]]
+! GPU-NEXT:     fir.store %[[PRIV_ALLOC_BOX]] to %{{.*}}
+! GPU-NEXT:   }
+! GPU-NEXT:   omp.yield(%{{.*}})


        

