[llvm-branch-commits] [flang] 8c9e0c6 - [flang][OpenMP] Allocate `reduction` init temps on the stack for GPUs (#146667)

Sun Jul 6 19:11:02 PDT 2025

Author: Kareem Ergawy
Date: 2025-07-04T06:29:34+02:00
New Revision: 8c9e0c6c61f653928a992422d534e4e7f976dd55

URL: https://github.com/llvm/llvm-project/commit/8c9e0c6c61f653928a992422d534e4e7f976dd55
DIFF: https://github.com/llvm/llvm-project/commit/8c9e0c6c61f653928a992422d534e4e7f976dd55.diff

LOG: [flang][OpenMP] Allocate `reduction` init temps on the stack for GPUs (#146667)

Temps needed for the reduction init regions are now allocate on the heap
all the time. However, this is performance killer for GPUs since malloc
calls are prohibitively expensive. Therefore, we should do these
allocations on the stack for GPU reductions.

Added: 
    

Modified: 
    flang/lib/Lower/Support/PrivateReductionUtils.cpp
    flang/test/Lower/OpenMP/parallel-reduction-array.f90
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Removed: 
    


################################################################################
diff  --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index e878041d37c03..c3a5b6101ce00 100644

--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -502,22 +502,37 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
 
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
-  auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
-  // if needsDealloc isn't statically false, add cleanup region. Always
-  // do this for allocatable boxes because they might have been re-allocated
-  // in the body of the loop/parallel region
-
-  std::optional<int64_t> cstNeedsDealloc = fir::getIntIfConstant(needsDealloc);
-  assert(cstNeedsDealloc.has_value() &&
-         "createTempFromMold decides this statically");
-  if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
-    mlir::OpBuilder::InsertionGuard guard(builder);
-    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                        isDoConcurrent);
-  } else {
-    assert(!isAllocatableOrPointer &&
-           "Pointer-like arrays must be heap allocated");
-  }
+  auto temp = [&]() {
+    bool shouldAllocateOnStack = false;
+
+    // On the GPU, always allocate on the stack since heap allocatins are very
+    // expensive.
+    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
+            *builder.getModule()))
+      shouldAllocateOnStack = offloadMod.getIsGPU();
+
+    if (shouldAllocateOnStack)
+      return createStackTempFromMold(loc, builder, source);
+
+    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
+    // if needsDealloc isn't statically false, add cleanup region. Always
+    // do this for allocatable boxes because they might have been re-allocated
+    // in the body of the loop/parallel region
+
+    std::optional<int64_t> cstNeedsDealloc =
+        fir::getIntIfConstant(needsDealloc);
+    assert(cstNeedsDealloc.has_value() &&
+           "createTempFromMold decides this statically");
+    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                          isDoConcurrent);
+    } else {
+      assert(!isAllocatableOrPointer &&
+             "Pointer-like arrays must be heap allocated");
+    }
+    return temp;
+  }();
 
   // Put the temporary inside of a box:
   // hlfir::genVariableBox doesn't handle non-default lower bounds

diff  --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
index 8e3de498f59c1..4f889d9a4e77f 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -1,5 +1,8 @@
-! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp -fopenmp-is-target-device -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
 
 program reduce
 integer, dimension(3) :: i = 0
@@ -13,81 +16,88 @@ program reduce
 print *,i
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
-! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
-! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK-LABEL:   } init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[VAL_4:.*]] = arith.constant 3 : index
-! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
-! CHECK:           %[[TRUE:.*]]  = arith.constant true
+! CPU-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CPU:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CPU:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU-LABEL:   } init {
+! CPU:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CPU:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[VAL_4:.*]] = arith.constant 3 : index
+! CPU:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CPU:           %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
+! CPU:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
+! CPU:           %[[TRUE:.*]]  = arith.constant true
 !fir.shape<1>) -> (!fir.heap<!fir.array<3xi32>>, !fir.heap<!fir.array<3xi32>>)
-! CHECK:           %[[C0:.*]] = arith.constant 0 : index
-! CHECK:           %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
-! CHECK:           %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
-! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
-! CHECK:           fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK:         } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[C1:.*]] = arith.constant 1 : index
-! CHECK:           %[[C3:.*]] = arith.constant 3 : index
-! CHECK:           %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[C1_0:.*]] = arith.constant 1 : index
-! CHECK:           fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
-! CHECK:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
-! CHECK:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
-! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
-! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
-! CHECK:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK:           }
-! CHECK:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK:         }  cleanup {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
-! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
-! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
-! CHECK:           %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
-! CHECK:           fir.if %[[VAL_5]] {
-! CHECK:             %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
-! CHECK:             fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
-! CHECK:           }
-! CHECK:           omp.yield
-! CHECK:         }
+! CPU:           %[[C0:.*]] = arith.constant 0 : index
+! CPU:           %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
+! CPU:           %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
+! CPU:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
+! CPU:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
+! CPU:           fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU:         } combiner {
+! CPU:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[C1:.*]] = arith.constant 1 : index
+! CPU:           %[[C3:.*]] = arith.constant 3 : index
+! CPU:           %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
+! CPU:           %[[C1_0:.*]] = arith.constant 1 : index
+! CPU:           fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
+! CPU:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CPU:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CPU:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CPU:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CPU:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CPU:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CPU:           }
+! CPU:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU:         }  cleanup {
+! CPU:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
+! CPU:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
+! CPU:           %[[VAL_4:.*]] = arith.constant 0 : i64
+! CPU:           %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
+! CPU:           fir.if %[[VAL_5]] {
+! CPU:             %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
+! CPU:             fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
+! CPU:           }
+! CPU:           omp.yield
+! CPU:         }
+
+! CPU-LABEL:   func.func @_QQmain()
+! CPU:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
+! CPU:           %[[VAL_1:.*]] = arith.constant 3 : index
+! CPU:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CPU:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CPU:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CPU:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CPU:           fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:           omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
+! CPU:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CPU:             %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:             %[[VAL_10:.*]] = arith.constant 1 : index
+! CPU:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU:             hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
+! CPU:             %[[VAL_12:.*]] = arith.constant 2 : i32
+! CPU:             %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:             %[[VAL_14:.*]] = arith.constant 2 : index
+! CPU:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU:             hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
+! CPU:             %[[VAL_16:.*]] = arith.constant 3 : i32
+! CPU:             %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU:             %[[VAL_18:.*]] = arith.constant 3 : index
+! CPU:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU:             hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
+! CPU:             omp.terminator
+! CPU:           }
 
-! CHECK-LABEL:   func.func @_QQmain()
-! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
-! CHECK:           %[[VAL_1:.*]] = arith.constant 3 : index
-! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
-! CHECK:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
-! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:           omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
-! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : index
-! CHECK:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
-! CHECK:             %[[VAL_12:.*]] = arith.constant 2 : i32
-! CHECK:             %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:             %[[VAL_14:.*]] = arith.constant 2 : index
-! CHECK:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
-! CHECK:             %[[VAL_16:.*]] = arith.constant 3 : i32
-! CHECK:             %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK:             %[[VAL_18:.*]] = arith.constant 3 : index
-! CHECK:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
-! CHECK:             omp.terminator
-! CHECK:           }
+! GPU: omp.declare_reduction {{.*}} alloc {
+! GPU: } init {
+! GPU-NOT: fir.allocmem {{.*}} {bindc_name = ".tmp", {{.*}}}
+! GPU:     fir.alloca {{.*}} {bindc_name = ".tmp"}
+! GPU: } combiner {
+! GPU: }

diff  --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 883d179580e0c..ed88c19ab2c25 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1291,6 +1291,11 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
     mapInitializationArgs(op, moduleTranslation, reductionDecls,
                           reductionVariableMap, i);
 
+    // TODO In some cases (specially on the GPU), the init regions may
+    // contains stack alloctaions. If the region is inlined in a loop, this is
+    // problematic. Instead of just inlining the region, handle allocations by
+    // hoisting fixed length allocations to the function entry and using
+    // stacksave and restore for variable length ones.
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
                                        "omp.reduction.neutral", builder,
                                        moduleTranslation, &phis)))