[llvm] [mlir] [OMPIRBuilder] Hoist alloca's to entry blocks of compiler-emitted GPU reduction functions (PR #181359)

Kareem Ergawy via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 16 21:57:54 PST 2026


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/181359

>From da466e5dc1fcae7bbe069b62002b62274a89a5c4 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 13 Feb 2026 03:59:47 -0600
Subject: [PATCH 1/2] [OMPIRBuilder] Hoist alloca's to entry blocks of
 compiler-emitted GPU reduction functions

Fixes a bug in GPU reductions compiled with `-O0`. The following
example triggered invalid memory accesses at runtime:

```fortran
program test_array_reduction()
  integer :: red_array(1)
  integer :: i

  red_array = 0

  !$omp target teams distribute parallel do reduction(+:red_array)
  do i = 1, 100
    red_array(1) = red_array(1) + 4422
  end do
  !$omp end target teams distribute parallel do

  print *, red_array
end program test_array_reduction
```

The issue was caused by allocas for some temporary values in the combiner
region of the reduction op being inlined into non-entry blocks of the
GPU reduction functions emitted by the compiler.

This PR fixes the issue by hoisting all alloca's to the entry block
after the reduction functions are completely emitted by the compiler.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp       |  6 ++++++
 .../LLVMIR/allocatable_gpu_reduction.mlir       | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 464ec5b5a2ece..7d446983a3882 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4234,6 +4234,12 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
     }
 
   Builder.CreateRetVoid();
+
+  PostDominatorTree PostDomTree(*ReductionFunc);
+  for (llvm::BasicBlock &BB : *ReductionFunc)
+    if (PostDomTree.properlyDominates(&BB, &ReductionFunc->getEntryBlock()))
+      hoistNonEntryAllocasToEntryBlock(BB);
+
   return ReductionFunc;
 }
 
diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
index 95d12f304aca0..c9ff6de8cc951 100644
--- a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
+++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
@@ -24,6 +24,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
     "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
     %7 = llvm.mlir.constant(24 : i32) : i32
     "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    llvm.br ^bb1
+  ^bb1:
     %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
     %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr
     %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
@@ -63,6 +65,21 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
   }
 }
 
+// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %[[ARG_0:.*]], ptr noundef %[[ARG_1:.*]]) {{.*}} {
+// CHECK: entry:
+// CHECK:   %[[TEMP_1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }
+// CHECK:   %[[TEMP_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }
+// CHECK:   br label %[[RED_BODY:omp.reduction.nonatomic.body]]
+// Verify that no allocas are emitted beyond the entry block.
+// CHECK-NOT: alloca
+
+// CHECK: [[RED_BODY]]:
+// CHECK:   %[[TEMP_1_ACAST:.*]] = addrspacecast ptr addrspace(5) %[[TEMP_1]] to ptr
+// CHECK:   %[[TEMP_2_ACAST:.*]] = addrspacecast ptr addrspace(5) %[[TEMP_2]] to ptr
+// CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr %[[TEMP_2_ACAST]], ptr %{{.*}}, i32 24, i1 false)
+// CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr %[[TEMP_1_ACAST]], ptr %{{.*}}, i32 24, i1 false)
+// CHECK: }
+
 // CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} {
 // CHECK:   %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5)
 // CHECK:   %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5)

>From 8c765ba3c16e8433f773e4b6fd3177dd0e11aa88 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 16 Feb 2026 23:57:21 -0600
Subject: [PATCH 2/2] add comment and refactor hoisting utils

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 26 +++++++++++++----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7d446983a3882..1947323ef85fd 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -787,6 +787,13 @@ static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block) {
     AllocaInst->moveBefore(InsertPoint);
 }
 
+static void hoistNonEntryAllocasToEntryBlock(llvm::Function *Func) {
+  PostDominatorTree PostDomTree(*Func);
+  for (llvm::BasicBlock &BB : *Func)
+    if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
+      hoistNonEntryAllocasToEntryBlock(BB);
+}
+
 void OpenMPIRBuilder::finalize(Function *Fn) {
   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
   SmallVector<BasicBlock *, 32> Blocks;
@@ -893,12 +900,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
     if (OI.PostOutlineCB)
       OI.PostOutlineCB(*OutlinedFn);
 
-    if (OI.FixUpNonEntryAllocas) {
-      PostDominatorTree PostDomTree(*OutlinedFn);
-      for (llvm::BasicBlock &BB : *OutlinedFn)
-        if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock()))
-          hoistNonEntryAllocasToEntryBlock(BB);
-    }
+    if (OI.FixUpNonEntryAllocas)
+      hoistNonEntryAllocasToEntryBlock(OutlinedFn);
   }
 
   // Remove work items that have been completed.
@@ -4234,11 +4237,12 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
     }
 
   Builder.CreateRetVoid();
-
-  PostDominatorTree PostDomTree(*ReductionFunc);
-  for (llvm::BasicBlock &BB : *ReductionFunc)
-    if (PostDomTree.properlyDominates(&BB, &ReductionFunc->getEntryBlock()))
-      hoistNonEntryAllocasToEntryBlock(BB);
+  // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
+  // to the entry block (this is done for higher opt levels by later passes in
+  // the pipeline). This has caused issues because non-entry `alloca`s force the
+  // function to use dynamic stack allocations and we might run out of scratch
+  // memory.
+  hoistNonEntryAllocasToEntryBlock(ReductionFunc);
 
   return ReductionFunc;
 }



More information about the llvm-commits mailing list