[Mlir-commits] [mlir] ccb47d0 - [OpenMP][MLIR] Hoist static `alloca`s emitted by private `init` regions to the allocation IP of the construct (#171597)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Sun Jan 4 21:39:56 PST 2026


Author: Kareem Ergawy
Date: 2026-01-05T06:39:52+01:00
New Revision: ccb47d0fb9d01d44764fa4ca5c6dcf239ab76ed2

URL: https://github.com/llvm/llvm-project/commit/ccb47d0fb9d01d44764fa4ca5c6dcf239ab76ed2
DIFF: https://github.com/llvm/llvm-project/commit/ccb47d0fb9d01d44764fa4ca5c6dcf239ab76ed2.diff

LOG: [OpenMP][MLIR] Hoist static `alloca`s emitted by private `init` regions to the allocation IP of the construct (#171597)

Having more than 1 descriptor (allocatable or array) on the same
`private` clause triggers a runtime crash on GPUs at the moment.

For SPMD kernels, the issue happens because the initialization logic
includes:
* Allocating a number of temporary structs (these are emitted by flang
when `fir` is lowered to `mlir.llvm`).
* There is a conditional branch that determines whether we will allocate
storage for the descriptor and initialize array bounds from the original
descriptor or whether we will initialize the private descriptor to null.

Because of these 2 things, temp allocations needed for descriptors
beyond the 1st one are preceded by branching, which causes the observed
runtime crash.

This PR solves this issue by hoisting these static `alloca`
instructions to the suitable alloca IP of the parent construct.

Added: 
    mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index f5eb6222fd58d..05d8a7dd168a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2348,6 +2348,8 @@ class OpenMPIRBuilder {
     PostOutlineCBTy PostOutlineCB;
     BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
     SmallVector<Value *, 2> ExcludeArgsFromAggregate;
+    // TODO: this should be safe to enable by default
+    bool FixUpNonEntryAllocas = false;
 
     /// Collect all blocks in between EntryBB and ExitBB in both the given
     /// vector and set.

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 716f8582dd7b2..f764b644edc69 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
@@ -763,6 +764,28 @@ static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
   }
 }
 
+static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block) {
+  llvm::SmallVector<llvm::Instruction *> AllocasToMove;
+
+  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
+    // TODO: For now, we support simple static allocations, we might need to
+    // move non-static ones as well. However, this will need further analysis to
+    // move the length arguments as well.
+    return !AllocaInst.isArrayAllocation();
+  };
+
+  for (llvm::Instruction &Inst : Block)
+    if (auto *AllocaInst = llvm::dyn_cast<llvm::AllocaInst>(&Inst))
+      if (ShouldHoistAlloca(*AllocaInst))
+        AllocasToMove.push_back(AllocaInst);
+
+  auto InsertPoint =
+      Block.getParent()->getEntryBlock().getTerminator()->getIterator();
+
+  for (llvm::Instruction *AllocaInst : AllocasToMove)
+    AllocaInst->moveBefore(InsertPoint);
+}
+
 void OpenMPIRBuilder::finalize(Function *Fn) {
   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
   SmallVector<BasicBlock *, 32> Blocks;
@@ -867,6 +890,13 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
     // Run a user callback, e.g. to add attributes.
     if (OI.PostOutlineCB)
       OI.PostOutlineCB(*OutlinedFn);
+
+    if (OI.FixUpNonEntryAllocas) {
+      PostDominatorTree PostDomTree(*OutlinedFn);
+      for (llvm::BasicBlock &BB : *OutlinedFn)
+        if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock()))
+          hoistNonEntryAllocasToEntryBlock(BB);
+    }
   }
 
   // Remove work items that have been completed.
@@ -1694,6 +1724,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
                              IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                              ThreadID, ToBeDeletedVec);
     };
+    OI.FixUpNonEntryAllocas = true;
   } else {
     // Generate OpenMP host runtime call
     OI.PostOutlineCB = [=, ToBeDeletedVec =
@@ -1701,6 +1732,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
       hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                            PrivTID, PrivTIDAddr, ToBeDeletedVec);
     };
+    // TODO: fix-up allocations on the host as well?
   }
 
   OI.OuterAllocaBB = OuterAllocaBlock;

diff  --git a/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
new file mode 100644
index 0000000000000..71c4b9cdede59
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
@@ -0,0 +1,91 @@
+// Tests that static alloca's in `omp.private ... init` regions are hoisted to
+// the parent construct's alloca IP.
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+llvm.func @foo1()
+llvm.func @foo2()
+llvm.func @foo3()
+llvm.func @foo4()
+
+omp.private {type = private} @multi_block.privatizer : f32 init {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %alloca1 = llvm.alloca %0 x !llvm.struct<(i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %c2 = llvm.mlir.constant(2 : i32) : i32
+  %cond1 = llvm.icmp "eq" %c1, %c2 : i32
+  llvm.cond_br %cond1, ^bb1, ^bb2
+
+^bb1:
+  llvm.call @foo1() : () -> ()
+  llvm.br ^bb3
+
+^bb2:
+  llvm.call @foo2() : () -> ()
+  llvm.br ^bb3
+
+^bb3:
+  llvm.store %1, %arg1 : f32, !llvm.ptr
+
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+omp.private {type = private} @multi_block.privatizer2 : f32 init {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %alloca1 = llvm.alloca %0 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %c2 = llvm.mlir.constant(2 : i32) : i32
+  %cond1 = llvm.icmp "eq" %c1, %c2 : i32
+  llvm.cond_br %cond1, ^bb1, ^bb2
+
+^bb1:
+  llvm.call @foo3() : () -> ()
+  llvm.br ^bb3
+
+^bb2:
+  llvm.call @foo4() : () -> ()
+  llvm.br ^bb3
+
+^bb3:
+  llvm.store %1, %arg1 : f32, !llvm.ptr
+
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  %arg0_map = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.ptr)
+        map_clauses(is_device_ptr) capture(ByRef) -> !llvm.ptr {name = ""}
+  %arg1_map = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.ptr)
+        map_clauses(is_device_ptr) capture(ByRef) -> !llvm.ptr {name = ""}
+
+  omp.target map_entries(%arg0_map -> %arg0_arg, %arg1_map -> %arg1_arg : !llvm.ptr, !llvm.ptr) {
+  omp.parallel private(@multi_block.privatizer %arg0_arg -> %arg2,
+                       @multi_block.privatizer2 %arg1_arg -> %arg3 : !llvm.ptr, !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    %1 = llvm.load %arg3 : !llvm.ptr -> f32
+    omp.terminator
+  }
+    omp.terminator
+  }
+  llvm.return
+}
+}
+
+// CHECK: call void @__kmpc_parallel_60(ptr addrspacecast (ptr addrspace(1) @3 to ptr), i32 %omp_global_thread_num, i32 1, i32 -1, i32 -1, ptr @[[OUTLINED_FN:[^[:space:]]+]], {{.*}})
+
+// CHECK: define internal void @[[OUTLINED_FN]]({{.*}}) {{.*}} {
+// CHECK: omp.par.entry:
+// Verify that both allocas were hoisted to the parallel region's entry block.
+// CHECK:        %{{.*}} = alloca { i64 }, align 8
+// CHECK-NEXT:   %{{.*}} = alloca { ptr }, align 8
+// CHECK-NEXT:   br label %omp.region.after_alloca1
+// CHECK: omp.region.after_alloca1:
+// CHECK: }


        


More information about the Mlir-commits mailing list