[Mlir-commits] [llvm] [mlir] [OpenMP][MLIR] Hoist static `alloca`s emitted by private `init` regions to the allocation IP of the construct (PR #171597)

Kareem Ergawy llvmlistbot at llvm.org
Sat Jan 3 22:42:26 PST 2026


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/171597

>From a00f4e18c4830c442f3ed9d4bd109ab62cff6c5d Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 9 Dec 2025 09:11:05 -0600
Subject: [PATCH 1/4] [OpenMP][MLIR] Hoist static `alloca`s emitted by private
 `init` regions to the allocation IP of the construct

Having more than 1 descriptor (allocatable or array) on the same `private` clause triggers a runtime crash on GPUs at the moment.

For SPMD kernels, the issue happens because the initialization logic includes:
* Allocating a number of temporary structs (these are emitted by flang when `fir` is lowered to `mlir.llvm`).
* There is a conditional branch that determines whether we will allocate storage for the descriptor and initialize array bounds from the original descriptor or whether we will initialize the private descriptor to null.

Because of these 2 things, temp allocations needed for descriptors beyond the 1st one are preceded by branching, which causes the observed runtime crash.

This PR solves this issue by hoisting these static `alloca` instructions to the suitable alloca IP of the parent construct.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 75 ++++++++++++++++--
 .../openmp-private-allloca-hoisting.mlir      | 79 +++++++++++++++++++
 2 files changed, 146 insertions(+), 8 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index c37af8d7b1673..29a0a98c26acf 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1626,10 +1626,64 @@ static llvm::Expected<llvm::Value *> initPrivateVar(
   return phis[0];
 }
 
+/// Beginning with \p startBlock, this function visits all reachable successor
+/// blocks. For each such block, static alloca instructions (i.e. non-array
+/// allocas) are collected. Then, these collected alloca instructions are moved
+/// to the \p allocaIP insertion point.
+///
+/// This is useful in cases where, for example, more than one allocatable or
+/// array are privatized. In such cases, we allocate a number of temporary
+/// descriptors to handle the initialization logic. Additonally, for each
+/// private value, there is branching logic based on the value of the origianl
+/// private variable's allocation state. Therefore, we end up with descriptor
+/// alloca instructions preceded by conditional branches which casues runtime
+/// issues at least on the GPU.
+static void hoistStaticAllocasToAllocaIP(
+    llvm::BasicBlock *startBlock,
+    const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP) {
+  llvm::SmallVector<llvm::BasicBlock *> inlinedBlocks{startBlock};
+  llvm::SmallPtrSet<llvm::BasicBlock *, 4> seenBlocks;
+  llvm::SmallVector<llvm::Instruction *> staticAllocas;
+
+  while (!inlinedBlocks.empty()) {
+    llvm::BasicBlock *curBlock = inlinedBlocks.front();
+    inlinedBlocks.erase(inlinedBlocks.begin());
+    llvm::Instruction *terminator = curBlock->getTerminator();
+
+    for (llvm::Instruction &inst : *curBlock) {
+      if (auto *allocaInst = mlir::dyn_cast<llvm::AllocaInst>(&inst)) {
+        if (!allocaInst->isArrayAllocation()) {
+#ifdef EXPENSIVE_CHECKS
+          assert(llvm::count(staticInitAllocas, allocaInst) == 0);
+#endif
+          staticAllocas.push_back(allocaInst);
+        }
+      }
+    }
+
+    if (!terminator || !terminator->isTerminator() ||
+        terminator->getNumSuccessors() == 0)
+      continue;
+
+    for (unsigned i = 0; i < terminator->getNumSuccessors(); ++i) {
+      llvm::BasicBlock *successor = terminator->getSuccessor(i);
+
+      if (!seenBlocks.contains(successor)) {
+        inlinedBlocks.push_back(successor);
+        seenBlocks.insert(successor);
+      }
+    }
+  }
+
+  for (llvm::Instruction *staticAlloca : staticAllocas)
+    staticAlloca->moveBefore(allocaIP.getPoint());
+}
+
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
                 LLVM::ModuleTranslation &moduleTranslation,
                 PrivateVarsInfo &privateVarsInfo,
+                const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
                 llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
   if (privateVarsInfo.blockArgs.empty())
     return llvm::Error::success();
@@ -1654,6 +1708,8 @@ initPrivateVars(llvm::IRBuilderBase &builder,
     setInsertPointForPossiblyEmptyBlock(builder);
   }
 
+  hoistStaticAllocasToAllocaIP(privInitBlock, allocaIP);
+
   return llvm::Error::success();
 }
 
@@ -2605,7 +2661,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                                 deferredStores, isByRef)))
     return failure();
 
-  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo),
+  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
+                                  allocaIP),
                   opInst)
           .failed())
     return failure();
@@ -2795,9 +2852,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     assert(afterAllocas.get()->getSinglePredecessor());
     builder.restoreIP(codeGenIP);
 
-    if (handleError(
-            initPrivateVars(builder, moduleTranslation, privateVarsInfo),
-            *opInst)
+    if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
+                                    allocaIP),
+                    *opInst)
             .failed())
       return llvm::make_error<PreviouslyReportedError>();
 
@@ -3014,7 +3071,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
                                 deferredStores, isByRef)))
     return failure();
 
-  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo),
+  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
+                                  allocaIP),
                   opInst)
           .failed())
     return failure();
@@ -5336,8 +5394,9 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
     if (handleError(afterAllocas, opInst).failed())
       return llvm::make_error<PreviouslyReportedError>();
 
-    if (handleError(initPrivateVars(builder, moduleTranslation, privVarsInfo),
-                    opInst)
+    if (handleError(
+            initPrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP),
+            opInst)
             .failed())
       return llvm::make_error<PreviouslyReportedError>();
 
@@ -6138,7 +6197,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
 
     builder.restoreIP(codeGenIP);
     if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
-                                    &mappedPrivateVars),
+                                    allocaIP, &mappedPrivateVars),
                     *targetOp)
             .failed())
       return llvm::make_error<PreviouslyReportedError>();
diff --git a/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
new file mode 100644
index 0000000000000..5371f2814085e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
@@ -0,0 +1,79 @@
+// Tests that static alloca's in `omp.private ... init` regions are hoisted to
+// the parent construct's alloca IP.
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @foo1()
+llvm.func @foo2()
+llvm.func @foo3()
+llvm.func @foo4()
+
+omp.private {type = private} @multi_block.privatizer : f32 init {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %alloca1 = llvm.alloca %0 x !llvm.struct<(i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %c2 = llvm.mlir.constant(2 : i32) : i32
+  %cond1 = llvm.icmp "eq" %c1, %c2 : i32
+  llvm.cond_br %cond1, ^bb1, ^bb2
+
+^bb1:
+  llvm.call @foo1() : () -> ()
+  llvm.br ^bb3
+
+^bb2:
+  llvm.call @foo2() : () -> ()
+  llvm.br ^bb3
+
+^bb3:
+  llvm.store %1, %arg1 : f32, !llvm.ptr
+
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+omp.private {type = private} @multi_block.privatizer2 : f32 init {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  %alloca1 = llvm.alloca %0 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %c2 = llvm.mlir.constant(2 : i32) : i32
+  %cond1 = llvm.icmp "eq" %c1, %c2 : i32
+  llvm.cond_br %cond1, ^bb1, ^bb2
+
+^bb1:
+  llvm.call @foo3() : () -> ()
+  llvm.br ^bb3
+
+^bb2:
+  llvm.call @foo4() : () -> ()
+  llvm.br ^bb3
+
+^bb3:
+  llvm.store %1, %arg1 : f32, !llvm.ptr
+
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  omp.parallel private(@multi_block.privatizer %arg0 -> %arg2,
+                       @multi_block.privatizer2 %arg1 -> %arg3 : !llvm.ptr, !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    %1 = llvm.load %arg3 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK: define internal void @parallel_op_private_multi_block..omp_par({{.*}}) {{.*}} {
+// CHECK: omp.par.entry:
+// Varify that both allocas were hoisted to the parallel region's entry block.
+// CHECK:        %{{.*}} = alloca { i64 }, align 8
+// CHECK-NEXT:   %{{.*}} = alloca { ptr }, align 8
+// CHECK-NEXT:   br label %omp.region.after_alloca
+// CHECK: omp.region.after_alloca:
+// CHECK: }

>From 627ba6a3af1d25a5d40fa68a3ee7bb5c428f8b73 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Thu, 11 Dec 2025 02:43:35 -0600
Subject: [PATCH 2/4] review comments, Tom

---
 .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp      | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 29a0a98c26acf..9342bafc23332 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1665,14 +1665,11 @@ static void hoistStaticAllocasToAllocaIP(
         terminator->getNumSuccessors() == 0)
       continue;
 
-    for (unsigned i = 0; i < terminator->getNumSuccessors(); ++i) {
-      llvm::BasicBlock *successor = terminator->getSuccessor(i);
-
+    for (llvm::BasicBlock *successor : llvm::successors(terminator))
       if (!seenBlocks.contains(successor)) {
         inlinedBlocks.push_back(successor);
         seenBlocks.insert(successor);
       }
-    }
   }
 
   for (llvm::Instruction *staticAlloca : staticAllocas)

>From 8aec01ad4a28f9c971986506c40934861df181a6 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 22 Dec 2025 03:45:34 -0600
Subject: [PATCH 3/4] use post-dominance info

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  1 +
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 34 +++++++++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 72 +++----------------
 .../openmp-private-allloca-hoisting.mlir      | 26 +++++--
 4 files changed, 62 insertions(+), 71 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index f5eb6222fd58d..59f66f4cf3793 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2348,6 +2348,7 @@ class OpenMPIRBuilder {
     PostOutlineCBTy PostOutlineCB;
     BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
     SmallVector<Value *, 2> ExcludeArgsFromAggregate;
+    bool FixUpNonEntryAllocas = false;
 
     /// Collect all blocks in between EntryBB and ExitBB in both the given
     /// vector and set.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 716f8582dd7b2..19b24060c0f92 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
@@ -763,6 +764,28 @@ static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
   }
 }
 
+static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block) {
+  llvm::SmallVector<llvm::Instruction *> AllocasToMove;
+
+  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
+    // TODO: For now, we support simple static allocations, we might need to
+    // move non-static ones as well. However, this will need further analysis to
+    // move the length arguments as well.
+    return !AllocaInst.isArrayAllocation();
+  };
+
+  for (llvm::Instruction &Inst : Block)
+    if (auto *AllocaInst = llvm::dyn_cast<llvm::AllocaInst>(&Inst))
+      if (ShouldHoistAlloca(*AllocaInst))
+        AllocasToMove.push_back(AllocaInst);
+
+  auto InsertPoint =
+      Block.getParent()->getEntryBlock().getTerminator()->getIterator();
+
+  for (llvm::Instruction *AllocaInst : AllocasToMove)
+    AllocaInst->moveBefore(InsertPoint);
+}
+
 void OpenMPIRBuilder::finalize(Function *Fn) {
   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
   SmallVector<BasicBlock *, 32> Blocks;
@@ -867,6 +890,13 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
     // Run a user callback, e.g. to add attributes.
     if (OI.PostOutlineCB)
       OI.PostOutlineCB(*OutlinedFn);
+
+    if (OI.FixUpNonEntryAllocas) {
+      PostDominatorTree PostDomTree(*OutlinedFn);
+      for (llvm::BasicBlock &BB : *OutlinedFn)
+        if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock()))
+          hoistNonEntryAllocasToEntryBlock(BB);
+    }
   }
 
   // Remove work items that have been completed.
@@ -1693,7 +1723,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
       targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                              IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                              ThreadID, ToBeDeletedVec);
+
+
     };
+    OI.FixUpNonEntryAllocas = true;
   } else {
     // Generate OpenMP host runtime call
     OI.PostOutlineCB = [=, ToBeDeletedVec =
@@ -1701,6 +1734,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
       hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                            PrivTID, PrivTIDAddr, ToBeDeletedVec);
     };
+    // TODO: fix-up allocations on the host as well?
   }
 
   OI.OuterAllocaBB = OuterAllocaBlock;
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 9342bafc23332..c37af8d7b1673 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1626,61 +1626,10 @@ static llvm::Expected<llvm::Value *> initPrivateVar(
   return phis[0];
 }
 
-/// Beginning with \p startBlock, this function visits all reachable successor
-/// blocks. For each such block, static alloca instructions (i.e. non-array
-/// allocas) are collected. Then, these collected alloca instructions are moved
-/// to the \p allocaIP insertion point.
-///
-/// This is useful in cases where, for example, more than one allocatable or
-/// array are privatized. In such cases, we allocate a number of temporary
-/// descriptors to handle the initialization logic. Additonally, for each
-/// private value, there is branching logic based on the value of the origianl
-/// private variable's allocation state. Therefore, we end up with descriptor
-/// alloca instructions preceded by conditional branches which casues runtime
-/// issues at least on the GPU.
-static void hoistStaticAllocasToAllocaIP(
-    llvm::BasicBlock *startBlock,
-    const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP) {
-  llvm::SmallVector<llvm::BasicBlock *> inlinedBlocks{startBlock};
-  llvm::SmallPtrSet<llvm::BasicBlock *, 4> seenBlocks;
-  llvm::SmallVector<llvm::Instruction *> staticAllocas;
-
-  while (!inlinedBlocks.empty()) {
-    llvm::BasicBlock *curBlock = inlinedBlocks.front();
-    inlinedBlocks.erase(inlinedBlocks.begin());
-    llvm::Instruction *terminator = curBlock->getTerminator();
-
-    for (llvm::Instruction &inst : *curBlock) {
-      if (auto *allocaInst = mlir::dyn_cast<llvm::AllocaInst>(&inst)) {
-        if (!allocaInst->isArrayAllocation()) {
-#ifdef EXPENSIVE_CHECKS
-          assert(llvm::count(staticInitAllocas, allocaInst) == 0);
-#endif
-          staticAllocas.push_back(allocaInst);
-        }
-      }
-    }
-
-    if (!terminator || !terminator->isTerminator() ||
-        terminator->getNumSuccessors() == 0)
-      continue;
-
-    for (llvm::BasicBlock *successor : llvm::successors(terminator))
-      if (!seenBlocks.contains(successor)) {
-        inlinedBlocks.push_back(successor);
-        seenBlocks.insert(successor);
-      }
-  }
-
-  for (llvm::Instruction *staticAlloca : staticAllocas)
-    staticAlloca->moveBefore(allocaIP.getPoint());
-}
-
 static llvm::Error
 initPrivateVars(llvm::IRBuilderBase &builder,
                 LLVM::ModuleTranslation &moduleTranslation,
                 PrivateVarsInfo &privateVarsInfo,
-                const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
                 llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
   if (privateVarsInfo.blockArgs.empty())
     return llvm::Error::success();
@@ -1705,8 +1654,6 @@ initPrivateVars(llvm::IRBuilderBase &builder,
     setInsertPointForPossiblyEmptyBlock(builder);
   }
 
-  hoistStaticAllocasToAllocaIP(privInitBlock, allocaIP);
-
   return llvm::Error::success();
 }
 
@@ -2658,8 +2605,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                                 deferredStores, isByRef)))
     return failure();
 
-  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
-                                  allocaIP),
+  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo),
                   opInst)
           .failed())
     return failure();
@@ -2849,9 +2795,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     assert(afterAllocas.get()->getSinglePredecessor());
     builder.restoreIP(codeGenIP);
 
-    if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
-                                    allocaIP),
-                    *opInst)
+    if (handleError(
+            initPrivateVars(builder, moduleTranslation, privateVarsInfo),
+            *opInst)
             .failed())
       return llvm::make_error<PreviouslyReportedError>();
 
@@ -3068,8 +3014,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
                                 deferredStores, isByRef)))
     return failure();
 
-  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
-                                  allocaIP),
+  if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo),
                   opInst)
           .failed())
     return failure();
@@ -5391,9 +5336,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
     if (handleError(afterAllocas, opInst).failed())
       return llvm::make_error<PreviouslyReportedError>();
 
-    if (handleError(
-            initPrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP),
-            opInst)
+    if (handleError(initPrivateVars(builder, moduleTranslation, privVarsInfo),
+                    opInst)
             .failed())
       return llvm::make_error<PreviouslyReportedError>();
 
@@ -6194,7 +6138,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
 
     builder.restoreIP(codeGenIP);
     if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo,
-                                    allocaIP, &mappedPrivateVars),
+                                    &mappedPrivateVars),
                     *targetOp)
             .failed())
       return llvm::make_error<PreviouslyReportedError>();
diff --git a/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
index 5371f2814085e..71c4b9cdede59 100644
--- a/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
@@ -2,6 +2,7 @@
 // the parent construct's alloca IP.
 // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
 
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
 llvm.func @foo1()
 llvm.func @foo2()
 llvm.func @foo3()
@@ -10,7 +11,7 @@ llvm.func @foo4()
 omp.private {type = private} @multi_block.privatizer : f32 init {
 ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
   %0 = llvm.mlir.constant(1 : i32) : i32
-  %alloca1 = llvm.alloca %0 x !llvm.struct<(i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %alloca1 = llvm.alloca %0 x !llvm.struct<(i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
 
   %1 = llvm.load %arg0 : !llvm.ptr -> f32
 
@@ -36,7 +37,7 @@ omp.private {type = private} @multi_block.privatizer : f32 init {
 omp.private {type = private} @multi_block.privatizer2 : f32 init {
 ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
   %0 = llvm.mlir.constant(1 : i32) : i32
-  %alloca1 = llvm.alloca %0 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %alloca1 = llvm.alloca %0 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
 
   %1 = llvm.load %arg0 : !llvm.ptr -> f32
 
@@ -60,20 +61,31 @@ omp.private {type = private} @multi_block.privatizer2 : f32 init {
 }
 
 llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
-  omp.parallel private(@multi_block.privatizer %arg0 -> %arg2,
-                       @multi_block.privatizer2 %arg1 -> %arg3 : !llvm.ptr, !llvm.ptr) {
+  %arg0_map = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.ptr)
+        map_clauses(is_device_ptr) capture(ByRef) -> !llvm.ptr {name = ""}
+  %arg1_map = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.ptr)
+        map_clauses(is_device_ptr) capture(ByRef) -> !llvm.ptr {name = ""}
+
+  omp.target map_entries(%arg0_map -> %arg0_arg, %arg1_map -> %arg1_arg : !llvm.ptr, !llvm.ptr) {
+  omp.parallel private(@multi_block.privatizer %arg0_arg -> %arg2,
+                       @multi_block.privatizer2 %arg1_arg -> %arg3 : !llvm.ptr, !llvm.ptr) {
     %0 = llvm.load %arg2 : !llvm.ptr -> f32
     %1 = llvm.load %arg3 : !llvm.ptr -> f32
     omp.terminator
+  }
+    omp.terminator
   }
   llvm.return
 }
+}
+
+// CHECK: call void @__kmpc_parallel_60(ptr addrspacecast (ptr addrspace(1) @3 to ptr), i32 %omp_global_thread_num, i32 1, i32 -1, i32 -1, ptr @[[OUTLINED_FN:[^[:space:]]+]], {{.*}})
 
-// CHECK: define internal void @parallel_op_private_multi_block..omp_par({{.*}}) {{.*}} {
+// CHECK: define internal void @[[OUTLINED_FN]]({{.*}}) {{.*}} {
 // CHECK: omp.par.entry:
 // Varify that both allocas were hoisted to the parallel region's entry block.
 // CHECK:        %{{.*}} = alloca { i64 }, align 8
 // CHECK-NEXT:   %{{.*}} = alloca { ptr }, align 8
-// CHECK-NEXT:   br label %omp.region.after_alloca
-// CHECK: omp.region.after_alloca:
+// CHECK-NEXT:   br label %omp.region.after_alloca1
+// CHECK: omp.region.after_alloca1:
 // CHECK: }

>From 65371093e5f211933651840b28163a2497215434 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Sun, 4 Jan 2026 00:42:09 -0600
Subject: [PATCH 4/4] review comments, Tom

---
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 1 +
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp        | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 59f66f4cf3793..05d8a7dd168a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2348,6 +2348,7 @@ class OpenMPIRBuilder {
     PostOutlineCBTy PostOutlineCB;
     BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
     SmallVector<Value *, 2> ExcludeArgsFromAggregate;
+    // TODO: this should be safe to enable by default
     bool FixUpNonEntryAllocas = false;
 
     /// Collect all blocks in between EntryBB and ExitBB in both the given
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 19b24060c0f92..f764b644edc69 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1723,8 +1723,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
       targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                              IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                              ThreadID, ToBeDeletedVec);
-
-
     };
     OI.FixUpNonEntryAllocas = true;
   } else {



More information about the Mlir-commits mailing list