[clang] [OpenMP][OMPIRBuilder] Add support to omp target parallel (PR #67000)
Dominik Adamski via cfe-commits
cfe-commits at lists.llvm.org
Mon Oct 23 04:40:34 PDT 2023
https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/67000
>From e801022968ea4a42632fbcf4c5ba03e67a32c7ae Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Mon, 11 Sep 2023 05:31:37 -0400
Subject: [PATCH] [OpenMP][OMPIRBuilder] Add support to omp target parallel
Added support for LLVM IR code generation that handles
omp target parallel code. A call to __kmpc_parallel_51 is generated
and the parallel region is outlined into a separate function.
The proper setup of the kmpc_target_init mode is not included in this commit.
It is assumed that the SPMD mode for target init is properly set by other
codegen functions.
---
clang/test/OpenMP/cancel_codegen.cpp | 20 +-
clang/test/OpenMP/parallel_codegen.cpp | 4 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 314 +++++++++++++-----
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1 +
.../Frontend/OpenMPIRBuilderTest.cpp | 139 +++++++-
5 files changed, 378 insertions(+), 100 deletions(-)
diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp
index 53580e0c2b0293f..03024cf331b2717 100644
--- a/clang/test/OpenMP/cancel_codegen.cpp
+++ b/clang/test/OpenMP/cancel_codegen.cpp
@@ -1026,25 +1026,25 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
-// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
-// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14
-// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14
-// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14
-// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
-// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
-// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias ![[NOALIAS0:[0-9]+]]
+// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias ![[NOALIAS0]]
+// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias ![[NOALIAS0]]
+// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias ![[NOALIAS0]]
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias ![[NOALIAS0]]
+// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias ![[NOALIAS0]]
+// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias ![[NOALIAS0]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12:[0-9]+]])
// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i32 4)
// CHECK3-NEXT: [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK3: .cancel.exit.i:
-// CHECK3-NEXT: store i32 1, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14
+// CHECK3-NEXT: store i32 1, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias ![[NOALIAS1:[0-9]+]]
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT:%.*]]
// CHECK3: .cancel.continue.i:
-// CHECK3-NEXT: store i32 0, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14
+// CHECK3-NEXT: store i32 0, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias ![[NOALIAS1]]
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
// CHECK3: .omp_outlined..exit:
-// CHECK3-NEXT: [[CLEANUP_DEST_I:%.*]] = load i32, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14
+// CHECK3-NEXT: [[CLEANUP_DEST_I:%.*]] = load i32, ptr [[CLEANUP_DEST_SLOT_I]], align 4, !noalias ![[NOALIAS1]]
// CHECK3-NEXT: ret i32 0
//
//
diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp
index 5c98761be0808ef..d545b4a9d9fa887 100644
--- a/clang/test/OpenMP/parallel_codegen.cpp
+++ b/clang/test/OpenMP/parallel_codegen.cpp
@@ -812,7 +812,7 @@ int main (int argc, char **argv) {
//
//
// CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par
-// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] {
+// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]]
// CHECK3-NEXT: omp.par.entry:
// CHECK3-NEXT: [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
// CHECK3-NEXT: [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8
@@ -956,7 +956,7 @@ int main (int argc, char **argv) {
//
//
// CHECK4-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par
-// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG57:![0-9]+]] {
+// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG57:![0-9]+]] {
// CHECK4-NEXT: omp.par.entry:
// CHECK4-NEXT: [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
// CHECK4-NEXT: [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5b24e9fe2e0c5bd..5a305931bf23ea3 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -638,6 +638,13 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
Function *OuterFn = OI.getFunction();
CodeExtractorAnalysisCache CEAC(*OuterFn);
+ // If we generate code for the target device, we need to allocate
+ // struct for aggregate params in the device default alloca address space.
+ // OpenMP runtime requires that the params of the extracted functions are
+ // passed as zero address space pointers. This flag ensures that
+ // CodeExtractor generates correct code for extracted functions
+ // which are used by OpenMP runtime.
+ bool ArgsInZeroAddressSpace = Config.isTargetDevice();
CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
/* AggregateArgs */ true,
/* BlockFrequencyInfo */ nullptr,
@@ -646,7 +653,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
/* AllowVarArgs */ true,
/* AllowAlloca */ true,
/* AllocaBlock*/ OI.OuterAllocaBB,
- /* Suffix */ ".omp_par");
+ /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
@@ -1126,6 +1133,185 @@ void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}
+// Post-outlining callback for the omp parallel clause on the device.
+// It replaces the direct call to OutlinedFn in OuterFn by a call to the
+// OpenMP DeviceRTL runtime function (kmpc_parallel_51); captured variables
+// are handed over through an array of pointers allocated in OuterFn.
+static void
+targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
+ Function *OuterFn, Value *Ident, Value *IfCondition,
+ Value *NumThreads, Instruction *PrivTID,
+ AllocaInst *PrivTIDAddr, Value *ThreadID,
+ const SmallVector<Instruction *, 4> &ToBeDeleted) {
+ // Add known attributes: noalias/noundef on params 0 and 1, plus nounwind.
+ Module &M = OMPIRBuilder->M;
+ IRBuilder<> &Builder = OMPIRBuilder->Builder;
+ OutlinedFn.addParamAttr(0, Attribute::NoAlias);
+ OutlinedFn.addParamAttr(1, Attribute::NoAlias);
+ OutlinedFn.addParamAttr(0, Attribute::NoUndef);
+ OutlinedFn.addParamAttr(1, Attribute::NoUndef);
+ OutlinedFn.addFnAttr(Attribute::NoUnwind);
+
+ assert(OutlinedFn.arg_size() >= 2 &&
+ "Expected at least tid and bounded tid as arguments");
+ unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
+
+ CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+ assert(CI && "Expected call instruction to outlined function");
+ CI->getParent()->setName("omp_parallel");
+ // Insert in front of the direct call to the outlined function; it is
+ // replaced by the call to __kmpc_parallel_51.
+ Builder.SetInsertPoint(CI);
+
+ // Build the argument list for the __kmpc_parallel_51 call.
+ auto PtrTy = Type::getInt8PtrTy(M.getContext());
+ Value *Void = ConstantPointerNull::get(PtrTy);
+ // Allocate the argument array at the first insertion point of OuterFn's
+ // entry block, then return to the saved insertion point.
+ OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
+ Builder.SetInsertPoint(&OuterFn->front(),
+ OuterFn->front().getFirstInsertionPt());
+ AllocaInst *ArgsAlloca =
+ Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
+ Value *Args =
+ Builder.CreatePointerCast(ArgsAlloca, Type::getInt8PtrTy(M.getContext()));
+ Builder.restoreIP(CurrentIP);
+ // Store the captured vars (call operands 2..N) into the argument array.
+ if (NumCapturedVars) {
+ for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
+ Value *V = *(CI->arg_begin() + 2 + Idx);
+ Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
+ ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
+ Builder.CreateStore(V, StoreAddress);
+ }
+ }
+ Value *Cond = IfCondition ? Builder.CreateSExtOrTrunc(
+ IfCondition, Type::getInt32Ty(M.getContext()))
+ : Builder.getInt32(1);
+ Value *Parallel51CallArgs[] = {
+ /* identifier*/ Ident,
+ /* global thread num*/ ThreadID,
+ /* if expression */ Cond, NumThreads ? NumThreads : Builder.getInt32(-1),
+ /* Proc bind */ Builder.getInt32(-1),
+ /* outlined function */
+ Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr), Void,
+ Args, Builder.getInt64(NumCapturedVars)};
+
+ SmallVector<Value *, 16> RealArgs;
+ RealArgs.append(std::begin(Parallel51CallArgs), std::end(Parallel51CallArgs));
+ FunctionCallee RTLFn =
+ OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
+
+ Builder.CreateCall(RTLFn, RealArgs);
+
+ LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
+ << *Builder.GetInsertBlock()->getParent() << "\n");
+
+ // Load the thread id via OutlinedFn's first param and store it to PrivTIDAddr.
+ Builder.SetInsertPoint(PrivTID);
+ Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
+ Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
+ PrivTIDAddr);
+
+ // Drop the original direct call, superseded by the runtime call built above.
+ CI->eraseFromParent();
+
+ for (Instruction *I : ToBeDeleted) {
+ I->eraseFromParent();
+ }
+}
+
+// Post-outlining callback for the omp parallel clause on the host.
+// It replaces the direct call to OutlinedFn by a call to the OpenMP host
+// runtime: __kmpc_fork_call_if when IfCondition is set, __kmpc_fork_call
+// otherwise. Captured variables are forwarded as trailing call arguments.
+static void
+hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
+ Function *OuterFn, Value *Ident, Value *IfCondition,
+ Instruction *PrivTID, AllocaInst *PrivTIDAddr,
+ const SmallVector<Instruction *, 4> &ToBeDeleted) {
+ Module &M = OMPIRBuilder->M;
+ IRBuilder<> &Builder = OMPIRBuilder->Builder;
+ FunctionCallee RTLFn;
+ if (IfCondition) {
+ RTLFn =
+ OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
+ } else {
+ RTLFn =
+ OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
+ }
+ if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
+ if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
+ llvm::LLVMContext &Ctx = F->getContext();
+ MDBuilder MDB(Ctx);
+ // Annotate the callback behavior of the fork call (only once per function):
+ // - The callback callee is argument number 2 (microtask).
+ // - The first two arguments of the callback callee are unknown (-1).
+ // - All variadic arguments to the __kmpc_fork_call are passed to the
+ // callback callee.
+ F->addMetadata(
+ llvm::LLVMContext::MD_callback,
+ *llvm::MDNode::get(
+ Ctx, {MDB.createCallbackEncoding(2, {-1, -1},
+ /* VarArgsArePassed */ true)}));
+ }
+ }
+ // Add known attributes: noalias on params 0 and 1, nounwind on the function.
+ OutlinedFn.addParamAttr(0, Attribute::NoAlias);
+ OutlinedFn.addParamAttr(1, Attribute::NoAlias);
+ OutlinedFn.addFnAttr(Attribute::NoUnwind);
+
+ assert(OutlinedFn.arg_size() >= 2 &&
+ "Expected at least tid and bounded tid as arguments");
+ unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
+
+ CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+ CI->getParent()->setName("omp_parallel");
+ Builder.SetInsertPoint(CI);
+
+ // Build call __kmpc_fork_call[_if](Ident, n, microtask, [cond,] var1, .., varn)
+ Value *ForkCallArgs[] = {
+ Ident, Builder.getInt32(NumCapturedVars),
+ Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};
+
+ SmallVector<Value *, 16> RealArgs;
+ RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
+ if (IfCondition) {
+ Value *Cond = Builder.CreateSExtOrTrunc(IfCondition,
+ Type::getInt32Ty(M.getContext()));
+ RealArgs.push_back(Cond);
+ }
+ RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
+
+ // __kmpc_fork_call_if always expects a void ptr as the last argument.
+ // If there are no captured variables, pass a null pointer.
+ auto PtrTy = Type::getInt8PtrTy(M.getContext());
+ if (IfCondition && NumCapturedVars == 0) {
+ Value *Void = ConstantPointerNull::get(PtrTy);
+ RealArgs.push_back(Void);
+ }
+ if (IfCondition && RealArgs.back()->getType() != PtrTy)
+ RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
+
+ Builder.CreateCall(RTLFn, RealArgs);
+
+ LLVM_DEBUG(dbgs() << "With fork_call placed: "
+ << *Builder.GetInsertBlock()->getParent() << "\n");
+
+ // Load the thread id via OutlinedFn's first param and store it to PrivTIDAddr.
+ Builder.SetInsertPoint(PrivTID);
+ Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
+ Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
+ PrivTIDAddr);
+
+ // Drop the original direct call, superseded by the runtime call built above.
+ CI->eraseFromParent();
+
+ for (Instruction *I : ToBeDeleted) {
+ I->eraseFromParent();
+ }
+}
+
IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
@@ -1140,6 +1326,12 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadID = getOrCreateThreadID(Ident);
+ // If we generate code for the target device, we need to allocate
+ // struct for aggregate params in the device default alloca address space.
+ // OpenMP runtime requires that the params of the extracted functions are
+ // passed as zero address space pointers. This flag ensures that extracted
+ // function arguments are declared in zero address space
+ bool ArgsInZeroAddressSpace = Config.isTargetDevice();
if (NumThreads) {
// Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
@@ -1173,13 +1365,28 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
// Change the location to the outer alloca insertion point to create and
// initialize the allocas we pass into the parallel region.
Builder.restoreIP(OuterAllocaIP);
- AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
- AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr");
+ AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
+ AllocaInst *ZeroAddrAlloca =
+ Builder.CreateAlloca(Int32, nullptr, "zero.addr");
+ Instruction *TIDAddr = TIDAddrAlloca;
+ Instruction *ZeroAddr = ZeroAddrAlloca;
+ if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
+ // Add additional casts to enforce pointers in zero address space
+ TIDAddr = new AddrSpaceCastInst(
+ TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
+ TIDAddr->insertAfter(TIDAddrAlloca);
+ ToBeDeleted.push_back(TIDAddr);
+ ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
+ PointerType ::get(M.getContext(), 0),
+ "zero.addr.ascast");
+ ZeroAddr->insertAfter(ZeroAddrAlloca);
+ ToBeDeleted.push_back(ZeroAddr);
+ }
// We only need TIDAddr and ZeroAddr for modeling purposes to get the
// associated arguments in the outlined function, so we delete them later.
- ToBeDeleted.push_back(TIDAddr);
- ToBeDeleted.push_back(ZeroAddr);
+ ToBeDeleted.push_back(TIDAddrAlloca);
+ ToBeDeleted.push_back(ZeroAddrAlloca);
// Create an artificial insertion point that will also ensure the blocks we
// are about to split are not degenerated.
@@ -1247,87 +1454,24 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
BodyGenCB(InnerAllocaIP, CodeGenIP);
LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
- FunctionCallee RTLFn;
- if (IfCondition)
- RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
- else
- RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
-
- if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
- if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
- llvm::LLVMContext &Ctx = F->getContext();
- MDBuilder MDB(Ctx);
- // Annotate the callback behavior of the __kmpc_fork_call:
- // - The callback callee is argument number 2 (microtask).
- // - The first two arguments of the callback callee are unknown (-1).
- // - All variadic arguments to the __kmpc_fork_call are passed to the
- // callback callee.
- F->addMetadata(
- llvm::LLVMContext::MD_callback,
- *llvm::MDNode::get(
- Ctx, {MDB.createCallbackEncoding(2, {-1, -1},
- /* VarArgsArePassed */ true)}));
- }
- }
OutlineInfo OI;
- OI.PostOutlineCB = [=](Function &OutlinedFn) {
- // Add some known attributes.
- OutlinedFn.addParamAttr(0, Attribute::NoAlias);
- OutlinedFn.addParamAttr(1, Attribute::NoAlias);
- OutlinedFn.addFnAttr(Attribute::NoUnwind);
- OutlinedFn.addFnAttr(Attribute::NoRecurse);
-
- assert(OutlinedFn.arg_size() >= 2 &&
- "Expected at least tid and bounded tid as arguments");
- unsigned NumCapturedVars =
- OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
-
- CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
- CI->getParent()->setName("omp_parallel");
- Builder.SetInsertPoint(CI);
-
- // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
- Value *ForkCallArgs[] = {
- Ident, Builder.getInt32(NumCapturedVars),
- Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)};
-
- SmallVector<Value *, 16> RealArgs;
- RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
- if (IfCondition) {
- Value *Cond = Builder.CreateSExtOrTrunc(IfCondition,
- Type::getInt32Ty(M.getContext()));
- RealArgs.push_back(Cond);
- }
- RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
-
- // __kmpc_fork_call_if always expects a void ptr as the last argument
- // If there are no arguments, pass a null pointer.
- auto PtrTy = Type::getInt8PtrTy(M.getContext());
- if (IfCondition && NumCapturedVars == 0) {
- llvm::Value *Void = ConstantPointerNull::get(PtrTy);
- RealArgs.push_back(Void);
- }
- if (IfCondition && RealArgs.back()->getType() != PtrTy)
- RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
-
- Builder.CreateCall(RTLFn, RealArgs);
-
- LLVM_DEBUG(dbgs() << "With fork_call placed: "
- << *Builder.GetInsertBlock()->getParent() << "\n");
-
- InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());
-
- // Initialize the local TID stack location with the argument value.
- Builder.SetInsertPoint(PrivTID);
- Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
- Builder.CreateStore(Builder.CreateLoad(Int32, OutlinedAI), PrivTIDAddr);
-
- CI->eraseFromParent();
-
- for (Instruction *I : ToBeDeleted)
- I->eraseFromParent();
- };
+ if (Config.isTargetDevice()) {
+ // Generate OpenMP target specific runtime call
+ OI.PostOutlineCB = [=, ToBeDeletedVec =
+ std::move(ToBeDeleted)](Function &OutlinedFn) {
+ targetParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
+ NumThreads, PrivTID, PrivTIDAddr, ThreadID,
+ ToBeDeletedVec);
+ };
+ } else {
+ // Generate OpenMP host runtime call
+ OI.PostOutlineCB = [=, ToBeDeletedVec =
+ std::move(ToBeDeleted)](Function &OutlinedFn) {
+ hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
+ PrivTID, PrivTIDAddr, ToBeDeletedVec);
+ };
+ }
// Adjust the finalization stack, verify the adjustment, and call the
// finalize function a last time to finalize values between the pre-fini
@@ -1367,7 +1511,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
/* AllowVarArgs */ true,
/* AllowAlloca */ true,
/* AllocationBlock */ OuterAllocaBlock,
- /* Suffix */ ".omp_par");
+ /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
// Find inputs to, outputs from the code region.
BasicBlock *CommonExit = nullptr;
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index e50e74ea6c0d5aa..d6763fe79c24da2 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -274,6 +274,7 @@ struct OMPInformationCache : public InformationCache {
: InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
OpenMPPostLink(OpenMPPostLink) {
+ OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M);
OMPBuilder.initialize();
initializeRuntimeFunctions(M);
initializeInternalControlVars();
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 97cfc339675f657..b0793a034c07614 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -591,9 +591,124 @@ TEST_F(OpenMPIRBuilderTest, DbgLoc) {
EXPECT_EQ(SrcSrc->getAsCString(), ";/src/test.dbg;foo;3;7;;");
}
+TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ std::string oldDLStr = M->getDataLayoutStr();
+ M->setDataLayout(
+ "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:"
+ "256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:"
+ "256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8");
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = true;
+ OMPBuilder.initialize();
+ F->setName("func");
+ IRBuilder<> Builder(BB);
+ BasicBlock *EnterBB = BasicBlock::Create(Ctx, "parallel.enter", F);
+ Builder.CreateBr(EnterBB);
+ Builder.SetInsertPoint(EnterBB);
+ OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+ Loc = OMPBuilder.createTargetInit(Loc, true);
+
+ AllocaInst *PrivAI = nullptr;
+
+ unsigned NumBodiesGenerated = 0;
+ unsigned NumPrivatizedVars = 0;
+ unsigned NumFinalizationPoints = 0;
+
+ auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+ ++NumBodiesGenerated;
+
+ Builder.restoreIP(AllocaIP);
+ PrivAI = Builder.CreateAlloca(F->arg_begin()->getType());
+ Builder.CreateStore(F->arg_begin(), PrivAI);
+
+ Builder.restoreIP(CodeGenIP);
+ Value *PrivLoad =
+ Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
+ Value *Cmp = Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
+ &ThenTerm, &ElseTerm);
+ };
+
+ auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+ Value &Orig, Value &Inner,
+ Value *&ReplacementValue) -> InsertPointTy {
+ ++NumPrivatizedVars;
+
+ if (!isa<AllocaInst>(Orig)) {
+ EXPECT_EQ(&Orig, F->arg_begin());
+ ReplacementValue = &Inner;
+ return CodeGenIP;
+ }
+
+ // Since the original value is an allocation, it has a pointer type and
+ // therefore no additional wrapping should happen.
+ EXPECT_EQ(&Orig, &Inner);
+
+ // Trivial copy (=firstprivate).
+ Builder.restoreIP(AllocaIP);
+ Type *VTy = ReplacementValue->getType();
+ Value *V = Builder.CreateLoad(VTy, &Inner, Orig.getName() + ".reload");
+ ReplacementValue = Builder.CreateAlloca(VTy, 0, Orig.getName() + ".copy");
+ Builder.restoreIP(CodeGenIP);
+ Builder.CreateStore(V, ReplacementValue);
+ return CodeGenIP;
+ };
+
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { ++NumFinalizationPoints; };
+
+ IRBuilder<>::InsertPoint AllocaIP(&F->getEntryBlock(),
+ F->getEntryBlock().getFirstInsertionPt());
+ IRBuilder<>::InsertPoint AfterIP =
+ OMPBuilder.createParallel(Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB,
+ nullptr, nullptr, OMP_PROC_BIND_default, false);
+
+ EXPECT_EQ(NumBodiesGenerated, 1U);
+ EXPECT_EQ(NumPrivatizedVars, 1U);
+ EXPECT_EQ(NumFinalizationPoints, 1U);
+
+ Builder.restoreIP(AfterIP);
+ OMPBuilder.createTargetDeinit(Builder);
+ Builder.CreateRetVoid();
+
+ OMPBuilder.finalize();
+ Function *OutlinedFn = PrivAI->getFunction();
+ EXPECT_FALSE(verifyModule(*M, &errs()));
+ EXPECT_NE(OutlinedFn, F);
+ EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoUnwind));
+ EXPECT_TRUE(OutlinedFn->hasParamAttribute(0, Attribute::NoAlias));
+ EXPECT_TRUE(OutlinedFn->hasParamAttribute(1, Attribute::NoAlias));
+
+ EXPECT_TRUE(OutlinedFn->hasInternalLinkage());
+ EXPECT_EQ(OutlinedFn->arg_size(), 3U);
+ // Make sure that all three arguments are pointers in address space 0.
+ EXPECT_EQ(OutlinedFn->getArg(0)->getType(),
+ PointerType::get(M->getContext(), 0));
+ EXPECT_EQ(OutlinedFn->getArg(1)->getType(),
+ PointerType::get(M->getContext(), 0));
+ EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
+ PointerType::get(M->getContext(), 0));
+ EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent());
+ EXPECT_EQ(OutlinedFn->getNumUses(), 1U);
+ User *Usr = OutlinedFn->user_back();
+ ASSERT_TRUE(isa<CallInst>(Usr));
+ CallInst *Parallel51CI = dyn_cast<CallInst>(Usr);
+ ASSERT_NE(Parallel51CI, nullptr);
+
+ EXPECT_EQ(Parallel51CI->getCalledFunction()->getName(), "__kmpc_parallel_51");
+ EXPECT_EQ(Parallel51CI->arg_size(), 9U);
+ EXPECT_EQ(Parallel51CI->getArgOperand(5), OutlinedFn);
+ EXPECT_TRUE(
+ isa<GlobalVariable>(Parallel51CI->getArgOperand(0)->stripPointerCasts()));
+ EXPECT_EQ(Parallel51CI, Usr);
+ M->setDataLayout(oldDLStr);
+}
+
TEST_F(OpenMPIRBuilderTest, ParallelSimple) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -671,7 +786,6 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) {
EXPECT_NE(F, OutlinedFn);
EXPECT_FALSE(verifyModule(*M, &errs()));
EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoUnwind));
- EXPECT_TRUE(OutlinedFn->hasFnAttribute(Attribute::NoRecurse));
EXPECT_TRUE(OutlinedFn->hasParamAttribute(0, Attribute::NoAlias));
EXPECT_TRUE(OutlinedFn->hasParamAttribute(1, Attribute::NoAlias));
@@ -699,6 +813,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) {
TEST_F(OpenMPIRBuilderTest, ParallelNested) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -768,7 +883,6 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) {
continue;
EXPECT_FALSE(verifyModule(*M, &errs()));
EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoUnwind));
- EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoRecurse));
EXPECT_TRUE(OutlinedFn.hasParamAttribute(0, Attribute::NoAlias));
EXPECT_TRUE(OutlinedFn.hasParamAttribute(1, Attribute::NoAlias));
@@ -793,6 +907,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) {
TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -872,7 +987,6 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) {
continue;
EXPECT_FALSE(verifyModule(*M, &errs()));
EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoUnwind));
- EXPECT_TRUE(OutlinedFn.hasFnAttribute(Attribute::NoRecurse));
EXPECT_TRUE(OutlinedFn.hasParamAttribute(0, Attribute::NoAlias));
EXPECT_TRUE(OutlinedFn.hasParamAttribute(1, Attribute::NoAlias));
@@ -902,6 +1016,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) {
TEST_F(OpenMPIRBuilderTest, ParallelIfCond) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -1006,6 +1121,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelIfCond) {
TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -1119,6 +1235,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) {
TEST_F(OpenMPIRBuilderTest, ParallelForwardAsPointers) {
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -4004,6 +4121,7 @@ TEST_F(OpenMPIRBuilderTest, OMPAtomicCompareCapture) {
TEST_F(OpenMPIRBuilderTest, CreateTeams) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -4079,6 +4197,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeams) {
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> &Builder = OMPBuilder.Builder;
@@ -4129,6 +4248,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) {
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> &Builder = OMPBuilder.Builder;
@@ -4180,6 +4300,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) {
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> &Builder = OMPBuilder.Builder;
@@ -4234,6 +4355,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) {
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> &Builder = OMPBuilder.Builder;
@@ -4293,6 +4415,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) {
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfCondition) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> &Builder = OMPBuilder.Builder;
@@ -4351,6 +4474,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfCondition) {
TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfConditionAndNumTeams) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> &Builder = OMPBuilder.Builder;
@@ -4548,6 +4672,7 @@ xorAtomicReduction(OpenMPIRBuilder::InsertPointTy IP, Type *Ty, Value *LHS,
TEST_F(OpenMPIRBuilderTest, CreateReductions) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -4780,6 +4905,7 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) {
TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -5796,6 +5922,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
TEST_F(OpenMPIRBuilderTest, CreateTask) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -5924,6 +6051,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTask) {
TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -5954,6 +6082,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) {
TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -5983,6 +6112,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) {
TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -6056,6 +6186,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) {
TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -6109,6 +6240,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) {
TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
@@ -6269,6 +6401,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) {
TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
OMPBuilder.initialize();
F->setName("func");
IRBuilder<> Builder(BB);
More information about the cfe-commits
mailing list