[llvm] 4045970 - [OMPIRBuilder] - Make offloading input data persist for deferred target tasks (#133499)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 16 08:27:52 PDT 2025
Author: Pranav Bhandarkar
Date: 2025-06-16T10:27:48-05:00
New Revision: 404597061f974470e8bd1198e44d024fac8319a1
URL: https://github.com/llvm/llvm-project/commit/404597061f974470e8bd1198e44d024fac8319a1
DIFF: https://github.com/llvm/llvm-project/commit/404597061f974470e8bd1198e44d024fac8319a1.diff
LOG: [OMPIRBuilder] - Make offloading input data persist for deferred target tasks (#133499)
When we offload to the target, the pointers to data used by the kernel
are passed in arrays created by `OMPIRBuilder`. These arrays of pointers
are allocated on the stack on the host. This is fine for the most part
because, absent the `nowait` clause, the default behavior is that target
tasks are included tasks. That is, the host is blocked until the
offloaded target kernel is done. In turn, this means that the host's
stack frame is intact and accessing the array of pointers when
offloading is safe. However, when `nowait` is used on the `!$omp target`
directive, the target task is a deferred task, meaning that the
generating task on the host does not have to wait for the target task
to finish. In such cases, it is very likely that the stack frame of the
function invoking the target call has already been unwound, thereby
leading to memory access errors as shown below.
```
AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested allocation is not valid.
AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested allocation is not valid. "PluginInterface" error: Failure to allocate device memory: Failed to allocate from memory manager
fort.cod.out: /llvm/llvm-project/offload/plugins-nextgen/common/src/PluginInterface.cpp:1434: Error llvm::omp::target::plugin::PinnedAllocationMapTy::lockMappedHostBuffer(void *, size_t): Assertion `HstPtr && "Invalid pointer"' failed.
Aborted (core dumped)
```
This PR implements support in `OMPIRBuilder` to store these arrays of
pointers in the task structure that is passed to the target task, thereby
ensuring they are available to the target task when the target task is
eventually scheduled.
---------
Co-authored-by: Sergio Afonso <safonsof at amd.com>
Added:
mlir/test/Target/LLVMIR/omptarget-nowait.mlir
Modified:
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
mlir/test/Target/LLVMIR/omptarget-depend.mlir
mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
Removed:
################################################################################
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e4b1241151e9d..93fb0d8e8d078 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2507,7 +2507,7 @@ class OpenMPIRBuilder {
TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
OpenMPIRBuilder::InsertPointTy AllocaIP,
const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
- bool HasNoWait);
+ const TargetDataRTArgs &RTArgs, bool HasNoWait);
/// Emit the arguments to be passed to the runtime library based on the
/// arrays of base pointers, pointers, sizes, map types, and mappers. If
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ca3d8438654dc..c1f02b2b240de 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6703,7 +6703,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
/*TargetTaskAllocaIP=*/{}));
else
cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
- /*Dependencies=*/{}, Info.HasNoWait));
+ /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
} else {
Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
omp::OMPRTL___tgt_target_data_begin_mapper);
@@ -7150,15 +7150,55 @@ static Expected<Function *> createOutlinedFunction(
ValueReplacementMap);
return Func;
}
+/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
+/// of pointers containing shared data between the parent task and the created
+/// task.
+static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
+ IRBuilderBase &Builder,
+ Value *TaskWithPrivates,
+ Type *TaskWithPrivatesTy) {
+ Type *TaskTy = OMPIRBuilder.Task;
+ LLVMContext &Ctx = Builder.getContext();
+ Value *TaskT =
+ Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
+ Value *Shareds = TaskT;
+ // TaskWithPrivatesTy can be one of the following
+ // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
+ // %struct.privates }
+ // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
+ //
+ // In the former case, that is when TaskWithPrivatesTy != TaskTy,
+ // its first member has to be the task descriptor. TaskTy is the type of the
+ // task descriptor. TaskT is the pointer to the task descriptor. Loading the
+ // first member of TaskT, gives us the pointer to shared data.
+ if (TaskWithPrivatesTy != TaskTy)
+ Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
+ return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+}
/// Create an entry point for a target task with the following.
/// It'll have the following signature
/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
/// This function is called from emitTargetTask once the
/// code to launch the target kernel has been outlined already.
-static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
- IRBuilderBase &Builder,
- CallInst *StaleCI) {
+/// NumOffloadingArrays is the number of offloading arrays that we need to copy
+/// into the task structure so that the deferred target task can access this
+/// data even after the stack frame of the generating task has been rolled
+/// back. Offloading arrays contain base pointers, pointers, sizes etc
+/// of the data that the target kernel will access. These in effect are the
+/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
+static Function *emitTargetTaskProxyFunction(
+ OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
+ StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
+ const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
+
+ // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
+ // This is because PrivatesTy is the type of the structure in which
+ // we pass the offloading arrays to the deferred target task.
+ assert((!NumOffloadingArrays || PrivatesTy) &&
+ "PrivatesTy cannot be nullptr when there are offloadingArrays"
+ "to privatize");
+
Module &M = OMPBuilder.M;
// KernelLaunchFunction is the target launch function, i.e.
// the function that sets up kernel arguments and calls
@@ -7185,34 +7225,48 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
// call void @_QQmain..omp_par.1(i32 %global.tid.val6)
OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
StaleCI->getIterator());
+
LLVMContext &Ctx = StaleCI->getParent()->getContext();
+
Type *ThreadIDTy = Type::getInt32Ty(Ctx);
Type *TaskPtrTy = OMPBuilder.TaskPtr;
Type *TaskTy = OMPBuilder.Task;
+
auto ProxyFnTy =
FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
/* isVarArg */ false);
auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
".omp_target_task_proxy_func",
Builder.GetInsertBlock()->getModule());
- ProxyFn->getArg(0)->setName("thread.id");
- ProxyFn->getArg(1)->setName("task");
+ Value *ThreadId = ProxyFn->getArg(0);
+ Value *TaskWithPrivates = ProxyFn->getArg(1);
+ ThreadId->setName("thread.id");
+ TaskWithPrivates->setName("task");
+ bool HasShareds = SharedArgsOperandNo > 0;
+ bool HasOffloadingArrays = NumOffloadingArrays > 0;
BasicBlock *EntryBB =
BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
Builder.SetInsertPoint(EntryBB);
- bool HasShareds = StaleCI->arg_size() > 1;
- // TODO: This is a temporary assert to prove to ourselves that
- // the outlined target launch function is always going to have
- // atmost two arguments if there is any data shared between
- // host and device.
- assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
- "StaleCI with shareds should have exactly two arguments.");
+ SmallVector<Value *> KernelLaunchArgs;
+ KernelLaunchArgs.reserve(StaleCI->arg_size());
+ KernelLaunchArgs.push_back(ThreadId);
+
+ if (HasOffloadingArrays) {
+ assert(TaskTy != TaskWithPrivatesTy &&
+ "If there are offloading arrays to pass to the target"
+ "TaskTy cannot be the same as TaskWithPrivatesTy");
+ Value *Privates =
+ Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
+ for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
+ KernelLaunchArgs.push_back(
+ Builder.CreateStructGEP(PrivatesTy, Privates, i));
+ }
- Value *ThreadId = ProxyFn->getArg(0);
if (HasShareds) {
- auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ auto *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
assert(ArgStructAlloca &&
"Unable to find the alloca instruction corresponding to arguments "
"for extracted function");
@@ -7220,27 +7274,67 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
AllocaInst *NewArgStructAlloca =
Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
- Value *TaskT = ProxyFn->getArg(1);
+
Value *SharedsSize =
Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
- Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
- LoadInst *LoadShared =
- Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+ LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
+ OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
Builder.CreateMemCpy(
NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
-
- Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
- } else {
- Builder.CreateCall(KernelLaunchFunction, {ThreadId});
+ KernelLaunchArgs.push_back(NewArgStructAlloca);
}
-
+ Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
Builder.CreateRetVoid();
return ProxyFn;
}
+static Type *getOffloadingArrayType(Value *V) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
+ return GEP->getSourceElementType();
+ if (auto *Alloca = dyn_cast<AllocaInst>(V))
+ return Alloca->getAllocatedType();
+
+ llvm_unreachable("Unhandled Instruction type");
+ return nullptr;
+}
+// This function returns a struct that has at most two members.
+// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
+// descriptor. The second member, if needed, is a struct containing arrays
+// that need to be passed to the offloaded target kernel. For example,
+// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
+// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
+// respectively, then the types created by this function are
+//
+// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
+// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
+// %struct.privates }
+// %struct.task_with_privates is returned by this function.
+// If there aren't any offloading arrays to pass to the target kernel,
+// %struct.kmp_task_ompbuilder_t is returned.
+static StructType *
+createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
+ ArrayRef<Value *> OffloadingArraysToPrivatize) {
+
+ if (OffloadingArraysToPrivatize.empty())
+ return OMPIRBuilder.Task;
+
+ SmallVector<Type *, 4> StructFieldTypes;
+ for (Value *V : OffloadingArraysToPrivatize) {
+ assert(V->getType()->isPointerTy() &&
+ "Expected pointer to array to privatize. Got a non-pointer value "
+ "instead");
+ Type *ArrayTy = getOffloadingArrayType(V);
+ assert(ArrayTy && "ArrayType cannot be nullptr");
+ StructFieldTypes.push_back(ArrayTy);
+ }
+ StructType *PrivatesStructTy =
+ StructType::create(StructFieldTypes, "struct.privates");
+ return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
+ "struct.task_with_privates");
+}
static Error emitTargetOutlinedFunction(
OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
TargetRegionEntryInfo &EntryInfo,
@@ -7266,7 +7360,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
OpenMPIRBuilder::InsertPointTy AllocaIP,
const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
- bool HasNoWait) {
+ const TargetDataRTArgs &RTArgs, bool HasNoWait) {
// The following explains the code-gen scenario for the `target` directive. A
// similar scneario is followed for other device-related directives (e.g.
@@ -7276,27 +7370,30 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
// When we arrive at this function, the target region itself has been
// outlined into the function OutlinedFn.
// So at ths point, for
- // --------------------------------------------------
+ // --------------------------------------------------------------
// void user_code_that_offloads(...) {
- // omp target depend(..) map(from:a) map(to:b, c)
- // a = b + c
+ // omp target depend(..) map(from:a) map(to:b) private(i)
+ // do i = 1, 10
+ // a(i) = b(i) + n
// }
//
- // --------------------------------------------------
+ // --------------------------------------------------------------
//
// we have
//
- // --------------------------------------------------
+ // --------------------------------------------------------------
//
// void user_code_that_offloads(...) {
- // %.offload_baseptrs = alloca [3 x ptr], align 8
- // %.offload_ptrs = alloca [3 x ptr], align 8
- // %.offload_mappers = alloca [3 x ptr], align 8
+ // %.offload_baseptrs = alloca [2 x ptr], align 8
+ // %.offload_ptrs = alloca [2 x ptr], align 8
+ // %.offload_mappers = alloca [2 x ptr], align 8
// ;; target region has been outlined and now we need to
// ;; offload to it via a target task.
// }
- // void outlined_device_function(ptr a, ptr b, ptr c) {
- // *a = *b + *c
+ // void outlined_device_function(ptr a, ptr b, ptr n) {
+ // n = *n_ptr;
+ // do i = 1, 10
+ // a(i) = b(i) + n
// }
//
// We have to now do the following
@@ -7309,33 +7406,59 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
// (iii) Create a task with the task entry point created in (ii)
//
// That is we create the following
- //
+ // struct task_with_privates {
+ // struct kmp_task_ompbuilder_t task_struct;
+ // struct privates {
+ // [2 x ptr] ; baseptrs
+ // [2 x ptr] ; ptrs
+ // [2 x i64] ; sizes
+ // }
+ // }
// void user_code_that_offloads(...) {
- // %.offload_baseptrs = alloca [3 x ptr], align 8
- // %.offload_ptrs = alloca [3 x ptr], align 8
- // %.offload_mappers = alloca [3 x ptr], align 8
+ // %.offload_baseptrs = alloca [2 x ptr], align 8
+ // %.offload_ptrs = alloca [2 x ptr], align 8
+ // %.offload_sizes = alloca [2 x i64], align 8
//
// %structArg = alloca { ptr, ptr, ptr }, align 8
- // %strucArg[0] = %.offload_baseptrs
- // %strucArg[1] = %.offload_ptrs
- // %strucArg[2] = %.offload_mappers
- // proxy_target_task = @__kmpc_omp_task_alloc(...,
- // @.omp_target_task_proxy_func)
- // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+ // %strucArg[0] = a
+ // %strucArg[1] = b
+ // %strucArg[2] = &n
+ //
+ // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
+ // sizeof(kmp_task_ompbuilder_t),
+ // sizeof(structArg),
+ // @.omp_target_task_proxy_func,
+ // ...)
+ // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
+ // sizeof(structArg))
+ // memcpy(target_task_with_privates->privates->baseptrs,
+ // offload_baseptrs, sizeof(offload_baseptrs)
+ // memcpy(target_task_with_privates->privates->ptrs,
+ // offload_ptrs, sizeof(offload_ptrs)
+ // memcpy(target_task_with_privates->privates->sizes,
+ // offload_sizes, sizeof(offload_sizes)
// dependencies_array = ...
// ;; if nowait not present
// call @__kmpc_omp_wait_deps(..., dependencies_array)
// call @__kmpc_omp_task_begin_if0(...)
// call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
- // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
+ // %target_task_with_privates)
+ // call @__kmpc_omp_task_complete_if0(...)
// }
//
// define internal void @.omp_target_task_proxy_func(i32 %thread.id,
// ptr %task) {
// %structArg = alloca {ptr, ptr, ptr}
- // %shared_data = load (getelementptr %task, 0, 0)
- // mempcy(%structArg, %shared_data, sizeof(structArg))
- // kernel_launch_function(%thread.id, %structArg)
+ // %task_ptr = getelementptr(%task, 0, 0)
+ // %shared_data = load (getelementptr %task_ptr, 0, 0)
+ // mempcy(%structArg, %shared_data, sizeof(%structArg))
+ //
+ // %offloading_arrays = getelementptr(%task, 0, 1)
+ // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
+ // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
+ // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
+ // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
+ // %offload_sizes, %structArg)
// }
//
// We need the proxy function because the signature of the task entry point
@@ -7343,21 +7466,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
// that of the kernel_launch function.
//
// kernel_launch_function is generated by emitKernelLaunch and has the
- // always_inline attribute.
- // void kernel_launch_function(thread_id,
- // structArg) alwaysinline {
+ // always_inline attribute. For this example, it'll look like so:
+ // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
+ // %offload_sizes, %structArg) alwaysinline {
// %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
- // offload_baseptrs = load(getelementptr structArg, 0, 0)
- // offload_ptrs = load(getelementptr structArg, 0, 1)
- // offload_mappers = load(getelementptr structArg, 0, 2)
+ // ; load aggregated data from %structArg
// ; setup kernel_args using offload_baseptrs, offload_ptrs and
- // ; offload_mappers
+ // ; offload_sizes
// call i32 @__tgt_target_kernel(...,
// outlined_device_function,
// ptr %kernel_args)
// }
- // void outlined_device_function(ptr a, ptr b, ptr c) {
- // *a = *b + *c
+ // void outlined_device_function(ptr a, ptr b, ptr n) {
+ // n = *n_ptr;
+ // do i = 1, 10
+ // a(i) = b(i) + n
// }
//
BasicBlock *TargetTaskBodyBB =
@@ -7378,6 +7501,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
+ // Generate the task body which will subsequently be outlined.
Builder.restoreIP(TargetTaskBodyIP);
if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
return Err;
@@ -7396,15 +7520,57 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
/*IsFinished=*/true);
- OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
- DeviceID](Function &OutlinedFn) mutable {
+ SmallVector<Value *, 2> OffloadingArraysToPrivatize;
+ bool NeedsTargetTask = HasNoWait && DeviceID;
+ if (NeedsTargetTask) {
+ for (auto *V :
+ {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
+ RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
+ RTArgs.SizesArray}) {
+ if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
+ OffloadingArraysToPrivatize.push_back(V);
+ OI.ExcludeArgsFromAggregate.push_back(V);
+ }
+ }
+ }
+ OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
+ DeviceID, OffloadingArraysToPrivatize](
+ Function &OutlinedFn) mutable {
assert(OutlinedFn.hasOneUse() &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- bool HasShareds = StaleCI->arg_size() > 1;
- Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
+ // The first argument of StaleCI is always the thread id.
+ // The next few arguments are the pointers to offloading arrays
+ // if any. (see OffloadingArraysToPrivatize)
+ // Finally, all other local values that are live-in into the outlined region
+ // end up in a structure whose pointer is passed as the last argument. This
+ // piece of data is passed in the "shared" field of the task structure. So,
+ // we know we have to pass shareds to the task if the number of arguments is
+ // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
+ // thread id. Further, for safety, we assert that the number of arguments of
+ // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
+ const unsigned int NumStaleCIArgs = StaleCI->arg_size();
+ bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
+ assert(
+ !HasShareds ||
+ NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2) &&
+ "Wrong number of arguments for StaleCI when shareds are present");
+ int SharedArgOperandNo =
+ HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
+
+ StructType *TaskWithPrivatesTy =
+ createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
+ StructType *PrivatesTy = nullptr;
+
+ if (!OffloadingArraysToPrivatize.empty())
+ PrivatesTy =
+ static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
+
+ Function *ProxyFn = emitTargetTaskProxyFunction(
+ *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
+ OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
<< "\n");
@@ -7422,7 +7588,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
// If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
// the DeviceID to the deferred task and also since
// @__kmpc_omp_target_task_alloc creates an untied/async task.
- bool NeedsTargetTask = HasNoWait && DeviceID;
Function *TaskAllocFn =
!NeedsTargetTask
? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
@@ -7435,17 +7600,19 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
// Argument - `sizeof_kmp_task_t` (TaskSize)
// Tasksize refers to the size in bytes of kmp_task_t data structure
- // including private vars accessed in task.
- // TODO: add kmp_task_t_with_privates (privates)
- Value *TaskSize =
- Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
+ // plus any other data to be passed to the target task, if any, which
+ // is packed into a struct. kmp_task_t and the struct so created are
+ // packed into a wrapper struct whose type is TaskWithPrivatesTy.
+ Value *TaskSize = Builder.getInt64(
+ M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
// Argument - `sizeof_shareds` (SharedsSize)
// SharedsSize refers to the shareds array size in the kmp_task_t data
// structure.
Value *SharedsSize = Builder.getInt64(0);
if (HasShareds) {
- auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ auto *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
assert(ArgStructAlloca &&
"Unable to find the alloca instruction corresponding to arguments "
"for extracted function");
@@ -7483,13 +7650,32 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
+ Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
if (HasShareds) {
- Value *Shareds = StaleCI->getArgOperand(1);
- Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
- Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+ Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
+ Value *TaskShareds = loadSharedDataFromTaskDescriptor(
+ *this, Builder, TaskData, TaskWithPrivatesTy);
Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
SharedsSize);
}
+ if (!OffloadingArraysToPrivatize.empty()) {
+ Value *Privates =
+ Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
+ for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
+ Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
+ Type *ArrayType = getOffloadingArrayType(PtrToPrivatize);
+ assert(ArrayType && "ArrayType cannot be nullptr");
+
+ Type *ElementType = PrivatesTy->getElementType(i);
+ assert(ElementType == ArrayType &&
+ "ElementType should match ArrayType");
+
+ Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
+ Builder.CreateMemCpy(
+ Dst, Alignment, PtrToPrivatize, Alignment,
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
+ }
+ }
Value *DepArray = emitTaskDependencies(*this, Dependencies);
@@ -7635,9 +7821,10 @@ static void emitTargetCall(
// Arguments that are intended to be directly forwarded to an
// emitKernelLaunch call are pased as nullptr, since
// OutlinedFnID=nullptr results in that call not being done.
+ OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
/*RTLoc=*/nullptr, AllocaIP,
- Dependencies, HasNoWait);
+ Dependencies, EmptyRTArgs, HasNoWait);
}
return EmitTargetCallFallbackCB(Builder.saveIP());
}());
@@ -7649,6 +7836,7 @@ static void emitTargetCall(
auto &&EmitTargetCallThen =
[&](OpenMPIRBuilder::InsertPointTy AllocaIP,
OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
+ Info.HasNoWait = HasNoWait;
OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
OpenMPIRBuilder::TargetDataRTArgs RTArgs;
if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
@@ -7726,7 +7914,8 @@ static void emitTargetCall(
// explicit generation of the target task.
if (RequiresOuterTargetTask)
return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
- Dependencies, HasNoWait);
+ Dependencies, KArgs.RTArgs,
+ Info.HasNoWait);
return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
EmitTargetCallFallbackCB, KArgs,
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
index f2948c6510138..0f2437639319a 100644
--- a/mlir/test/Target/LLVMIR/omptarget-depend.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
@@ -126,7 +126,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
// CHECK-DAG: %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
// CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
-// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
+// CHECK: %[[SHARED_PTR:.+]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASKDATA]], i32 0, i32 0
+// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[SHARED_PTR]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
// CHECK: %[[DEP_INFO:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
index b487b31d54477..5eee7b7d7d976 100644
--- a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
@@ -13,19 +13,48 @@ module attributes {omp.target_triples = ["dummy-target-triple"]} {
}
llvm.return
}
+}
+// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] }
+// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr }
+// CHECK: %struct.[[PRVTS]] = type { [1 x ptr], [1 x ptr] }
// CHECK: define void @_QPfoo() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[BASEPTRS:.*]] = alloca [1 x ptr], align 8
+// CHECK: %[[PTRS:.*]] = alloca [1 x ptr], align 8
+// CHECK: %[[MAPPERS:.*]] = alloca [1 x ptr], align 8
+
+// CHECK: getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0
+// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0
-// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
-// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
-// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
+// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
+// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
+// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 1
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 8, i1 false)
+// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 8, i1 false)
+// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
// CHECK: }
+// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) {
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK: call void @_QPfoo..omp_par(i32 %{{.*}}, ptr %{{.*}})
-// CHECK: }
-}
+// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) {
+// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1
+// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 8, i1 false)
+// CHECK: call void @[[WORKER]](i32 %{{.*}}, ptr %{{.*}})
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir
new file mode 100644
index 0000000000000..19333c44322f1
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir
@@ -0,0 +1,70 @@
+// RUN: mlir-translate -mlir-to-llvmir %s 2>&1 | FileCheck %s
+
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+ llvm.func @launch_(%arg0: !llvm.ptr {fir.bindc_name = "a", llvm.nocapture}) {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x f64 {bindc_name = "n"} : (i64) -> !llvm.ptr
+ %2 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%2 : !llvm.ptr) -> !llvm.ptr {name = ""}
+ %4 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(to) capture(ByRef) members(%3 : [0] : !llvm.ptr) -> !llvm.ptr {name = "a"}
+ %5 = omp.map.info var_ptr(%1 : !llvm.ptr, f64) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
+ omp.target nowait map_entries(%4 -> %arg1, %5 -> %arg2, %3 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ %two_f = llvm.mlir.constant(2.000000e+00 : f64) : f64
+ %one_i = llvm.mlir.constant(1 : index) : i64
+ %6 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr
+ %8 = llvm.getelementptr %7[%one_i] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+ %9 = llvm.load %8 : !llvm.ptr -> f64
+ %10 = llvm.fmul %9, %two_f {fastmathFlags = #llvm.fastmath<contract>} : f64
+ llvm.store %10, %8 : f64, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] }
+// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr }
+// CHECK: %struct.[[PRVTS]] = type { [5 x ptr], [5 x ptr], [5 x i64] }
+
+// CHECK: define void @launch_(ptr captures(none) %0)
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[BASEPTRS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[PTRS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[MAPPERS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[SIZES:.*]] = alloca [5 x i64], align 4
+
+
+// CHECK: %[[VAL_20:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[PTRS]], i32 0, i32 0
+// CHECK: %[[SIZES_GEP:.*]] = getelementptr inbounds [5 x i64], ptr %[[SIZES]], i32 0, i32 0
+
+// CHECK: %[[GL_THRD_NUM:.*]] = call i32 @__kmpc_global_thread_num
+// CHECK: %[[TASK_DESC:.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @4, i32 {{.*}}, i32 0, i64 160, i64 16, ptr [[TGT_TSK_PRXY_FNC:.*]], i64 -1)
+// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 16, i1 false)
+// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 1
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_54:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 2
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_54]], ptr align 1 %[[SIZES_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_omp_task(ptr @4, i32 %[[GL_THRD_NUM]], ptr %[[TASK_DESC]])
+
+// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) {
+
+// CHECK: define internal void [[TGT_TSK_PRXY_FNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) {
+// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1
+// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1
+// CHECK: %[[SIZES:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 2
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 16, i1 false)
+// CHECK: call void @[[WORKER]](i32 %[[THREAD_ID_PARAM]], ptr %[[BASEPTRS]], ptr %[[PTRS]], ptr %[[SIZES]], ptr %[[STRUCTARG]])
diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
index 8124d02ef2174..dba8c553aaca5 100644
--- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
@@ -14,25 +14,20 @@ llvm.func @_QPopenmp_target_data_enter() {
// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME: @[[TASK_PROXY_FUNC_ENTER:.*]], i64 {{.*}})
// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
// CHECK: }
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_ENTER:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
// CHECK: call void @__tgt_target_data_begin_nowait_mapper(
// CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
// CHECK: }
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK: call void @[[TASK_BODY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}, ptr %{{.*}})
// CHECK: }
// -----
@@ -51,25 +46,20 @@ llvm.func @_QPopenmp_target_data_update() {
// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME: @[[TASK_PROXY_FUNC_UPDATE:.*]], i64 {{.*}})
// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
// CHECK: }
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_UPDATE:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
// CHECK: call void @__tgt_target_data_update_nowait_mapper(
// CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
// CHECK: }
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK: call void @[[TASK_BODY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}})
// CHECK: }
// -----
@@ -88,23 +78,18 @@ llvm.func @_QPopenmp_target_data_exit() {
// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME: @[[TASK_PROXY_FUNC_EXIT:.*]], i64 {{.*}})
// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
// CHECK: }
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_EXIT:.*]](i32 %{{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
// CHECK: call void @__tgt_target_data_end_nowait_mapper(
// CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
// CHECK: }
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK: call void @[[TASK_BODY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}})
// CHECK: }
More information about the llvm-commits mailing list