[clang] [llvm] [mlir] [OMPIRBuilder] - Handle dependencies in `createTarget` (PR #93977)
Michael Kruse via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 7 11:05:41 PDT 2024
================
@@ -5229,13 +5362,288 @@ static void emitTargetOutlinedFunction(
OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
OutlinedFn, OutlinedFnID);
}
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
+ Function *OutlinedFn, Value *OutlinedFnID,
+ EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+ Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+ SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
+ bool HasNoWait) {
+
+ // When we arrive at this function, the target region itself has been
+ // outlined into the function OutlinedFn.
+ // So at this point, for
+ // --------------------------------------------------
+ // void user_code_that_offloads(...) {
+ // omp target depend(..) map(from:a) map(to:b, c)
+ // a = b + c
+ // }
+ //
+ // --------------------------------------------------
+ //
+ // we have
+ //
+ // --------------------------------------------------
+ //
+ // void user_code_that_offloads(...) {
+ // %.offload_baseptrs = alloca [3 x ptr], align 8
+ // %.offload_ptrs = alloca [3 x ptr], align 8
+ // %.offload_mappers = alloca [3 x ptr], align 8
+ // ;; target region has been outlined and now we need to
+ // ;; offload to it via a target task.
+ // }
+ // void outlined_device_function(ptr a, ptr b, ptr c) {
+ // *a = *b + *c
+ // }
+ //
+ // We have to now do the following
+ // (i) Make an offloading call to outlined_device_function using the OpenMP
+ // RTL. See 'kernel_launch_function' in the pseudo code below. This is
+ // emitted by emitKernelLaunch
+ // (ii) Create a task entry point function that calls kernel_launch_function
+ // and is the entry point for the target task. See
+ // '@.omp_target_task_proxy_func' in the pseudo code below.
+ // (iii) Create a task with the task entry point created in (ii)
+ //
+ // That is we create the following
+ //
+ // void user_code_that_offloads(...) {
+ // %.offload_baseptrs = alloca [3 x ptr], align 8
+ // %.offload_ptrs = alloca [3 x ptr], align 8
+ // %.offload_mappers = alloca [3 x ptr], align 8
+ //
+ // %structArg = alloca { ptr, ptr, ptr }, align 8
+ // %structArg[0] = %.offload_baseptrs
+ // %structArg[1] = %.offload_ptrs
+ // %structArg[2] = %.offload_mappers
+ // proxy_target_task = @__kmpc_omp_task_alloc(...,
+ // @.omp_target_task_proxy_func)
+ // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+ // dependencies_array = ...
+ // ;; if nowait not present
+ // call @__kmpc_omp_wait_deps(..., dependencies_array)
+ // call @__kmpc_omp_task_begin_if0(...)
+ // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
+ // call @__kmpc_omp_task_complete_if0(...)
+ // }
+ //
+ // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
+ // ptr %task) {
+ // %structArg = alloca {ptr, ptr, ptr}
+ // %shared_data = load (getelementptr %task, 0, 0)
+ // memcpy(%structArg, %shared_data, sizeof(structArg))
+ // kernel_launch_function(%thread.id, %structArg)
+ // }
+ //
+ // We need the proxy function because the signature of the task entry point
+ // expected by kmpc_omp_task is always the same and will be different from
+ // that of the kernel_launch function.
+ //
+ // kernel_launch_function is generated by emitKernelLaunch and has the
+ // always_inline attribute.
+ // void kernel_launch_function(thread_id, structArg) alwaysinline {
+ // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
+ // offload_baseptrs = load(getelementptr structArg, 0, 0)
+ // offload_ptrs = load(getelementptr structArg, 0, 1)
+ // offload_mappers = load(getelementptr structArg, 0, 2)
+ // ; setup kernel_args using offload_baseptrs, offload_ptrs and
+ // ; offload_mappers
+ // call i32 @__tgt_target_kernel(...,
+ // outlined_device_function,
+ // ptr %kernel_args)
+ // }
+ // void outlined_device_function(ptr a, ptr b, ptr c) {
+ // *a = *b + *c
+ // }
+ //
+ BasicBlock *TargetTaskBodyBB =
+ splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
+ BasicBlock *TargetTaskAllocaBB =
+ splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
+
+ InsertPointTy TargetTaskAllocaIP =
+ InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
+ InsertPointTy TargetTaskBodyIP =
+ InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+
+ OutlineInfo OI;
+ OI.EntryBB = TargetTaskAllocaBB;
+ OI.OuterAllocaBB = AllocaIP.getBlock();
+
+ // Add the thread ID argument.
+ std::stack<Instruction *> ToBeDeleted;
+ OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
+ Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
+
+ Builder.restoreIP(TargetTaskBodyIP);
+
+ // emitKernelLaunch makes the necessary runtime call to offload the kernel.
+ // We then outline all that code into a separate function
+ // ('kernel_launch_function' in the pseudo code above). This function is then
+ // called by the target task proxy function (see
+ // '@.omp_target_task_proxy_func' in the pseudo code above)
+ // '@.omp_target_task_proxy_func' is generated by emitProxyTaskFunction.
+ Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
+ EmitTargetCallFallbackCB, Args, DeviceID,
+ RTLoc, TargetTaskAllocaIP));
+
+ OI.ExitBB = Builder.saveIP().getBlock();
+ OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
+ HasNoWait](Function &OutlinedFn) mutable {
+ assert(OutlinedFn.getNumUses() == 1 &&
+ "there must be a single user for the outlined function");
+
+ CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+ bool HasShareds = StaleCI->arg_size() > 1;
+
+ LLVM_DEBUG(dbgs() << "StaleCI in PostOutlineCB in emitTargetTask = "
+ << *StaleCI << "\n");
+ LLVM_DEBUG(dbgs() << "Module in PostOutlineCB in emitTargetTask = "
+ << *(StaleCI->getParent()->getParent()->getParent())
+ << "\n");
+
+ Function *ProxyFn = emitProxyTaskFunction(*this, Builder, StaleCI);
+
+ LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
+ << "\n");
+
+ Builder.SetInsertPoint(StaleCI);
+
+ // Gather the arguments for emitting the runtime call for
+ // @__kmpc_omp_task_alloc.
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr =
+ getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+ // @__kmpc_omp_task_alloc
+ Function *TaskAllocFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+
+ // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) of the runtime
+ // call.
+ Value *ThreadID = getOrCreateThreadID(Ident);
+
+ // Argument - `sizeof_kmp_task_t` (TaskSize)
+ // TaskSize refers to the size in bytes of kmp_task_t data structure
+ // including private vars accessed in task.
+ // TODO: add kmp_task_t_with_privates (privates)
+ Value *TaskSize = Builder.getInt64(
+ divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
----------------
Meinersbur wrote:
```suggestion
Value *TaskSize = Builder.getInt64(
M.getDataLayout().getTypeStoreSize(Task));
```
There usually isn't a reason to manually divide by 8.
https://github.com/llvm/llvm-project/pull/93977
More information about the cfe-commits
mailing list