[Mlir-commits] [clang] [llvm] [mlir] [OMPIRBuilder] - Handle dependencies in `createTarget` (PR #93977)

Fri Jun 7 11:05:41 PDT 2024

================
@@ -5229,13 +5362,288 @@ static void emitTargetOutlinedFunction(
   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                       OutlinedFn, OutlinedFnID);
 }
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
+    Function *OutlinedFn, Value *OutlinedFnID,
+    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
+    Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+    SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
+    bool HasNoWait) {
+
+  // When we arrive at this function, the target region itself has been
+  // outlined into the function OutlinedFn.
+  // So at ths point, for
+  // --------------------------------------------------
+  //   void user_code_that_offloads(...) {
+  //     omp target depend(..) map(from:a) map(to:b, c)
+  //        a = b + c
+  //   }
+  //
+  // --------------------------------------------------
+  //
+  // we have
+  //
+  // --------------------------------------------------
+  //
+  //   void user_code_that_offloads(...) {
+  //     %.offload_baseptrs = alloca [3 x ptr], align 8
+  //     %.offload_ptrs = alloca [3 x ptr], align 8
+  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     ;; target region has been outlined and now we need to
+  //     ;; offload to it via a target task.
+  //   }
+  //   void outlined_device_function(ptr a, ptr b, ptr c) {
+  //     *a = *b + *c
+  //   }
+  //
+  // We have to now do the following
+  // (i)   Make an offloading call to outlined_device_function using the OpenMP
+  //       RTL. See 'kernel_launch_function' in the pseudo code below. This is
+  //       emitted by emitKernelLaunch
+  // (ii)  Create a task entry point function that calls kernel_launch_function
+  //       and is the entry point for the target task. See
+  //       '@.omp_target_task_proxy_func in the pseudocode below.
+  // (iii) Create a task with the task entry point created in (ii)
+  //
+  // That is we create the following
+  //
+  //   void user_code_that_offloads(...) {
+  //     %.offload_baseptrs = alloca [3 x ptr], align 8
+  //     %.offload_ptrs = alloca [3 x ptr], align 8
+  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //
+  //     %structArg = alloca { ptr, ptr, ptr }, align 8
+  //     %strucArg[0] = %.offload_baseptrs
+  //     %strucArg[1] = %.offload_ptrs
+  //     %strucArg[2] = %.offload_mappers
+  //     proxy_target_task = @__kmpc_omp_task_alloc(...,
+  //                                               @.omp_target_task_proxy_func)
+  //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+  //     dependencies_array = ...
+  //     ;; if nowait not present
+  //     call @__kmpc_omp_wait_deps(..., dependencies_array)
+  //     call @__kmpc_omp_task_begin_if0(...)
+  //     call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
+  //     %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
+  //   }
+  //
+  //   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
+  //                                                     ptr %task) {
+  //       %structArg = alloca {ptr, ptr, ptr}
+  //       %shared_data = load (getelementptr %task, 0, 0)
+  //       mempcy(%structArg, %shared_data, sizeof(structArg))
+  //       kernel_launch_function(%thread.id, %structArg)
+  //   }
+  //
+  //   We need the proxy function because the signature of the task entry point
+  //   expected by kmpc_omp_task is always the same and will be different from
+  //   that of the kernel_launch function.
+  //
+  //   kernel_launch_function is generated by emitKernelLaunch and has the
+  //   always_inline attribute. void kernel_launch_function(thread_id,
+  //                                                        structArg)
+  //                                                        alwaysinline {
+  //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
+  //       offload_baseptrs = load(getelementptr structArg, 0, 0)
+  //       offload_ptrs = load(getelementptr structArg, 0, 1)
+  //       offload_mappers = load(getelementptr structArg, 0, 2)
+  //       ; setup kernel_args using offload_baseptrs, offload_ptrs and
+  //       ; offload_mappers
+  //       call i32 @__tgt_target_kernel(...,
+  //                                     outlined_device_function,
+  //                                     ptr %kernel_args)
+  //   }
+  //   void outlined_device_function(ptr a, ptr b, ptr c) {
+  //      *a = *b + *c
+  //   }
+  //
+  BasicBlock *TargetTaskBodyBB =
+      splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
+  BasicBlock *TargetTaskAllocaBB =
+      splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
+
+  InsertPointTy TargetTaskAllocaIP =
+      InsertPointTy(TargetTaskAllocaBB, TargetTaskAllocaBB->begin());
+  InsertPointTy TargetTaskBodyIP =
+      InsertPointTy(TargetTaskBodyBB, TargetTaskBodyBB->begin());
+
+  OutlineInfo OI;
+  OI.EntryBB = TargetTaskAllocaBB;
+  OI.OuterAllocaBB = AllocaIP.getBlock();
+
+  // Add the thread ID argument.
+  std::stack<Instruction *> ToBeDeleted;
----------------
Meinersbur wrote:

```suggestion
  SmallVector<Instruction *> ToBeDeleted;
```
`std::stack` uses `std::deque` (by default), which is quite malloc-heavy. From cppreference:
>  deques typically have large minimal memory cost; a deque holding just one element has to allocate its full internal array (e.g. 8 times the object size on 64-bit libstdc++; 16 times the object size or 4096 bytes, whichever is larger, on 64-bit libc++). 

https://github.com/llvm/llvm-project/pull/93977