[Openmp-commits] [openmp] [libomptarget][OpenMP] Initial implementation of omp_target_memset() and omp_target_memset_async() (PR #68706)

Michael Klemm via Openmp-commits openmp-commits at lists.llvm.org
Tue Oct 10 10:25:38 PDT 2023


================
@@ -241,10 +241,125 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
   return Rc;
 }
 
+// Task entry point for the asynchronous memset: unpacks the arguments stored
+// in the task's shareds pointer, runs the synchronous omp_target_memset() on
+// them and then frees the argument object.
+//
+// Returns OFFLOAD_FAIL if the task or its argument object is missing,
+// OFFLOAD_SUCCESS otherwise.
+static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+  if (Task == nullptr)
+    return OFFLOAD_FAIL;
+
+  auto *MemsetArgs = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
+  if (MemsetArgs == nullptr)
+    return OFFLOAD_FAIL;
+
+  // The fill is delegated to the blocking variant. Its result is just the
+  // destination pointer and carries no error information, so it is dropped.
+  (void)omp_target_memset(MemsetArgs->Ptr, MemsetArgs->C, MemsetArgs->N,
+                          MemsetArgs->DeviceNum);
+
+  // The argument object is released here, after the fill has been issued.
+  delete MemsetArgs;
+  return OFFLOAD_SUCCESS;
+}
+
+// Appends to Vec a copy of the kmp_depend_info_t record behind each of the
+// first DepObjCount handles in DepObjList, preserving their order.
+static inline void
+ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
+                    omp_depend_t *DepObjList) {
+  int Idx = 0;
+  while (Idx < DepObjCount) {
+    // Each omp_depend_t handle is reinterpreted as a pointer to a
+    // kmp_depend_info_t, whose contents are copied into the vector.
+    auto *Info = (kmp_depend_info_t *)DepObjList[Idx];
+    Vec.push_back(*Info);
+    ++Idx;
+  }
+}
+
+// Allocates a hidden-helper task whose entry point is
+// libomp_target_memset_async_helper, attaches Args to it and launches it with
+// the dependences described by DepObjList.
+//
+// Args is deleted here if the task cannot be allocated; otherwise it is handed
+// to the task through Task->shareds. Returns OFFLOAD_FAIL on allocation
+// failure, else the value returned by __kmpc_omp_task_with_deps().
+static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
+                                              int DepObjCount,
+                                              omp_depend_t *DepObjList) {
+  // Global thread ID of the calling thread, needed by the runtime entry points
+  const int Gtid = __kmpc_global_thread_num(nullptr);
+
+  // Request a hidden-helper task by setting the corresponding field through
+  // the kmp_tasking_flags_t view of the flags word
+  kmp_int32 Flags = 0;
+  reinterpret_cast<kmp_tasking_flags_t *>(&Flags)->hidden_helper = 1;
+
+  // Allocate the task with the memset helper as its routine
+  kmp_task_t *Task = __kmpc_omp_target_task_alloc(
+      nullptr, Gtid, Flags, sizeof(kmp_task_t), 0,
+      &libomp_target_memset_async_helper, -1);
+  if (Task == nullptr) {
+    delete Args;
+    return OFFLOAD_FAIL;
+  }
+
+  // The helper routine reads its arguments back from shareds
+  Task->shareds = Args;
+
+  // Translate the user-facing depend objects into runtime dependence records
+  llvm::SmallVector<kmp_depend_info_t> DepObjs;
+  ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
+
+  // Hand the task over to the runtime together with its dependences
+  return __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
+                                   DepObjs.data(), 0, nullptr);
+}
+
+/// Fills the first \p N bytes of the memory at \p Ptr on device \p DeviceNum
+/// with the byte value \p C.
+///
+/// \param Ptr       destination pointer on device \p DeviceNum.
+/// \param C         fill value, passed on to memset().
+/// \param N         number of bytes to fill.
+/// \param DeviceNum device that owns \p Ptr; may be the initial (host) device.
+/// \returns \p Ptr in all cases. The interface has no way to report failure.
+EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
+  TIMESCOPE();
+  DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
+     DeviceNum, Ptr, N);
+
+  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+  // of unspecified behavior, see OpenMP spec).
+  if (!Ptr || N == 0) {
+    return Ptr;
+  }
+
+  if (DeviceNum == omp_get_initial_device()) {
+    DP("filling memory on host via memset\n");
+    memset(Ptr, C, N); // ignore return value, memset() cannot fail
+  } else {
+    // TODO: replace the omp_target_memset() slow path with the fast path.
+    // That will require the ability to execute a kernel from within
+    // libomptarget.so (which we do not have at the moment).
+
+    // This is a very slow path: create a filled array on the host and upload
+    // it to the GPU device.
+    int InitialDevice = omp_get_initial_device();
+    void *Shadow = omp_target_alloc(N, InitialDevice);
+    if (Shadow) {
+      (void)memset(Shadow, C, N);
+      (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
+      (void)omp_target_free(Shadow, InitialDevice);
+    } else {
+      // The staging buffer could not be allocated. omp_target_memset() has no
+      // way to signal an error, so skip the fill instead of dereferencing a
+      // null pointer and crashing the process.
+      DP("omp_target_memset failed to fill memory due to error with "
+         "omp_target_alloc\n");
+    }
+  }
+
+  DP("omp_target_memset returns %p\n", Ptr);
+  return Ptr;
+}
+
+/// Asynchronous variant of omp_target_memset(): packages the fill request into
+/// a helper task that is launched with the given depend objects.
+///
+/// \param Ptr         destination pointer on device \p DeviceNum.
+/// \param C           fill value.
+/// \param N           number of bytes to fill.
+/// \param DeviceNum   device that owns \p Ptr.
+/// \param DepObjCount number of entries in \p DepObjList.
+/// \param DepObjList  depend objects the helper task is launched with.
+/// \returns \p Ptr in all cases. The interface has no way to report failure.
+EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum,
+                                     int DepObjCount,
+                                     omp_depend_t *DepObjList) {
+  DP("Call to omp_target_memset_async, device %d, device pointer %p, size "
+     "%zu\n",
+     DeviceNum, Ptr, N);
+
+  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+  // of unspecified behavior, see OpenMP spec).
+  if (!Ptr || N == 0) {
+    return Ptr;
+  }
+
+  // Create the task object to deal with the async invocation. Ownership of
+  // Args passes to the task-creation helper, which frees it if the task cannot
+  // be allocated; otherwise the task routine frees it after the fill.
+  auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum};
+
+  // omp_target_memset_async() cannot fail via a return code, so ignore the
+  // return code of the helper function
+  (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
+
+  return Ptr;
+}
+
 // Allocate and launch helper task
-static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
-                                       int DepObjCount,
-                                       omp_depend_t *DepObjList) {
+static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args,
----------------
mjklemm wrote:

How about the last patch set?  It removes the duplication of the helper function, but I have retained the actual task function.  There's still a tiny bit of code duplication, but I felt that the reduction from using a callback would not save much and would come at the expense of making the actual control flow even harder to understand.  Let me know if you'd agree to that.

https://github.com/llvm/llvm-project/pull/68706


More information about the Openmp-commits mailing list