[Openmp-commits] [openmp] [libomptarget][OpenMP] Initial implementation of omp_target_memset() and omp_target_memset_async() (PR #68706)

Michael Klemm via Openmp-commits openmp-commits at lists.llvm.org
Tue Oct 10 10:21:42 PDT 2023


https://github.com/mjklemm updated https://github.com/llvm/llvm-project/pull/68706

>From 06d5eefaffca64e9b99f3a88b3f88f2203fa3243 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Sun, 1 Oct 2023 17:28:22 +0200
Subject: [PATCH 1/6] Add stub functions and exports for omp_target_memset()

---
 openmp/libomptarget/include/omptarget.h       |  1 +
 openmp/libomptarget/src/api.cpp               | 11 +++++
 openmp/libomptarget/src/exports               |  2 +
 .../libomptarget/test/api/omp_target_memset.c | 45 +++++++++++++++++++
 openmp/runtime/src/dllexports                 |  2 +
 openmp/runtime/src/include/omp.h.var          |  5 +++
 openmp/runtime/src/include/omp_lib.f90.var    | 22 +++++++++
 openmp/runtime/src/include/omp_lib.h.var      | 22 +++++++++
 openmp/runtime/src/kmp_ftn_os.h               |  2 +
 9 files changed, 112 insertions(+)
 create mode 100644 openmp/libomptarget/test/api/omp_target_memset.c

diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index f87557a69eff272..e1f0f77849fa206 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                            const size_t *DstDimensions,
                            const size_t *SrcDimensions, int DstDevice,
                            int SrcDevice);
+void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
 int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                              size_t Size, size_t DeviceOffset, int DeviceNum);
 int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 942df8fdb94d660..595b234d535c8e0 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -241,6 +241,17 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
   return Rc;
 }
 
+EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) {
+  return nullptr;
+}
+
+EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum,
+                                      int DepObjCount,
+                                      omp_depend_t * DepObjList) {
+  return nullptr;
+}
+
+
 // Allocate and launch helper task
 static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
                                        int DepObjCount,
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index c29c8d03fb1276f..af882a264264725 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -44,6 +44,8 @@ VERS1.0 {
     omp_target_memcpy_rect;
     omp_target_memcpy_async;
     omp_target_memcpy_rect_async;
+    omp_target_memset;
+    omp_target_memset_async;
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
     llvm_omp_target_alloc_host;
diff --git a/openmp/libomptarget/test/api/omp_target_memset.c b/openmp/libomptarget/test/api/omp_target_memset.c
new file mode 100644
index 000000000000000..ea0c6d73fbdeefb
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memset.c
@@ -0,0 +1,45 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  int q[128], i;
+  void *p;
+  void *result;
+
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  p = omp_target_alloc(130 * sizeof(int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; i++)
+    q[i] = i;
+
+  result = omp_target_memset(p, 0, 130 * sizeof(int), d);
+  if (result != p) {
+    abort();
+  }
+
+  int q2[128];
+  for (i = 0; i < 128; ++i)
+    q2[i] = i;
+  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
+                              0, NULL))
+    abort();
+
+#pragma omp taskwait
+
+  for (i = 0; i < 128; ++i)
+    if (q2[i] != 0)
+      abort();
+
+  omp_target_free(p, d);
+
+  return 0;
+}
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index e69cf6670e81489..0d49643709e0a05 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -518,6 +518,8 @@ kmp_set_warnings_off                        780
         omp_target_memcpy_rect              887
         omp_target_associate_ptr            888
         omp_target_disassociate_ptr         889
+        omp_target_memset                   3000
+        omp_target_memset_async             3001
     %endif
 
 kmp_set_disp_num_buffers                    890
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 1b2c467a2a12d8a..f372402be37d39c 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -236,6 +236,11 @@
     extern int    __KAI_KMPC_CONVENTION  omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
                                              const size_t *, const size_t *, const size_t *, const size_t *, int, int,
                                              int, omp_depend_t *);
+
+    /* OpenMP 6.0 device memory routines */
+    extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
+    extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);
+
     /*!
      * The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
      */
diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var
index c72287422809aef..1ca542db3767ef2 100644
--- a/openmp/runtime/src/include/omp_lib.f90.var
+++ b/openmp/runtime/src/include/omp_lib.f90.var
@@ -635,6 +635,28 @@
             integer (omp_depend_kind), optional :: depobj_list(*)
           end function omp_target_memcpy_rect_async
 
+          function omp_target_memset(ptr, val, count, device_num) bind(c)
+            use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+            type(c_ptr) :: omp_target_memset
+            type(c_ptr), value :: ptr
+            integer(c_int), value :: val
+            integer(c_size_t), value :: count
+            integer(c_int), value :: device_num
+          end function
+
+          function omp_target_memset_async(ptr, val, count, device_num, &
+                                           depobj_count, depobj_list) bind(c)
+            use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+            use omp_lib_kinds
+            type(c_ptr) :: omp_target_memset_async
+            type(c_ptr), value :: ptr
+            integer(c_int), value :: val
+            integer(c_size_t), value :: count
+            integer(c_int), value :: device_num
+            integer(c_int), value :: depobj_count
+            integer(omp_depend_kind), optional :: depobj_list(*)
+          end function
+
           function omp_target_associate_ptr(host_ptr, device_ptr, size,        &
               device_offset, device_num) bind(c)
             use omp_lib_kinds
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 9f5e58515e75159..d20aade6ef8b327 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -732,6 +732,28 @@
           integer(omp_depend_kind), optional :: depobj_list(*)
         end function omp_target_memcpy_rect_async
 
+        function omp_target_memset(ptr, val, count, device_num) bind(c)
+          use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+          type(c_ptr) :: omp_target_memset
+          type(c_ptr), value :: ptr
+          integer(c_int), value :: val
+          integer(c_size_t), value :: count
+          integer(c_int), value :: device_num
+        end function
+
+        function omp_target_memset_async(ptr, val, count, device_num, &
+                                         depobj_count, depobj_list) bind(c)
+          use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+          use omp_lib_kinds
+          type(c_ptr) :: omp_target_memset_async
+          type(c_ptr), value :: ptr
+          integer(c_int), value :: val
+          integer(c_size_t), value :: count
+          integer(c_int), value :: device_num
+          integer(c_int), value :: depobj_count
+          integer(omp_depend_kind), optional :: depobj_list(*)
+        end function
+
         function omp_target_associate_ptr(host_ptr, device_ptr, size,                                                               &
      &      device_offset, device_num) bind(c)
           use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index d37c9c86028eb13..7d595b947f4a9de 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -116,6 +116,8 @@
 #define FTN_TARGET_IS_PRESENT omp_target_is_present
 #define FTN_TARGET_MEMCPY omp_target_memcpy
 #define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
+#define FTN_TARGET_MEMSET omp_target_memset
+#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
 #define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
 #define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
 #endif

>From c1791d6b9509045b38f16b77959fdc0b4d5b7b78 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Mon, 2 Oct 2023 16:14:55 +0200
Subject: [PATCH 2/6] Implement a slow-path version of omp_target_memset*()

There is a TODO to implement a fast path that uses an on-device
kernel instead of the host-based memory fill operation.  This may
require some additional plumbing to have kernels in libomptarget.so
---
 openmp/libomptarget/src/api.cpp   | 116 +++++++++++++++++++++++++++---
 openmp/libomptarget/src/private.h |  13 ++++
 2 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 595b234d535c8e0..c8675b698544f3a 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -241,21 +241,121 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
   return Rc;
 }
 
+static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+  if (!Task) {
+    return OFFLOAD_FAIL;
+  }
+
+  auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
+  if (!Args) {
+    return OFFLOAD_FAIL;
+  }
+
+  // call omp_target_memset()
+  omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum);
+
+  delete Args;
+
+  return OFFLOAD_SUCCESS;
+}
+
+static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
+                                              int DepObjCount,
+                                              omp_depend_t * DepObjList) {
+  // Create global thread ID
+  int Gtid = __kmpc_global_thread_num(nullptr);
+  int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper;
+
+  // Setup the hidden helper flags
+  kmp_int32 Flags = 0;
+  kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
+  InputFlags->hidden_helper = 1;
+
+  // Alloc the helper task
+  kmp_task_t *Task = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
+                                                 sizeof(kmp_task_t), 0, Fn, -1);
+  if (!Task) {
+    delete Args;
+    return OFFLOAD_FAIL;
+  }
+
+  // Setup the arguments for the helper task
+  Task->shareds = Args;
+
+  // Convert types of depend objects
+  llvm::SmallVector<kmp_depend_info_t> DepObjs;
+  for (int i = 0; i < DepObjCount; i++) {
+    omp_depend_t DepObj = DepObjList[i];
+    DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
+  }
+
+  // Launch the helper task
+  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
+                                     DepObjs.data(), 0, nullptr);
+
+  return Rc;
+}
+
 EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) {
-  return nullptr;
+  TIMESCOPE();
+  DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
+     DeviceNum, Ptr, N);
+
+  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+  // of unspecified behavior, see OpenMP spec).
+  if (!Ptr || N == 0) {
+    return Ptr;
+  }
+
+  if (DeviceNum == omp_get_initial_device()) {
+    DP("filling memory on host via memset");
+    memset(Ptr, C, N);  // ignore return value, memset() cannot fail
+  }
+  else {
+    // TODO: replace the omp_target_memset() slow path with the fast path.
+    // That will require the ability to execute a kernel from within
+    // libomptarget.so (which we do not have at the moment).
+
+    // This is a very slow path: create a filled array on the host and upload
+    // it to the GPU device.
+    int InitialDevice = omp_get_initial_device();
+    void * Shadow = omp_target_alloc(N, InitialDevice);
+    memset(Shadow, C, N);
+    omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
+    omp_target_free(Shadow, InitialDevice);
+  }
+
+  DP("omp_target_memset returns %p\n", Ptr);
+  return Ptr;
 }
 
+
 EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum,
                                       int DepObjCount,
                                       omp_depend_t * DepObjList) {
-  return nullptr;
-}
+  DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
+     DeviceNum, Ptr, N);
+
+  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+  // of unspecified behavior, see OpenMP spec).
+  if (!Ptr || N == 0) {
+    return Ptr;
+  }
 
+  // Create the task object to deal with the async invocation
+  auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum};
+
+  // omp_target_memset_async() cannot fail via a return code, so ignore the
+  // return code of the helper function
+  (void) libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
+
+  return Ptr;
+}
 
 // Allocate and launch helper task
-static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
-                                       int DepObjCount,
-                                       omp_depend_t *DepObjList) {
+static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args,
+                                              int DepObjCount,
+                                              omp_depend_t *DepObjList) {
   // Create global thread ID
   int Gtid = __kmpc_global_thread_num(nullptr);
   int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
@@ -313,7 +413,7 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
       Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
 
   // Create and launch helper task
-  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+  int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
 
   DP("omp_target_memcpy_async returns %d\n", Rc);
   return Rc;
@@ -410,7 +510,7 @@ EXTERN int omp_target_memcpy_rect_async(
       DstDimensions, SrcDimensions, DstDevice, SrcDevice);
 
   // Create and launch helper task
-  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+  int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
 
   DP("omp_target_memcpy_rect_async returns %d\n", Rc);
   return Rc;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index cbce15b63a3eba2..ec1ecd5503e14fb 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -253,6 +253,19 @@ struct TargetMemcpyArgsTy {
         DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
         DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
 };
+
+struct TargetMemsetArgsTy {
+  /**
+   * Common attributes of a memset operation
+   */
+  void *Ptr;
+  int C;
+  size_t N;
+  int DeviceNum;
+
+  // no constructors defined, because this is a PoD
+};
+
 // Invalid GTID as defined by libomp; keep in sync
 #define KMP_GTID_DNE (-2)
 #ifdef __cplusplus

>From 97dba336319777ee3c06ed76be2dc41ee87f01cf Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Wed, 4 Oct 2023 16:05:38 +0200
Subject: [PATCH 3/6] Refactor code a bit to reduce code duplication

---
 openmp/libomptarget/src/api.cpp | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index c8675b698544f3a..baf5878595b123c 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -259,6 +259,16 @@ static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
   return OFFLOAD_SUCCESS;
 }
 
+static inline
+void ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec,
+                         int DepObjCount, omp_depend_t * DepObjList) {
+  for (int i = 0; i < DepObjCount; i++) {
+    omp_depend_t DepObj = DepObjList[i];
+    Vec.push_back(*((kmp_depend_info_t *)DepObj));
+  }
+}
+
+
 static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
                                               int DepObjCount,
                                               omp_depend_t * DepObjList) {
@@ -284,10 +294,7 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
 
   // Convert types of depend objects
   llvm::SmallVector<kmp_depend_info_t> DepObjs;
-  for (int i = 0; i < DepObjCount; i++) {
-    omp_depend_t DepObj = DepObjList[i];
-    DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
-  }
+  ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
 
   // Launch the helper task
   int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
@@ -381,10 +388,7 @@ static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args,
 
   // Convert the type of depend objects
   llvm::SmallVector<kmp_depend_info_t> DepObjs;
-  for (int i = 0; i < DepObjCount; i++) {
-    omp_depend_t DepObj = DepObjList[i];
-    DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
-  }
+  ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
 
   // Launch the helper task
   int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,

>From 33e7eb64f56a8e18eb6498a332b98fe8a89f70c2 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Fri, 6 Oct 2023 14:44:42 +0200
Subject: [PATCH 4/6] Apply formatting rules

---
 openmp/libomptarget/src/api.cpp | 39 +++++++++++++++------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index baf5878595b123c..0b8a21652ee9b24 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -259,19 +259,18 @@ static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
   return OFFLOAD_SUCCESS;
 }
 
-static inline
-void ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec,
-                         int DepObjCount, omp_depend_t * DepObjList) {
-  for (int i = 0; i < DepObjCount; i++) {
+static inline void
+ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
+                    omp_depend_t *DepObjList) {
+  for (int i = 0; i < DepObjCount; ++i) {
     omp_depend_t DepObj = DepObjList[i];
     Vec.push_back(*((kmp_depend_info_t *)DepObj));
   }
 }
 
-
 static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
                                               int DepObjCount,
-                                              omp_depend_t * DepObjList) {
+                                              omp_depend_t *DepObjList) {
   // Create global thread ID
   int Gtid = __kmpc_global_thread_num(nullptr);
   int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper;
@@ -282,8 +281,8 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
   InputFlags->hidden_helper = 1;
 
   // Alloc the helper task
-  kmp_task_t *Task = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
-                                                 sizeof(kmp_task_t), 0, Fn, -1);
+  kmp_task_t *Task = __kmpc_omp_target_task_alloc(
+      nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
   if (!Task) {
     delete Args;
     return OFFLOAD_FAIL;
@@ -303,7 +302,7 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
   return Rc;
 }
 
-EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) {
+EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
   TIMESCOPE();
   DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
      DeviceNum, Ptr, N);
@@ -316,9 +315,8 @@ EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) {
 
   if (DeviceNum == omp_get_initial_device()) {
     DP("filling memory on host via memset");
-    memset(Ptr, C, N);  // ignore return value, memset() cannot fail
-  }
-  else {
+    memset(Ptr, C, N); // ignore return value, memset() cannot fail
+  } else {
     // TODO: replace the omp_target_memset() slow path with the fast path.
     // That will require the ability to execute a kernel from within
     // libomptarget.so (which we do not have at the moment).
@@ -326,20 +324,19 @@ EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) {
     // This is a very slow path: create a filled array on the host and upload
     // it to the GPU device.
     int InitialDevice = omp_get_initial_device();
-    void * Shadow = omp_target_alloc(N, InitialDevice);
-    memset(Shadow, C, N);
-    omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
-    omp_target_free(Shadow, InitialDevice);
+    void *Shadow = omp_target_alloc(N, InitialDevice);
+    (void)memset(Shadow, C, N);
+    (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
+    (void)omp_target_free(Shadow, InitialDevice);
   }
 
   DP("omp_target_memset returns %p\n", Ptr);
   return Ptr;
 }
 
-
-EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum,
-                                      int DepObjCount,
-                                      omp_depend_t * DepObjList) {
+EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum,
+                                     int DepObjCount,
+                                     omp_depend_t *DepObjList) {
   DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
      DeviceNum, Ptr, N);
 
@@ -354,7 +351,7 @@ EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum
 
   // omp_target_memset_async() cannot fail via a return code, so ignore the
   // return code of the helper function
-  (void) libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
+  (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
 
   return Ptr;
 }

>From 7933bfa051acaecd801eb95bddcdbae8456d06c3 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Tue, 10 Oct 2023 17:16:01 +0200
Subject: [PATCH 5/6] (Partly) Address feedback by shiltian

---
 openmp/libomptarget/src/api.cpp | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 0b8a21652ee9b24..c8e74ee89bf1421 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -242,9 +242,8 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
 }
 
 static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
-  if (!Task) {
+  if (!Task)
     return OFFLOAD_FAIL;
-  }
 
   auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
   if (!Args) {
@@ -302,20 +301,20 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
   return Rc;
 }
 
-EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
+EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum) {
   TIMESCOPE();
   DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
-     DeviceNum, Ptr, N);
+     DeviceNum, Ptr, NumBytes);
 
   // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
   // of unspecified behavior, see OpenMP spec).
-  if (!Ptr || N == 0) {
+  if (!Ptr || NumBytes == 0) {
     return Ptr;
   }
 
   if (DeviceNum == omp_get_initial_device()) {
     DP("filling memory on host via memset");
-    memset(Ptr, C, N); // ignore return value, memset() cannot fail
+    memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
   } else {
     // TODO: replace the omp_target_memset() slow path with the fast path.
     // That will require the ability to execute a kernel from within
@@ -324,9 +323,9 @@ EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
     // This is a very slow path: create a filled array on the host and upload
     // it to the GPU device.
     int InitialDevice = omp_get_initial_device();
-    void *Shadow = omp_target_alloc(N, InitialDevice);
-    (void)memset(Shadow, C, N);
-    (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
+    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+    (void)memset(Shadow, ByteVal, NumBytes);
+    (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, InitialDevice);
     (void)omp_target_free(Shadow, InitialDevice);
   }
 
@@ -334,20 +333,20 @@ EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
   return Ptr;
 }
 
-EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum,
+EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum,
                                      int DepObjCount,
                                      omp_depend_t *DepObjList) {
   DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
-     DeviceNum, Ptr, N);
+     DeviceNum, Ptr, NumBytes);
 
   // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
   // of unspecified behavior, see OpenMP spec).
-  if (!Ptr || N == 0) {
+  if (!Ptr || NumBytes == 0) {
     return Ptr;
   }
 
   // Create the task object to deal with the async invocation
-  auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum};
+  auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};
 
   // omp_target_memset_async() cannot fail via a return code, so ignore the
   // return code of the helper function

>From 5e87a5708d4f7902a14893b7a5e5af38c311759b Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Tue, 10 Oct 2023 19:20:54 +0200
Subject: [PATCH 6/6] Slightly reduce code complexity, merge task helpers for
 memset/memcpy

---
 openmp/libomptarget/src/api.cpp | 72 +++++++++------------------------
 1 file changed, 19 insertions(+), 53 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index c8e74ee89bf1421..ff51c73b58da162 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -210,7 +210,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
 }
 
 // The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
-static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+static int libomp_target_memcpy_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
   if (Task == nullptr)
     return OFFLOAD_FAIL;
 
@@ -241,7 +241,7 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
   return Rc;
 }
 
-static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
   if (!Task)
     return OFFLOAD_FAIL;
 
@@ -267,12 +267,12 @@ ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
   }
 }
 
-static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
-                                              int DepObjCount,
-                                              omp_depend_t *DepObjList) {
+template <class T>
+static inline int
+libomp_helper_task_creation(T *Args, int (*Fn)(kmp_int32, kmp_task_t *),
+                            int DepObjCount, omp_depend_t *DepObjList) {
   // Create global thread ID
   int Gtid = __kmpc_global_thread_num(nullptr);
-  int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper;
 
   // Setup the hidden helper flags
   kmp_int32 Flags = 0;
@@ -301,7 +301,8 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
   return Rc;
 }
 
-EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum) {
+EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
+                               int DeviceNum) {
   TIMESCOPE();
   DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
      DeviceNum, Ptr, NumBytes);
@@ -325,7 +326,8 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int Devi
     int InitialDevice = omp_get_initial_device();
     void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
     (void)memset(Shadow, ByteVal, NumBytes);
-    (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, InitialDevice);
+    (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+                            InitialDevice);
     (void)omp_target_free(Shadow, InitialDevice);
   }
 
@@ -333,66 +335,28 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int Devi
   return Ptr;
 }
 
-EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum,
-                                     int DepObjCount,
+EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes,
+                                     int DeviceNum, int DepObjCount,
                                      omp_depend_t *DepObjList) {
   DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
      DeviceNum, Ptr, NumBytes);
 
   // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
   // of unspecified behavior, see OpenMP spec).
-  if (!Ptr || NumBytes == 0) {
+  if (!Ptr || NumBytes == 0)
     return Ptr;
-  }
 
   // Create the task object to deal with the async invocation
   auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};
 
   // omp_target_memset_async() cannot fail via a return code, so ignore the
   // return code of the helper function
-  (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
+  (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task,
+                                    DepObjCount, DepObjList);
 
   return Ptr;
 }
 
-// Allocate and launch helper task
-static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args,
-                                              int DepObjCount,
-                                              omp_depend_t *DepObjList) {
-  // Create global thread ID
-  int Gtid = __kmpc_global_thread_num(nullptr);
-  int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
-
-  // Setup the hidden helper flags;
-  kmp_int32 Flags = 0;
-  kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
-  InputFlags->hidden_helper = 1;
-
-  // Alloc helper task
-  kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
-                                                 sizeof(kmp_task_t), 0, Fn, -1);
-
-  if (Ptr == nullptr) {
-    // Task allocation failed, delete the argument object
-    delete Args;
-
-    return OFFLOAD_FAIL;
-  }
-
-  // Setup the arguments passed to helper task
-  Ptr->shareds = Args;
-
-  // Convert the type of depend objects
-  llvm::SmallVector<kmp_depend_info_t> DepObjs;
-  ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
-
-  // Launch the helper task
-  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
-                                     DepObjs.data(), 0, nullptr);
-
-  return Rc;
-}
-
 EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
                                    size_t DstOffset, size_t SrcOffset,
                                    int DstDevice, int SrcDevice,
@@ -413,7 +377,8 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
       Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
 
   // Create and launch helper task
-  int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
+  int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
+                                       DepObjCount, DepObjList);
 
   DP("omp_target_memcpy_async returns %d\n", Rc);
   return Rc;
@@ -510,7 +475,8 @@ EXTERN int omp_target_memcpy_rect_async(
       DstDimensions, SrcDimensions, DstDevice, SrcDevice);
 
   // Create and launch helper task
-  int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
+  int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
+                                       DepObjCount, DepObjList);
 
   DP("omp_target_memcpy_rect_async returns %d\n", Rc);
   return Rc;



More information about the Openmp-commits mailing list