[Openmp-commits] [openmp] [libomptarget][OpenMP] Initial implementation of omp_target_memset() and omp_target_memset_async() (PR #68706)
via Openmp-commits
openmp-commits at lists.llvm.org
Tue Oct 10 07:00:22 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-openmp
Author: Michael Klemm (mjklemm)
<details>
<summary>Changes</summary>
This PR adds the newly landed OpenMP API routines omp_target_memset() and omp_target_memset_sync() to fill memory in the device memory. This first implementation is based on a slow path that initializes memory on the host and then issues a H2D transfer to fill the memory on the target device. A future PR will introduce an offloaded kernel (or driver-side API to do fill the memory on the target device)
---
Full diff: https://github.com/llvm/llvm-project/pull/68706.diff
10 Files Affected:
- (modified) openmp/libomptarget/include/omptarget.h (+1)
- (modified) openmp/libomptarget/src/api.cpp (+121-9)
- (modified) openmp/libomptarget/src/exports (+2)
- (modified) openmp/libomptarget/src/private.h (+13)
- (added) openmp/libomptarget/test/api/omp_target_memset.c (+45)
- (modified) openmp/runtime/src/dllexports (+2)
- (modified) openmp/runtime/src/include/omp.h.var (+5)
- (modified) openmp/runtime/src/include/omp_lib.f90.var (+22)
- (modified) openmp/runtime/src/include/omp_lib.h.var (+22)
- (modified) openmp/runtime/src/kmp_ftn_os.h (+2)
``````````diff
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index f87557a69eff272..e1f0f77849fa206 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstDimensions,
const size_t *SrcDimensions, int DstDevice,
int SrcDevice);
+void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 942df8fdb94d660..0b8a21652ee9b24 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -241,10 +241,125 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
return Rc;
}
+static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+ if (!Task) {
+ return OFFLOAD_FAIL;
+ }
+
+ auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
+ if (!Args) {
+ return OFFLOAD_FAIL;
+ }
+
+ // call omp_target_memset()
+ omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum);
+
+ delete Args;
+
+ return OFFLOAD_SUCCESS;
+}
+
+static inline void
+ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
+ omp_depend_t *DepObjList) {
+ for (int i = 0; i < DepObjCount; ++i) {
+ omp_depend_t DepObj = DepObjList[i];
+ Vec.push_back(*((kmp_depend_info_t *)DepObj));
+ }
+}
+
+static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
+ // Create global thread ID
+ int Gtid = __kmpc_global_thread_num(nullptr);
+ int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper;
+
+ // Setup the hidden helper flags
+ kmp_int32 Flags = 0;
+ kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
+ InputFlags->hidden_helper = 1;
+
+ // Alloc the helper task
+ kmp_task_t *Task = __kmpc_omp_target_task_alloc(
+ nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
+ if (!Task) {
+ delete Args;
+ return OFFLOAD_FAIL;
+ }
+
+ // Setup the arguments for the helper task
+ Task->shareds = Args;
+
+ // Convert types of depend objects
+ llvm::SmallVector<kmp_depend_info_t> DepObjs;
+ ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
+
+ // Launch the helper task
+ int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
+ DepObjs.data(), 0, nullptr);
+
+ return Rc;
+}
+
+EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
+ TIMESCOPE();
+ DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
+ DeviceNum, Ptr, N);
+
+ // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+ // of unspecified behavior, see OpenMP spec).
+ if (!Ptr || N == 0) {
+ return Ptr;
+ }
+
+ if (DeviceNum == omp_get_initial_device()) {
+ DP("filling memory on host via memset");
+ memset(Ptr, C, N); // ignore return value, memset() cannot fail
+ } else {
+ // TODO: replace the omp_target_memset() slow path with the fast path.
+ // That will require the ability to execute a kernel from within
+ // libomptarget.so (which we do not have at the moment).
+
+ // This is a very slow path: create a filled array on the host and upload
+ // it to the GPU device.
+ int InitialDevice = omp_get_initial_device();
+ void *Shadow = omp_target_alloc(N, InitialDevice);
+ (void)memset(Shadow, C, N);
+ (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
+ (void)omp_target_free(Shadow, InitialDevice);
+ }
+
+ DP("omp_target_memset returns %p\n", Ptr);
+ return Ptr;
+}
+
+EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
+ DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
+ DeviceNum, Ptr, N);
+
+ // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+ // of unspecified behavior, see OpenMP spec).
+ if (!Ptr || N == 0) {
+ return Ptr;
+ }
+
+ // Create the task object to deal with the async invocation
+ auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum};
+
+ // omp_target_memset_async() cannot fail via a return code, so ignore the
+ // return code of the helper function
+ (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
+
+ return Ptr;
+}
+
// Allocate and launch helper task
-static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
- int DepObjCount,
- omp_depend_t *DepObjList) {
+static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
// Create global thread ID
int Gtid = __kmpc_global_thread_num(nullptr);
int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
@@ -270,10 +385,7 @@ static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
// Convert the type of depend objects
llvm::SmallVector<kmp_depend_info_t> DepObjs;
- for (int i = 0; i < DepObjCount; i++) {
- omp_depend_t DepObj = DepObjList[i];
- DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
- }
+ ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
// Launch the helper task
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
@@ -302,7 +414,7 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
// Create and launch helper task
- int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+ int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
DP("omp_target_memcpy_async returns %d\n", Rc);
return Rc;
@@ -399,7 +511,7 @@ EXTERN int omp_target_memcpy_rect_async(
DstDimensions, SrcDimensions, DstDevice, SrcDevice);
// Create and launch helper task
- int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+ int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
DP("omp_target_memcpy_rect_async returns %d\n", Rc);
return Rc;
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index c29c8d03fb1276f..af882a264264725 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -44,6 +44,8 @@ VERS1.0 {
omp_target_memcpy_rect;
omp_target_memcpy_async;
omp_target_memcpy_rect_async;
+ omp_target_memset;
+ omp_target_memset_async;
omp_target_associate_ptr;
omp_target_disassociate_ptr;
llvm_omp_target_alloc_host;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index cbce15b63a3eba2..ec1ecd5503e14fb 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -253,6 +253,19 @@ struct TargetMemcpyArgsTy {
DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
};
+
+struct TargetMemsetArgsTy {
+ /**
+ * Common attributes of a memset operation
+ */
+ void *Ptr;
+ int C;
+ size_t N;
+ int DeviceNum;
+
+ // no constructors defined, because this is a PoD
+};
+
// Invalid GTID as defined by libomp; keep in sync
#define KMP_GTID_DNE (-2)
#ifdef __cplusplus
diff --git a/openmp/libomptarget/test/api/omp_target_memset.c b/openmp/libomptarget/test/api/omp_target_memset.c
new file mode 100644
index 000000000000000..ea0c6d73fbdeefb
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memset.c
@@ -0,0 +1,45 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+ int d = omp_get_default_device();
+ int id = omp_get_initial_device();
+ int q[128], i;
+ void *p;
+ void *result;
+
+ if (d < 0 || d >= omp_get_num_devices())
+ d = id;
+
+ p = omp_target_alloc(130 * sizeof(int), d);
+ if (p == NULL)
+ return 0;
+
+ for (i = 0; i < 128; i++)
+ q[i] = i;
+
+ result = omp_target_memset(p, 0, 130 * sizeof(int), d);
+ if (result != p) {
+ abort();
+ }
+
+ int q2[128];
+ for (i = 0; i < 128; ++i)
+ q2[i] = i;
+ if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
+ 0, NULL))
+ abort();
+
+#pragma omp taskwait
+
+ for (i = 0; i < 128; ++i)
+ if (q2[i] != 0)
+ abort();
+
+ omp_target_free(p, d);
+
+ return 0;
+}
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index e69cf6670e81489..0d49643709e0a05 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -518,6 +518,8 @@ kmp_set_warnings_off 780
omp_target_memcpy_rect 887
omp_target_associate_ptr 888
omp_target_disassociate_ptr 889
+ omp_target_memset 3000
+ omp_target_memset_async 3001
%endif
kmp_set_disp_num_buffers 890
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 1b2c467a2a12d8a..f372402be37d39c 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -236,6 +236,11 @@
extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
const size_t *, const size_t *, const size_t *, const size_t *, int, int,
int, omp_depend_t *);
+
+ /* OpenMP 6.0 device memory routines */
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);
+
/*!
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
*/
diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var
index c72287422809aef..1ca542db3767ef2 100644
--- a/openmp/runtime/src/include/omp_lib.f90.var
+++ b/openmp/runtime/src/include/omp_lib.f90.var
@@ -635,6 +635,28 @@
integer (omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async
+ function omp_target_memset(ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ end function
+
+ function omp_target_memset_async(ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ use omp_lib_kinds
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ integer(c_int), value :: depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function
+
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
device_offset, device_num) bind(c)
use omp_lib_kinds
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 9f5e58515e75159..d20aade6ef8b327 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -732,6 +732,28 @@
integer(omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async
+ function omp_target_memset(ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ end function
+
+ function omp_target_memset_async(ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ use omp_lib_kinds
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ integer(c_int), value :: depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function
+
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
& device_offset, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index d37c9c86028eb13..7d595b947f4a9de 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -116,6 +116,8 @@
#define FTN_TARGET_IS_PRESENT omp_target_is_present
#define FTN_TARGET_MEMCPY omp_target_memcpy
#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
+#define FTN_TARGET_MEMSET omp_target_memset
+#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
#endif
``````````
</details>
https://github.com/llvm/llvm-project/pull/68706
More information about the Openmp-commits
mailing list