[Openmp-commits] [openmp] 4753a4e - [OpenMP] asynchronous memory copy support
JP Lehr via Openmp-commits
openmp-commits at lists.llvm.org
Thu Mar 30 12:15:24 PDT 2023
Author: Jisheng Zhao
Date: 2023-03-30T15:14:21-04:00
New Revision: 4753a4e31169800277c09a59b2181ba3d4ddd683
URL: https://github.com/llvm/llvm-project/commit/4753a4e31169800277c09a59b2181ba3d4ddd683
DIFF: https://github.com/llvm/llvm-project/commit/4753a4e31169800277c09a59b2181ba3d4ddd683.diff
LOG: [OpenMP] asynchronous memory copy support
We introduced the implementation of supporting asynchronous routines with depend objects specified in Version 5.1 of the OpenMP Application Programming Interface. In brief, these routines omp_target_memcpy_async and omp_target_memcpy_rect_async perform asynchronous (nonblocking) memory copies between any
combination of host and device pointers. The basic idea is to create the implicit tasks to carry the memory copy calls and handle dependencies specified by depend objects. The implicit tasks are executed via hidden helper thread in OpenMP runtime.
Reviewed By: jdoerfert, tianshilei1992
Committed By: jplehr
Differential Revision: https://reviews.llvm.org/D136103
Added:
openmp/libomptarget/test/api/omp_target_memcpy_async1.c
openmp/libomptarget/test/api/omp_target_memcpy_async2.c
openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
Modified:
openmp/libomptarget/include/interop.h
openmp/libomptarget/src/api.cpp
openmp/libomptarget/src/exports
openmp/libomptarget/src/private.h
Removed:
################################################################################
diff --git a/openmp/libomptarget/include/interop.h b/openmp/libomptarget/include/interop.h
index 696d2385664b3..ed3aa0d83a863 100644
--- a/openmp/libomptarget/include/interop.h
+++ b/openmp/libomptarget/include/interop.h
@@ -116,30 +116,6 @@ omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t);
extern const char *__KAI_KMPC_CONVENTION
omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);
-typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
- /* Compiler flags */ /* Total compiler flags must be 16 bits */
- unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
- unsigned final : 1; /* task is final(1) so execute immediately */
- unsigned merged_if0 : 1; // no __kmpc_task_{begin/complete}_if0 calls in if0
- unsigned destructors_thunk : 1; // set if the compiler creates a thunk to
- unsigned proxy : 1; // task is a proxy task (it will be executed outside the
- unsigned priority_specified : 1; // set if the compiler provides priority
- unsigned detachable : 1; // 1 == can detach */
- unsigned unshackled : 1; /* 1 == unshackled task */
- unsigned target : 1; /* 1 == target task */
- unsigned reserved : 7; /* reserved for compiler use */
- unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
- unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
- unsigned tasking_ser : 1; // all tasks in team are either executed immediately
- unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
- unsigned started : 1; /* 1==started, 0==not started */
- unsigned executing : 1; /* 1==executing, 0==not executing */
- unsigned complete : 1; /* 1==complete, 0==not complete */
- unsigned freed : 1; /* 1==freed, 0==allocated */
- unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
- unsigned reserved31 : 7; /* reserved for library use */
-} kmp_tasking_flags_t;
-
typedef enum omp_interop_backend_type_t {
// reserve 0
omp_interop_backend_type_cuda_1 = 1,
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 2dfe075b8e746..942df8fdb94d6 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -15,6 +15,8 @@
#include "private.h"
#include "rtl.h"
+#include "llvm/ADT/SmallVector.h"
+
#include <climits>
#include <cstdlib>
#include <cstring>
@@ -207,6 +209,105 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
return Rc;
}
+// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
+static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+ if (Task == nullptr)
+ return OFFLOAD_FAIL;
+
+ TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;
+
+ if (Args == nullptr)
+ return OFFLOAD_FAIL;
+
+ // Call blocked version
+ int Rc = OFFLOAD_SUCCESS;
+ if (Args->IsRectMemcpy) {
+ Rc = omp_target_memcpy_rect(
+ Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
+ Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
+ Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);
+
+ DP("omp_target_memcpy_rect returns %d\n", Rc);
+ } else {
+ Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
+ Args->SrcOffset, Args->DstDevice, Args->SrcDevice);
+
+ DP("omp_target_memcpy returns %d\n", Rc);
+ }
+
+ // Release the arguments object
+ delete Args;
+
+ return Rc;
+}
+
+// Allocate and launch helper task
+static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
+ // Create global thread ID
+ int Gtid = __kmpc_global_thread_num(nullptr);
+ int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
+
+ // Setup the hidden helper flags;
+ kmp_int32 Flags = 0;
+ kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
+ InputFlags->hidden_helper = 1;
+
+ // Alloc helper task
+ kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
+ sizeof(kmp_task_t), 0, Fn, -1);
+
+ if (Ptr == nullptr) {
+ // Task allocation failed, delete the argument object
+ delete Args;
+
+ return OFFLOAD_FAIL;
+ }
+
+ // Setup the arguments passed to helper task
+ Ptr->shareds = Args;
+
+ // Convert the type of depend objects
+ llvm::SmallVector<kmp_depend_info_t> DepObjs;
+ for (int i = 0; i < DepObjCount; i++) {
+ omp_depend_t DepObj = DepObjList[i];
+ DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
+ }
+
+ // Launch the helper task
+ int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
+ DepObjs.data(), 0, nullptr);
+
+ return Rc;
+}
+
+EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
+ size_t DstOffset, size_t SrcOffset,
+ int DstDevice, int SrcDevice,
+ int DepObjCount, omp_depend_t *DepObjList) {
+ TIMESCOPE();
+ DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
+ "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+ "src offset %zu, length %zu\n",
+ DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
+ Length);
+
+ // Check the source and dest address
+ if (Dst == nullptr || Src == nullptr)
+ return OFFLOAD_FAIL;
+
+ // Create task object
+ TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
+ Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
+
+ // Create and launch helper task
+ int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+
+ DP("omp_target_memcpy_async returns %d\n", Rc);
+ return Rc;
+}
+
EXTERN int
omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
int NumDims, const size_t *Volume,
@@ -267,6 +368,43 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
return Rc;
}
+EXTERN int omp_target_memcpy_rect_async(
+ void *Dst, const void *Src, size_t ElementSize, int NumDims,
+ const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
+ const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
+ int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
+ TIMESCOPE();
+ DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
+ "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+ "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+ "volume " DPxMOD ", element size %zu, num_dims %d\n",
+ DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
+ DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
+ DPxPTR(Volume), ElementSize, NumDims);
+
+ // Need to check this first to not return OFFLOAD_FAIL instead
+ if (!Dst && !Src) {
+ DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
+ INT_MAX);
+ return INT_MAX;
+ }
+
+ // Check the source and dest address
+ if (Dst == nullptr || Src == nullptr)
+ return OFFLOAD_FAIL;
+
+ // Create task object
+ TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
+ Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
+ DstDimensions, SrcDimensions, DstDevice, SrcDevice);
+
+ // Create and launch helper task
+ int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+
+ DP("omp_target_memcpy_rect_async returns %d\n", Rc);
+ return Rc;
+}
+
EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
size_t Size, size_t DeviceOffset,
int DeviceNum) {
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index 42682abf7786c..6c3fdf0950ab6 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -41,6 +41,8 @@ VERS1.0 {
omp_target_is_present;
omp_target_memcpy;
omp_target_memcpy_rect;
+ omp_target_memcpy_async;
+ omp_target_memcpy_rect_async;
omp_target_associate_ptr;
omp_target_disassociate_ptr;
llvm_omp_target_alloc_host;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 9f156192e1036..8b62c164039f1 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -104,7 +104,51 @@ extern "C" {
* We maintain the same data structure for compatibility.
*/
typedef int kmp_int32;
+typedef int64_t kmp_int64;
typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+ void *shareds;
+ kmp_routine_entry_t routine;
+ kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+ /* Compiler flags */ /* Total compiler flags must be 16 bits */
+ unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+ unsigned final : 1; /* task is final(1) so execute immediately */
+ unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+ code path */
+ unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+ unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+ unsigned priority_specified : 1; /* set if the compiler provides priority
+ setting for the task */
+ unsigned detachable : 1; /* 1 == can detach */
+ unsigned hidden_helper : 1; /* 1 == hidden helper task */
+ unsigned reserved : 8; /* reserved for compiler use */
+
+ /* Library flags */ /* Total library flags must be 16 bits */
+ unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
+ unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+ unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+ // (1) or may be deferred (0)
+ unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+ // (0) [>= 2 threads]
+ /* If either team_serial or tasking_ser is set, task team may be NULL */
+ /* Task State Flags: */
+ unsigned started : 1; /* 1==started, 0==not started */
+ unsigned executing : 1; /* 1==executing, 0==not executing */
+ unsigned complete : 1; /* 1==complete, 0==not complete */
+ unsigned freed : 1; /* 1==freed, 0==allocated */
+ unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+ unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
// Compiler sends us this info:
typedef struct kmp_depend_info {
kmp_intptr_t base_addr;
@@ -126,6 +170,86 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
__attribute__((weak));
bool __kmpc_omp_has_task_team(kmp_int32 gtid) __attribute__((weak));
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 flags, size_t sizeof_kmp_task_t,
+ size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry)
+ __attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_task_t *new_task, kmp_int32 ndeps,
+ kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias,
+ kmp_depend_info_t *noalias_dep_list)
+ __attribute__((weak));
+
+/**
+ * The argument set that is passed from asynchronous memory copy to block
+ * version of memory copy invoked in helper task
+ */
+struct TargetMemcpyArgsTy {
+ /**
+ * Common attribuutes
+ */
+ void *Dst;
+ const void *Src;
+ int DstDevice;
+ int SrcDevice;
+
+ /**
+ * The flag that denotes single dimensional or rectangle dimensional copy
+ */
+ bool IsRectMemcpy;
+
+ /**
+ * Arguments for single dimensional copy
+ */
+ size_t Length;
+ size_t DstOffset;
+ size_t SrcOffset;
+
+ /**
+ * Arguments for rectangle dimensional copy
+ */
+ size_t ElementSize;
+ int NumDims;
+ const size_t *Volume;
+ const size_t *DstOffsets;
+ const size_t *SrcOffsets;
+ const size_t *DstDimensions;
+ const size_t *SrcDimensions;
+
+ /**
+ * Constructor for single dimensional copy
+ */
+ TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length,
+ size_t DstOffset, size_t SrcOffset, int DstDevice,
+ int SrcDevice)
+ : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
+ IsRectMemcpy(false), Length(Length), DstOffset(DstOffset),
+ SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(0),
+ DstOffsets(0), SrcOffsets(0), DstDimensions(0), SrcDimensions(0){};
+
+ /**
+ * Constructor for rectangle dimensional copy
+ */
+ TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize,
+ int NumDims, const size_t *Volume,
+ const size_t *DstOffsets, const size_t *SrcOffsets,
+ const size_t *DstDimensions, const size_t *SrcDimensions,
+ int DstDevice, int SrcDevice)
+ : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
+ IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0),
+ ElementSize(ElementSize), NumDims(NumDims), Volume(Volume),
+ DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
+ DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
+};
// Invalid GTID as defined by libomp; keep in sync
#define KMP_GTID_DNE (-2)
#ifdef __cplusplus
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_async1.c b/openmp/libomptarget/test/api/omp_target_memcpy_async1.c
new file mode 100644
index 0000000000000..1abcfde83dbd8
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_async1.c
@@ -0,0 +1,48 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+// Test case for omp_target_memcpy_async, oringally from GCC
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+ int d = omp_get_default_device();
+ int id = omp_get_initial_device();
+ int q[128], i;
+ void *p;
+
+ if (d < 0 || d >= omp_get_num_devices())
+ d = id;
+
+ p = omp_target_alloc(130 * sizeof(int), d);
+ if (p == NULL)
+ return 0;
+
+ for (i = 0; i < 128; i++)
+ q[i] = i;
+
+ if (omp_target_memcpy_async(p, q, 128 * sizeof(int), sizeof(int), 0, d, id, 0,
+ NULL)) {
+ abort();
+ }
+
+#pragma omp taskwait
+
+ int q2[128];
+ for (i = 0; i < 128; ++i)
+ q2[i] = 0;
+ if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
+ 0, NULL))
+ abort();
+
+#pragma omp taskwait
+
+ for (i = 0; i < 128; ++i)
+ if (q2[i] != q[i])
+ abort();
+
+ omp_target_free(p, d);
+
+ return 0;
+}
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_async2.c b/openmp/libomptarget/test/api/omp_target_memcpy_async2.c
new file mode 100644
index 0000000000000..d63f610c22ab7
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_async2.c
@@ -0,0 +1,73 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+ int d = omp_get_default_device();
+ int id = omp_get_initial_device();
+ int a[128], b[64], c[32], e[16], q[128], i;
+ void *p;
+
+ if (d < 0 || d >= omp_get_num_devices())
+ d = id;
+
+ p = omp_target_alloc(130 * sizeof(int), d);
+ if (p == NULL)
+ return 0;
+
+ for (i = 0; i < 128; ++i)
+ a[i] = i + 1;
+ for (i = 0; i < 64; ++i)
+ b[i] = i + 2;
+ for (i = 0; i < 32; i++)
+ c[i] = 0;
+ for (i = 0; i < 16; i++)
+ e[i] = i + 4;
+
+ omp_depend_t obj[2];
+
+#pragma omp parallel num_threads(5)
+#pragma omp single
+ {
+#pragma omp task depend(out : p)
+ omp_target_memcpy(p, a, 128 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(inout : p)
+ omp_target_memcpy(p, b, 64 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(out : c)
+ for (i = 0; i < 32; i++)
+ c[i] = i + 3;
+
+#pragma omp depobj(obj[0]) depend(inout : p)
+#pragma omp depobj(obj[1]) depend(in : c)
+ omp_target_memcpy_async(p, c, 32 * sizeof(int), 0, 0, d, id, 2, obj);
+
+#pragma omp task depend(in : p)
+ omp_target_memcpy(p, e, 16 * sizeof(int), 0, 0, d, id);
+ }
+
+#pragma omp taskwait
+
+ for (i = 0; i < 128; ++i)
+ q[i] = 0;
+ omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d);
+ for (i = 0; i < 16; ++i)
+ if (q[i] != i + 4)
+ abort();
+ for (i = 16; i < 32; ++i)
+ if (q[i] != i + 3)
+ abort();
+ for (i = 32; i < 64; ++i)
+ if (q[i] != i + 2)
+ abort();
+ for (i = 64; i < 128; ++i)
+ if (q[i] != i + 1)
+ abort();
+
+ omp_target_free(p, d);
+
+ return 0;
+}
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
new file mode 100644
index 0000000000000..4d07898fbdcd2
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
@@ -0,0 +1,66 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int main() {
+ int d = omp_get_default_device();
+ int id = omp_get_initial_device();
+ int q[128], q2[128], i;
+ void *p;
+
+ if (d < 0 || d >= omp_get_num_devices())
+ d = id;
+
+ p = omp_target_alloc(130 * sizeof(int), d);
+ if (p == NULL)
+ return 0;
+
+ if (omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+ NULL, d, id, 0, NULL) < 3 ||
+ omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+ NULL, id, d, 0, NULL) < 3 ||
+ omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+ NULL, id, id, 0, NULL) < 3)
+ abort();
+
+ for (i = 0; i < 128; i++)
+ q[i] = 0;
+ if (omp_target_memcpy(p, q, 128 * sizeof(int), 0, 0, d, id) != 0)
+ abort();
+
+ for (i = 0; i < 128; i++)
+ q[i] = i + 1;
+
+ size_t volume[NUM_DIMS] = {1, 2, 3};
+ size_t dst_offsets[NUM_DIMS] = {0, 0, 0};
+ size_t src_offsets[NUM_DIMS] = {0, 0, 0};
+ size_t dst_dimensions[NUM_DIMS] = {3, 4, 5};
+ size_t src_dimensions[NUM_DIMS] = {2, 3, 4};
+
+ if (omp_target_memcpy_rect_async(p, q, sizeof(int), NUM_DIMS, volume,
+ dst_offsets, src_offsets, dst_dimensions,
+ src_dimensions, d, id, 0, NULL) != 0)
+ abort();
+
+#pragma omp taskwait
+
+ for (i = 0; i < 128; i++)
+ q2[i] = 0;
+ if (omp_target_memcpy(q2, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+ abort();
+
+ /* q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0 */
+ if (q2[0] != 1 || q2[1] != 2 || q2[2] != 3 || q2[3] != 0 || q2[4] != 0 ||
+ q2[5] != 5 || q2[6] != 6 || q2[7] != 7)
+ abort();
+ for (i = 8; i < 128; ++i)
+ if (q2[i] != 0)
+ abort();
+
+ omp_target_free(p, d);
+ return 0;
+}
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
new file mode 100644
index 0000000000000..1d17d884be266
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
@@ -0,0 +1,89 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include <omp.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int main() {
+ int d = omp_get_default_device();
+ int id = omp_get_initial_device();
+ int a[128], b[64], c[128], e[16], q[128], i;
+ void *p;
+
+ if (d < 0 || d >= omp_get_num_devices())
+ d = id;
+
+ p = omp_target_alloc(130 * sizeof(int), d);
+ if (p == NULL)
+ return 0;
+
+ for (i = 0; i < 128; i++)
+ q[i] = 0;
+ if (omp_target_memcpy(p, q, 128 * sizeof(int), 0, 0, d, id) != 0)
+ abort();
+
+ size_t volume[NUM_DIMS] = {2, 2, 3};
+ size_t dst_offsets[NUM_DIMS] = {0, 0, 0};
+ size_t src_offsets[NUM_DIMS] = {0, 0, 0};
+ size_t dst_dimensions[NUM_DIMS] = {3, 4, 5};
+ size_t src_dimensions[NUM_DIMS] = {2, 3, 4};
+
+ for (i = 0; i < 128; i++)
+ a[i] = 42;
+ for (i = 0; i < 64; i++)
+ b[i] = 24;
+ for (i = 0; i < 128; i++)
+ c[i] = 0;
+ for (i = 0; i < 16; i++)
+ e[i] = 77;
+
+ omp_depend_t obj[2];
+
+#pragma omp parallel num_threads(5)
+#pragma omp single
+ {
+#pragma omp task depend(out : p)
+ omp_target_memcpy(p, a, 128 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(inout : p)
+ omp_target_memcpy(p, b, 64 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(out : c)
+ for (i = 0; i < 128; i++)
+ c[i] = i + 1;
+
+#pragma omp depobj(obj[0]) depend(inout : p)
+#pragma omp depobj(obj[1]) depend(in : c)
+
+ /* This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+ 13 14 15 - - 17 18 19 - - at positions 20..29. */
+ omp_target_memcpy_rect_async(p, c, sizeof(int), NUM_DIMS, volume,
+ dst_offsets, src_offsets, dst_dimensions,
+ src_dimensions, d, id, 2, obj);
+
+#pragma omp task depend(in : p)
+ omp_target_memcpy(p, e, 16 * sizeof(int), 0, 0, d, id);
+ }
+
+#pragma omp taskwait
+
+ if (omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+ abort();
+
+ for (i = 0; i < 16; ++i)
+ if (q[i] != 77)
+ abort();
+ if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18 ||
+ q[27] != 19)
+ abort();
+ for (i = 28; i < 64; ++i)
+ if (q[i] != 24)
+ abort();
+ for (i = 64; i < 128; ++i)
+ if (q[i] != 42)
+ abort();
+
+ omp_target_free(p, d);
+ return 0;
+}
More information about the Openmp-commits
mailing list