[Openmp-commits] [openmp] [OpenMP] [amdgpu] Added a synchronous version of data exchange. (PR #87032)

via Openmp-commits openmp-commits at lists.llvm.org
Fri Mar 29 13:16:54 PDT 2024


https://github.com/dhruvachak updated https://github.com/llvm/llvm-project/pull/87032

>From 962bb4cf0d572b12cc653104623af7983326076e Mon Sep 17 00:00:00 2001
From: Dhruva Chakrabarti <Dhruva.Chakrabarti at amd.com>
Date: Thu, 28 Mar 2024 21:23:38 -0400
Subject: [PATCH 1/2] [OpenMP] [amdgpu] Added a synchronous version of data
 exchange.

Similar to H2D and D2H, use synchronous mode for large data transfers
beyond a certain size for D2D as well. As with H2D and D2H, this size is
controlled by an env-var.
---
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 21 ++++++
 .../test/offloading/d2d_memcpy_sync.c         | 67 +++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 openmp/libomptarget/test/offloading/d2d_memcpy_sync.c

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 2dd08dd5d0b4ea..a0fdde951b74a7 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2402,6 +2402,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
 
+    // For large transfers use synchronous behavior.
+    if (Size >= OMPX_MaxAsyncCopyBytes) {
+      if (AsyncInfoWrapper.hasQueue())
+        if (auto Err = synchronize(AsyncInfoWrapper))
+          return Err;
+
+      AMDGPUSignalTy Signal;
+      if (auto Err = Signal.init())
+        return Err;
+
+      if (auto Err = utils::asyncMemCopy(
+              useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
+              getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
+        return Err;
+
+      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
+        return Err;
+
+      return Signal.deinit();
+    }
+
     AMDGPUStreamTy *Stream = nullptr;
     if (auto Err = getStream(AsyncInfoWrapper, Stream))
       return Err;
diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
new file mode 100644
index 00000000000000..a768cd1209ac52
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
@@ -0,0 +1,67 @@
+// RUN: %libomptarget-compile-generic && \
+// RUN: env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0 %libomptarget-run-generic | \
+// RUN: %fcheck-generic -allow-empty
+// REQUIRES: amdgcn-amd-amdhsa
+
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+const int magic_num = 7;
+
+int main(int argc, char *argv[]) {
+  const int N = 128;
+  const int num_devices = omp_get_num_devices();
+
+  // No target device, just return
+  if (num_devices == 0) {
+    printf("PASS\n");
+    return 0;
+  }
+
+  const int src_device = 0;
+  int dst_device = num_devices - 1;
+
+  int length = N * sizeof(int);
+  int *src_ptr = omp_target_alloc(length, src_device);
+  int *dst_ptr = omp_target_alloc(length, dst_device);
+
+  assert(src_ptr && "src_ptr is NULL");
+  assert(dst_ptr && "dst_ptr is NULL");
+
+#pragma omp target teams distribute parallel for device(src_device)            \
+    is_device_ptr(src_ptr)
+  for (int i = 0; i < N; ++i) {
+    src_ptr[i] = magic_num;
+  }
+
+  int rc =
+      omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
+
+  assert(rc == 0 && "error in omp_target_memcpy");
+
+  int *buffer = malloc(length);
+
+  assert(buffer && "failed to allocate host buffer");
+
+#pragma omp target teams distribute parallel for device(dst_device)            \
+    map(from : buffer[0 : N]) is_device_ptr(dst_ptr)
+  for (int i = 0; i < N; ++i) {
+    buffer[i] = dst_ptr[i] + magic_num;
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert(buffer[i] == 2 * magic_num);
+
+  printf("PASS\n");
+
+  // Free host and device memory
+  free(buffer);
+  omp_target_free(src_ptr, src_device);
+  omp_target_free(dst_ptr, dst_device);
+
+  return 0;
+}
+
+// CHECK: PASS

>From 9ce9adca26b078556e89e05d5018d92da105f387 Mon Sep 17 00:00:00 2001
From: Dhruva Chakrabarti <Dhruva.Chakrabarti at amd.com>
Date: Fri, 29 Mar 2024 14:42:28 -0400
Subject: [PATCH 2/2] [OpenMP] [amdgpu] Added a synchronous version of data
 exchange.

Changed test to not assert on allocation failure. Instead it checks for
that condition and returns a failure status.
---
 .../test/offloading/d2d_memcpy_sync.c         | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
index a768cd1209ac52..6b9b765a74d823 100644
--- a/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
+++ b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
@@ -27,8 +27,10 @@ int main(int argc, char *argv[]) {
   int *src_ptr = omp_target_alloc(length, src_device);
   int *dst_ptr = omp_target_alloc(length, dst_device);
 
-  assert(src_ptr && "src_ptr is NULL");
-  assert(dst_ptr && "dst_ptr is NULL");
+  if (!src_ptr || !dst_ptr) {
+    printf("FAIL\n");
+    return 1;
+  }
 
 #pragma omp target teams distribute parallel for device(src_device)            \
     is_device_ptr(src_ptr)
@@ -36,14 +38,17 @@ int main(int argc, char *argv[]) {
     src_ptr[i] = magic_num;
   }
 
-  int rc =
-      omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
-
-  assert(rc == 0 && "error in omp_target_memcpy");
+  if (omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device,
+                        src_device)) {
+    printf("FAIL\n");
+    return 1;
+  }
 
   int *buffer = malloc(length);
-
-  assert(buffer && "failed to allocate host buffer");
+  if (!buffer) {
+    printf("FAIL\n");
+    return 1;
+  }
 
 #pragma omp target teams distribute parallel for device(dst_device)            \
     map(from : buffer[0 : N]) is_device_ptr(dst_ptr)



More information about the Openmp-commits mailing list