[Openmp-commits] [openmp] [OpenMP] [amdgpu] Added a synchronous version of data exchange. (PR #87032)

Thu Mar 28 20:19:44 PDT 2024

https://github.com/dhruvachak updated https://github.com/llvm/llvm-project/pull/87032

>From 962bb4cf0d572b12cc653104623af7983326076e Mon Sep 17 00:00:00 2001
From: Dhruva Chakrabarti <Dhruva.Chakrabarti at amd.com>
Date: Thu, 28 Mar 2024 21:23:38 -0400
Subject: [PATCH] [OpenMP] [amdgpu] Added a synchronous version of data
 exchange.

Similar to H2D and D2H, use synchronous mode for large data transfers
beyond a certain size for D2D as well. As with H2D and D2H, this size is
controlled by an env-var.
---
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 21 ++++++
 .../test/offloading/d2d_memcpy_sync.c         | 67 +++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 openmp/libomptarget/test/offloading/d2d_memcpy_sync.c

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 2dd08dd5d0b4ea..a0fdde951b74a7 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2402,6 +2402,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
 
+    // For large transfers use synchronous behavior.
+    if (Size >= OMPX_MaxAsyncCopyBytes) {
+      if (AsyncInfoWrapper.hasQueue())
+        if (auto Err = synchronize(AsyncInfoWrapper))
+          return Err;
+
+      AMDGPUSignalTy Signal;
+      if (auto Err = Signal.init())
+        return Err;
+
+      if (auto Err = utils::asyncMemCopy(
+              useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
+              getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
+        return Err;
+
+      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
+        return Err;
+
+      return Signal.deinit();
+    }
+
     AMDGPUStreamTy *Stream = nullptr;
     if (auto Err = getStream(AsyncInfoWrapper, Stream))
       return Err;
diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
new file mode 100644
index 00000000000000..a768cd1209ac52
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
@@ -0,0 +1,67 @@
+// RUN: %libomptarget-compile-generic && \
+// RUN: env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0 %libomptarget-run-generic | \
+// RUN: %fcheck-generic -allow-empty
+// REQUIRES: amdgcn-amd-amdhsa
+
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+const int magic_num = 7;
+
+int main(int argc, char *argv[]) {
+  const int N = 128;
+  const int num_devices = omp_get_num_devices();
+
+  // No target device, just return
+  if (num_devices == 0) {
+    printf("PASS\n");
+    return 0;
+  }
+
+  const int src_device = 0;
+  int dst_device = num_devices - 1;
+
+  int length = N * sizeof(int);
+  int *src_ptr = omp_target_alloc(length, src_device);
+  int *dst_ptr = omp_target_alloc(length, dst_device);
+
+  assert(src_ptr && "src_ptr is NULL");
+  assert(dst_ptr && "dst_ptr is NULL");
+
+#pragma omp target teams distribute parallel for device(src_device)            \
+    is_device_ptr(src_ptr)
+  for (int i = 0; i < N; ++i) {
+    src_ptr[i] = magic_num;
+  }
+
+  int rc =
+      omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
+
+  assert(rc == 0 && "error in omp_target_memcpy");
+
+  int *buffer = malloc(length);
+
+  assert(buffer && "failed to allocate host buffer");
+
+#pragma omp target teams distribute parallel for device(dst_device)            \
+    map(from : buffer[0 : N]) is_device_ptr(dst_ptr)
+  for (int i = 0; i < N; ++i) {
+    buffer[i] = dst_ptr[i] + magic_num;
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert(buffer[i] == 2 * magic_num);
+
+  printf("PASS\n");
+
+  // Free host and device memory
+  free(buffer);
+  omp_target_free(src_ptr, src_device);
+  omp_target_free(dst_ptr, dst_device);
+
+  return 0;
+}
+
+// CHECK: PASS